Signed-off-by: kevin <kevin@dsm.lmve.net>
This commit is contained in:
kevin
2024-01-31 23:09:11 +08:00
commit ab81ca865c
7 changed files with 943 additions and 0 deletions
+417
View File
@@ -0,0 +1,417 @@
import requests
import re
import html
import json
import jieba
from urllib.parse import urlparse
class myfunsion:
    """Placeholder class holding a single smoke-test helper."""

    @staticmethod
    def a():
        """Print the literal string ``test``.

        Declared as a static method so that it can be called both on the
        class (``myfunsion.a()``) and on an instance (``myfunsion().a()``).
        """
        print("test")
class det:  # URL type checks
    """Small stateless helpers for classifying URLs and searching lists.

    Every helper is a static method, so it can be called on the class
    (``det.image(url)``) or on an instance.
    """

    @staticmethod
    def image(url):
        """Return True when *url* contains a known image extension.

        The check is a case-insensitive substring test, so the extension may
        appear anywhere in the URL (for example before a query string).
        """
        style_type_list = [".jpg", ".jpeg", ".png", ".gif", ".svg"]
        # Lower-case the URL so that ".PNG" and ".png" are treated alike.
        url = url.lower()
        return any(ext in url for ext in style_type_list)

    @staticmethod
    def style(url):
        """Return True when *url* contains ``.js`` or ``.css``.

        NOTE(review): this is a substring test, so ".json" also counts as a
        match because it contains ".js".
        """
        style_type_list = [".js", ".css"]
        url = url.lower()
        return any(ext in url for ext in style_type_list)

    @staticmethod
    def repeat(arr, i):
        """Return the index of the first element of *arr* equal to *i*.

        Args:
            arr: the sequence to search.
            i: the value to look for.

        Returns:
            The zero-based position of the first match, or -1 when *i* does
            not occur in *arr*.
        """
        for position, item in enumerate(arr):
            if item == i:
                return position
        return -1

    @staticmethod
    def domin(url):
        """Parse *url* with ``urlparse`` and return the parse result."""
        return urlparse(url)
class net:
    """Fetch a page over HTTP and turn its HTML into index data.

    Both entry points are called on the class itself
    (``net.get_html(url)`` and ``net.ana_html(webdata)``); no instance
    state is used.
    """

    # Request headers sent with every page fetch.
    headers = {
        'user-agent': 'my-app/0.0.1',
        'Content-Type': 'text/html; charset=utf-8'
    }

    @staticmethod
    def get_html(url):
        """Download *url* and describe the response as a dict.

        Args:
            url: the address to fetch.

        Returns:
            A dict that always contains ``url`` and ``status``. ``status`` is
            the HTTP status code, or 0 when the request or the decoding
            failed. When a response was received the dict also holds
            ``contenttype`` (``'text/html'`` when the Content-Type header
            contains that text, otherwise the raw header value, or ``''``
            when the header is missing). For HTML responses it additionally
            holds ``charset_type`` (``'gb2312'`` or ``'utf-8'``) and ``html``
            (the decoded body).
        """
        return_js = {'url': url}
        try:
            # NOTE(review): verify=False turns off TLS certificate checks.
            r = requests.get(url, headers=net.headers, verify=False, timeout=(60, 60))
            try:
                return_js['status'] = r.status_code
                # A missing header is treated as "not HTML" instead of an error.
                content_header = r.headers.get('Content-Type', '')
                if 'text/html' in content_header:
                    return_js['contenttype'] = 'text/html'
                    # Only gb2312 is recognised; everything else is read as utf-8.
                    if re.search('gb2312', content_header, re.IGNORECASE):
                        return_js['charset_type'] = 'gb2312'
                    else:
                        return_js['charset_type'] = 'utf-8'
                    # Strict decoding: a body that does not match the charset
                    # raises and the page is reported with status 0.
                    return_js['html'] = str(r.content, return_js['charset_type'])
                else:
                    return_js['contenttype'] = content_header
            finally:
                r.close()
        except Exception:
            # Any request or decoding failure is reported as status 0.
            return_js['status'] = 0
        return return_js

    @staticmethod
    def _strip_matches(pattern, text):
        """Remove from *text* every substring matched by *pattern*."""
        for chunk in re.findall(pattern, text):
            text = text.replace(chunk, "")
        return text

    @staticmethod
    def _url_shape_score(parts, root_bonus):
        """Return the score change implied by the shape of a parsed URL.

        A root path (``''`` or ``'/'``) earns *root_bonus*, a query string
        costs 10 and a fragment costs 50.
        """
        delta = 0
        if parts.path in ('', '/'):
            delta += root_bonus
        if parts.query != '':
            delta -= 10
        if parts.fragment != '':
            delta -= 50
        return delta

    @staticmethod
    def ana_html(webdata):
        """Analyse a dict produced by ``get_html`` and build the index record.

        Args:
            webdata: dict with ``status`` and ``url``; for successful
                responses also ``contenttype`` and, for HTML, ``html``.

        Returns:
            A dict that always holds ``status``, ``url`` and ``contenttype``.
            * status 0: ``score`` is the string ``"0"`` and ``contenttype``
              is ``"none"``.
            * non-HTML content: ``title`` is the URL.
            * HTML: ``domain``, ``images``, ``newurls``, ``newurlscore``,
              ``newurldomain``, ``title``, ``mate``, ``sents_int``,
              ``words_int``, ``links_int``, ``sents``, ``words``, ``links``,
              ``htmlx`` (escaped, whitespace-stripped snapshot) and ``score``
              (an int). The collection values are JSON strings keyed by
              running index.
        """
        return_js = {'status': webdata['status'], 'url': webdata['url']}
        if webdata['status'] == 0:
            return_js['score'] = "0"
            return_js['contenttype'] = "none"
            return return_js
        return_js['contenttype'] = webdata['contenttype']
        if return_js['contenttype'] != 'text/html':
            return_js['title'] = return_js['url']
            return return_js

        htmlx = webdata['html']
        ur = urlparse(return_js['url'])
        domain = ur.netloc
        return_js['domain'] = domain
        # Score of the page itself: root URLs gain, query/fragment lose.
        thisurlscore = 100 + net._url_shape_score(ur, 50)

        # Drop scripts first; the snapshot is taken right after that.
        htmlx = net._strip_matches(r'<script[\s\S]*?</script>', htmlx)
        htmlx_kuaizhao = html.escape(htmlx)
        for blank in ["\n", "\r", "\t", " "]:
            htmlx_kuaizhao = htmlx_kuaizhao.replace(blank, "")  # compress the snapshot
        htmlx = net._strip_matches(r'<style[\s\S]*?</style>', htmlx)
        # Text areas are dropped as well: some sites keep hidden ones that
        # contain styles and other unrelated content.
        htmlx = net._strip_matches(r'<textarea[\s\S]*?</textarea>', htmlx)

        # Collect every quoted absolute URL and sort it into one of three groups.
        all_url = re.findall(r'\"(http[s]?://[\S]*)\"', htmlx)
        style_url_list = []
        image_url_list = []
        new_url_list = []
        for found in all_url:
            if det.style(found):
                style_url_list.append(found)
            elif det.image(found):
                image_url_list.append(found)
            else:
                new_url_list.append(found)

        # Remove the URLs from the text, keeping the original removal order
        # (style/script URLs, then images, then the remaining links).
        for found in style_url_list + image_url_list + new_url_list:
            htmlx = htmlx.replace(found, "")

        # De-duplicate while preserving first-seen order.
        image_url = list(dict.fromkeys(image_url_list))
        new_url = list(dict.fromkeys(new_url_list))
        thisurlscore += 2 * len(image_url)  # bonus for every distinct image
        return_js['images'] = json.dumps(
            {str(index): found for index, found in enumerate(image_url)})
        return_js['newurls'] = json.dumps(
            {str(index): found for index, found in enumerate(new_url)})

        # Score every newly discovered link.
        new_urlscore_js = {}
        new_urldomain_js = {}
        for index, link in enumerate(new_url):
            key = str(index)
            try:
                te = urlparse(link)
            except ValueError:
                # Unparseable link: score 0 and no domain, instead of reusing
                # the parse result of a previous link.
                new_urlscore_js[key] = 0
                new_urldomain_js[key] = ''
                continue
            score = 100 + net._url_shape_score(te, 10)
            if te.netloc == domain:
                score -= 10  # link stays on the same site
            else:
                score += 10
                thisurlscore += 5  # links to other sites raise the page score
            new_urlscore_js[key] = score
            new_urldomain_js[key] = te.netloc
        return_js['newurlscore'] = json.dumps(new_urlscore_js)
        return_js['newurldomain'] = json.dumps(new_urldomain_js)

        # Title: first <title> element, falling back to the URL.
        htmlx_title = re.findall(r'<title[\s\S]*?>([\s\S]*?)</title>', htmlx)
        if htmlx_title:
            return_js['title'] = htmlx_title[0]
            print(htmlx_title)
        else:
            return_js['title'] = return_js['url']

        # Meta tags: keep those with exactly one non-empty name and one content.
        htmlx_mate = []
        htmlx_mate_js = {}
        for tag in re.findall(r'<meta[\S ]*?>', htmlx):
            names = re.findall(r'name="([\S ]*?)"', tag)
            if len(names) != 1:
                continue
            values = re.findall(r'content="([\S\s]*?)"', tag)
            if len(values) != 1 or names[0] == '':
                continue
            htmlx_mate_js[names[0]] = values[0]
            htmlx_mate.append(values[0])
        return_js['mate'] = json.dumps(htmlx_mate_js)

        # Text between tags, followed by the meta contents.
        htmlx_tab = re.findall(r'>([\s\S]*?)<', htmlx)
        htmlx_tab.extend(htmlx_mate)
        # Remove line breaks, tabs and spaces; other punctuation is kept for
        # the word segmentation below. Empty fragments are dropped.
        fragments = []
        for fragment in htmlx_tab:
            for blank in ['\r', '\n', '\t', ' ']:
                fragment = fragment.replace(blank, "")
            fragment = fragment.strip()
            if fragment != '':
                fragments.append(fragment)

        # Characters removed from every word.
        # NOTE(review): the empty entries are no-ops for str.replace; they
        # look like characters that were lost at some point - confirm.
        get_htmlx_BDFH = ["\"","\\","^","","=","/","","","","#","©","|","_","-"," ","*",";","&","$","%","!","?",",",".","(",")","[","]","{","}","<",">","","%","","","","","","","","","","",""]

        # Build the sentence table, the word table and the word->sentence links.
        htmlx_sents_js = {}
        htmlx_words_js = {}
        links_js = {}
        seen_sents = set()
        word_index = {}
        sents_int = 0
        words_int = 0
        links_int = 0
        for fragment in fragments:
            # The database limits every sentence to 256 characters.
            sent = fragment[0:256]
            if sent in seen_sents:
                thisurlscore -= 1  # repeated content lowers the page score
                continue
            seen_sents.add(sent)
            htmlx_sents_js[str(sents_int)] = sent
            # Words are cut from the full, untruncated fragment.
            for token in jieba.cut_for_search(fragment):
                # The database limits every word to 8 characters.
                w = token[0:8]
                for n in get_htmlx_BDFH:
                    w = w.replace(n, "")
                if w == '':
                    continue
                words_det_int = word_index.get(w)
                if words_det_int is None:
                    words_det_int = words_int
                    word_index[w] = words_int
                    htmlx_words_js[str(words_int)] = w
                    words_int += 1
                links_js[str(links_int)] = str(words_det_int) + ":" + str(sents_int)
                links_int += 1
            sents_int += 1

        return_js['sents_int'] = sents_int
        return_js['words_int'] = words_int
        return_js['links_int'] = links_int
        return_js['sents'] = json.dumps(htmlx_sents_js)
        return_js['words'] = json.dumps(htmlx_words_js)
        return_js['links'] = json.dumps(links_js)
        return_js['htmlx'] = htmlx_kuaizhao
        return_js['score'] = thisurlscore  # final score of the analysed URL
        return return_js
# Driver section.
# test == 0: loop forever, asking the server for a URL, analysing it and
#            posting the result back.
# test == 1: analyse one fixed page and post the result once.
test=0
getone={}
# NOTE(review): the shared secret is hard-coded in the source file.
getone['password']="(*&RV^*(&VRH*(V)))"
getone_post_url="https://lmve.net/php/getoneurl.php"
post_url="https://lmve.net/php/newurlreptile.php"
if test==0:
    while 1:
        print("**本地消息**\n获取url..")
        try:
            r = requests.post(getone_post_url,data=getone)
            try:
                got_url = r.status_code==200
                html_text = str(r.content,'utf-8') if got_url else ''
            finally:
                # Always release the connection, whatever the status was.
                r.close()
            if not got_url:
                # The failure message is tied to a non-200 answer.
                print("获取url失败")
                continue
            print("获取url成功:")
            print(html_text)
            print("\n")
            if html_text=='':
                # Nothing to crawl this round.
                continue
            print("获取text..")
            text=net.get_html(html_text)
            print("开始分析..")
            postdata=net.ana_html(text)
            print("分析完成")
            postdata['password']=getone['password']
            is_html = postdata['contenttype']=='text/html'
            if is_html:
                print("分离数据成功")
            else:
                print("分离数据成功 no html")
            print("url:")
            print(postdata['url'])
            print("status code:")
            print(postdata['status'])
            print("content type:")
            print(postdata['contenttype'])
            if is_html:
                # These keys are only present for analysed HTML pages.
                print("Title:")
                print(postdata['title'])
                print("Sents const:")
                print(postdata['sents_int'])
                print("Words const:")
                print(postdata['words_int'])
                print("Links const:")
                print(postdata['links_int'])
            print("开始发送..")
            try:
                r1 = requests.post(post_url,data=postdata)
                try:
                    if r1.status_code==200:
                        html_text=str(r1.content,'utf-8')
                        print("发送成功:200")
                        print(html_text)
                    else:
                        print("error"+str(r1.status_code))
                finally:
                    r1.close()
            except Exception:
                print("try aga")
        except Exception:
            # Exception (not a bare except) so that Ctrl+C can still stop
            # the endless loop.
            print("tyr agin")
if test==1:
    postdata=net.ana_html(net.get_html("https://www.foreverblog.cn/blog/1958.html"))
    postdata['password']=getone['password']
    r1 = requests.post(post_url,data=postdata)
    try:
        if r1.status_code==200:
            html_text=str(r1.content,'utf-8')
            print("发送成功:200\n")
            print(html_text)
        else:
            print("error"+str(r1.status_code))
    finally:
        r1.close()