import requests
import re
import html
import json
import jieba
from urllib.parse import urlparse
class myfunsion:
    """Namespace class holding a single helper, ``a``.

    The helper takes no instance state, so it is declared static and can
    be called either as ``myfunsion.a()`` or on an instance.
    """

    @staticmethod
    def a():
        """Print the literal string ``test`` to standard output."""
        print("test")
class det:
    """URL inspection helpers (URL type tests, duplicate lookup, URL parsing).

    Every helper is static: call them as ``det.image(url)`` and so on.
    """

    @staticmethod
    def _path_has_extension(url, extensions):
        """Return True if the path part of *url* ends with one of *extensions*.

        The URL is lower-cased, then the fragment (``#...``) and the query
        (``?...``) are cut off so that only the resource path is inspected.
        Matching the end of the path (instead of searching the whole URL for
        a substring) keeps ``.js`` from matching ``.json`` or a host name
        such as ``www.jstor.org``.

        Args:
            url: URL string to inspect.
            extensions: tuple of lower-case extensions, each with its dot.
        """
        path = url.lower().split("#", 1)[0].split("?", 1)[0]
        return path.endswith(extensions)

    @staticmethod
    def image(url):
        """Return True if *url* points at a .jpg/.jpeg/.png/.gif/.svg file."""
        return det._path_has_extension(
            url, (".jpg", ".jpeg", ".png", ".gif", ".svg")
        )

    @staticmethod
    def style(url):
        """Return True if *url* points at a .js or .css file."""
        return det._path_has_extension(url, (".js", ".css"))

    @staticmethod
    def repeat(arr, i):
        """Look for a duplicate of *i* in *arr*.

        Args:
            arr: sequence to search.
            i: value to look for (compared with ``==``).

        Returns:
            The index of the first element equal to *i*, or -1 if none is.
        """
        for index, item in enumerate(arr):
            if item == i:
                return index
        return -1

    @staticmethod
    def domin(url):
        """Parse *url* and return the result of ``urllib.parse.urlparse``.

        Note that the whole parse result is returned, not just the domain;
        callers read the host from its ``netloc`` attribute.
        """
        return urlparse(url)
class net:
# Request headers sent with every page fetch (passed as ``headers=`` to
# ``requests.get`` in ``get_html``).
headers = {
    "user-agent": "my-app/0.0.1",
    "Content-Type": "text/html; charset=utf-8",
}
def get_html(url):
    """Fetch *url* with a GET request and describe the response as a dict.

    Args:
        url: address to request.

    Returns:
        A dict that always contains:
          ``url``     - the requested URL.
          ``status``  - the HTTP status code, or 0 if anything failed
                        (request error, undecodable body, ...).
        and, when the request itself succeeded:
          ``contenttype``  - ``'text/html'`` if the Content-Type header
                             mentions it, otherwise the raw header value
                             (``''`` when the header is absent).
          ``charset_type`` - only for HTML: ``'gb2312'`` if the header
                             mentions it, otherwise ``'utf-8'``.
          ``html``         - only for HTML: the body decoded with
                             ``charset_type``.
    """
    return_js = {'url': url}
    try:
        # NOTE(review): verify=False turns off TLS certificate checks.
        # timeout is requests' (connect, read) pair, in seconds.
        r = requests.get(url, headers=net.headers, verify=False, timeout=(60, 60))
        return_js['status'] = r.status_code

        # .get() so a response without Content-Type is not reported as a
        # failed request; media type and charset names are compared
        # case-insensitively (servers send e.g. "charset=GB2312").
        raw_content_type = r.headers.get('Content-Type', '')
        header_lower = raw_content_type.lower()

        if 'text/html' in header_lower:
            return_js['contenttype'] = 'text/html'
            return_js['charset_type'] = 'gb2312' if 'gb2312' in header_lower else 'utf-8'
            # Strict decode: a body that does not match the charset raises
            # and is reported as status 0 below.
            return_js['html'] = str(r.content, return_js['charset_type'])
        else:
            return_js['contenttype'] = raw_content_type
    except Exception:
        # Best effort: any failure is reported as status 0 instead of
        # raising. KeyboardInterrupt/SystemExit are allowed to propagate.
        return_js['status'] = 0
    return return_js
def ana_html(webdata):
return_js={}
return_js['status']=webdata['status']
return_js['url']=webdata['url']
if webdata['status']==0:
return_js['score']="0"
return_js['contenttype']="none"
return return_js
return_js['contenttype']=webdata['contenttype']
#return_js['charset_type']=webdata['charset_type']
if(return_js['contenttype']=='text/html'):
thisurlscore=100
htmlx=webdata['html']
ur=urlparse(return_js['url'])
domain=ur.netloc
return_js['domain']=domain
#根url
if(ur.path==''):
thisurlscore+=50
if(ur.path=='/'):
thisurlscore+=50
#是否带参数
if(ur.query!=''):
thisurlscore-=10
#是否带位置
if(ur.fragment!=''):
thisurlscore-=50
#获取js
get_htmlx_js='