+374
@@ -0,0 +1,374 @@
|
||||
# encoding:utf-8
|
||||
#简易爬虫
|
||||
|
||||
# requests 用于下载网页源码 第三方库 通过pip安装
|
||||
# re 正则表达式要用到
|
||||
# threading 启用多线程
|
||||
# os,sys 用于创建目录
|
||||
# time 用于获得当天时间
|
||||
from asyncio.windows_events import NULL
|
||||
import requests
|
||||
import re
|
||||
import jieba
|
||||
import html
|
||||
import json
|
||||
from myfunsion import myfunsion
|
||||
#from bs4 import BeautifulSoup as bs
|
||||
#from lxml import etree
|
||||
|
||||
def det_image(url):
    """Return True when *url* points at an image file (.jpg/.jpeg/.png/.gif).

    The check is case-insensitive. The scheme and host are ignored, and the
    extension must not be followed by another letter or digit, so hosts such
    as ``www.pngtree.com`` or paths such as ``notes.gifts`` are not mistaken
    for images. Suffixes like ``a.jpg?x=1`` or ``a.jpg@100w`` still count.

    Args:
        url: The URL to classify (str).

    Returns:
        bool: True if an image extension is found after the host part.
    """
    lowered = url.lower()
    # Drop "scheme://host" (or a protocol-relative "//host") before matching.
    after_host = re.sub(r'^(?:[a-z][a-z0-9+.-]*:)?//[^/?#]*', '', lowered)
    found = re.search(r'\.(?:jpg|jpeg|png|gif)(?![a-z0-9])', after_host)
    return found is not None
|
||||
|
||||
def det_style(url):
    """Return True when *url* points at a script or stylesheet (.js/.css).

    The check is case-insensitive. The scheme and host are ignored, and the
    extension must not be followed by another letter or digit, so
    ``data.json``, ``view.jsp`` or a host such as ``static.js.org`` are not
    mistaken for static assets. ``app.js?v=3`` still counts.

    Args:
        url: The URL to classify (str).

    Returns:
        bool: True if a .js or .css extension is found after the host part.
    """
    lowered = url.lower()
    # Drop "scheme://host" (or a protocol-relative "//host") before matching.
    after_host = re.sub(r'^(?:[a-z][a-z0-9+.-]*:)?//[^/?#]*', '', lowered)
    found = re.search(r'\.(?:js|css)(?![a-z0-9])', after_host)
    return found is not None
|
||||
|
||||
def det_repeat(arr, i):
    """Return the position of the first element of *arr* equal to *i*.

    Args:
        arr: Iterable of already collected items.
        i: The item to look for (compared with ``==``).

    Returns:
        int: Zero-based index of the first match, or -1 when there is none.
    """
    matches = (position for position, item in enumerate(arr) if item == i)
    return next(matches, -1)
|
||||
|
||||
# Characters removed from every token before it is stored as a search word.
_WORD_NOISE_CHARS = ["\"","\\","^","’","=","/","、","“","”","#","©","|","_","-"," ","*",";","&","$","%","!","?",",",".","(",")","[","]","{","}","<",">","¥","%","!","?",",","。","(",")","【","】","《","》",":"]


def _decode_body(raw, content_type_header):
    """Decode response bytes with the charset named in the Content-Type header.

    Falls back to UTF-8 when the header names no charset or names one Python
    does not know. Undecodable bytes are replaced rather than raising, so one
    bad byte no longer discards the whole page.
    """
    announced = re.search(r'charset="?([\w.:-]+)', content_type_header, re.IGNORECASE)
    encoding = announced.group(1) if announced else 'utf-8'
    try:
        return raw.decode(encoding, errors='replace')
    except LookupError:
        return raw.decode('utf-8', errors='replace')


def _take_out_urls(markup):
    """Collect the quoted absolute URLs in *markup* and delete them from it.

    Returns a tuple ``(markup_without_urls, image_urls, page_urls)``. Script
    and stylesheet URLs are removed but not reported. Both returned lists are
    de-duplicated and keep first-seen order.
    """
    style_urls, image_urls, page_urls = [], [], []
    # [^\s"] stops at the closing quote; the previous greedy [\S]* could run
    # past it into the following attributes on minified pages.
    for found in re.findall(r'"(https?://[^\s"]*)"', markup):
        if det_style(found):
            style_urls.append(found)
        elif det_image(found):
            image_urls.append(found)
        else:
            page_urls.append(found)

    # dict.fromkeys drops duplicates while preserving insertion order.
    image_urls = list(dict.fromkeys(image_urls))
    page_urls = list(dict.fromkeys(page_urls))

    # Same removal order as before: assets first, then images, then pages.
    for found in style_urls + image_urls + page_urls:
        markup = markup.replace(found, "")
    return markup, image_urls, page_urls


def _collect_meta(markup):
    """Extract ``<meta name="..." content="...">`` pairs from *markup*.

    Only tags with exactly one ``name`` and exactly one ``content`` attribute
    and a non-empty name are kept. Returns ``(name_to_content, contents)``.
    """
    by_name = {}
    contents = []
    for tag in re.findall(r'<meta[\S ]*?>', markup):
        names = re.findall(r'name="([\S ]*?)"', tag)
        if len(names) != 1:
            continue
        values = re.findall(r'content="([\S\s]*?)"', tag)
        if len(values) != 1 or names[0] == '':
            continue
        by_name[names[0]] = values[0]
        contents.append(values[0])
    return by_name, contents


def _index_text(fragments):
    """Build the sentence table, the word table and the word-to-sentence links.

    Every table is a dict keyed by the decimal string of a running index.
    A link value has the form ``"<word index>:<sentence index>"``.
    Sentences are cut to 256 characters and words to 8 characters; the
    original comments attribute both limits to the database.

    NOTE(review): the source this was rebuilt from had lost its indentation.
    Words are indexed only for sentences not seen before, which keeps every
    link pointing at an existing sentence entry - confirm this nesting.
    """
    drop_noise = str.maketrans("", "", "".join(_WORD_NOISE_CHARS))
    sentences = {}
    words = {}
    links = {}
    seen_sentences = set()
    word_ids = {}
    for fragment in fragments:
        sentence = fragment[0:256]
        if sentence in seen_sentences:
            continue
        seen_sentences.add(sentence)
        sentence_id = len(sentences)
        sentences[str(sentence_id)] = sentence

        # Tokens are cut from the full fragment, not the truncated sentence.
        for token in jieba.cut_for_search(fragment):
            word = token[0:8].translate(drop_noise)
            if word == '':
                continue
            word_id = word_ids.get(word)
            if word_id is None:
                word_id = len(word_ids)
                word_ids[word] = word_id
                words[str(word_id)] = word
            links[str(len(links))] = str(word_id) + ":" + str(sentence_id)
    return sentences, words, links


def get_page_words(url, timeout=15):
    """Download *url* and split the page into title, meta, sentences and words.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled server would block the crawler forever.

    Returns:
        dict: Always contains ``url``, ``status`` and ``contenttype``.
        On any failure ``status`` is 0 and ``contenttype`` is
        ``"Cant connect"``. For a response whose Content-Type contains
        ``text/html`` it also contains ``title``, ``mate``, ``sents``,
        ``words``, ``links``, ``images``, ``new_url`` (JSON strings, except
        ``title``) and ``htmlx`` (HTML-escaped snapshot of the page without
        scripts). For any other content type only ``title`` (set to the URL)
        is added.
    """
    result = {'url': url}
    request_headers = {
        'user-agent': 'my-app/0.0.1',
        'Content-Type': 'text/html; charset=utf-8'
    }

    response = None
    try:
        # NOTE(review): verify=False disables TLS certificate checks; kept
        # because the crawler relied on it, but it should be revisited.
        response = requests.get(url, headers=request_headers, verify=False, timeout=timeout)
        result['status'] = response.status_code

        # .get() so that a response without the header is not treated as a
        # connection failure.
        content_type_header = response.headers.get('Content-Type', '')
        is_html = 'text/html' in content_type_header
        result['contenttype'] = 'text/html' if is_html else content_type_header

        if is_html:
            markup = _decode_body(response.content, content_type_header)

            # Drop scripts, then take the snapshot before anything else goes.
            markup = re.sub(r'<script[\s\S]*?</script>', "", markup)
            snapshot = html.escape(markup)
            # NOTE(review): as written this removes every single space from
            # the snapshot - confirm that is the intended compression.
            for filler in ["\n", "\r", "\t", " "]:
                snapshot = snapshot.replace(filler, "")

            markup = re.sub(r'<style[\s\S]*?</style>', "", markup)
            # Hidden <textarea> elements can carry styles and other noise.
            markup = re.sub(r'<textarea[\s\S]*?</textarea>', "", markup)

            markup, image_urls, page_urls = _take_out_urls(markup)

            titles = re.findall(r'<title[\s\S]*?>([\s\S]*?)</title>', markup)
            meta_by_name, meta_values = _collect_meta(markup)

            # Text between tags, plus the meta contents, is the search corpus.
            fragments = re.findall(r'>([\s\S]*?)<', markup)
            fragments.extend(meta_values)
            cleaned = []
            for fragment in fragments:
                for filler in ['\r', '\n', '\t', ' ']:
                    fragment = fragment.replace(filler, "")
                fragment = fragment.strip()
                if fragment != '':
                    cleaned.append(fragment)

            sentences, words, links = _index_text(cleaned)

            result['title'] = titles[0] if titles else url
            result['mate'] = json.dumps(meta_by_name)
            result['sents'] = json.dumps(sentences)
            result['words'] = json.dumps(words)
            result['links'] = json.dumps(links)
            result['images'] = json.dumps(
                {str(position): found for position, found in enumerate(image_urls)})
            result['new_url'] = json.dumps(
                {str(position): found for position, found in enumerate(page_urls)})
            result['htmlx'] = snapshot

            print("分离数据成功")
            print("url:")
            print(result['url'])
            print("status code:")
            print(result['status'])
            print("content type:")
            print(result['contenttype'])
            print("Title:")
            print(result['title'])
            print("Sents const:")
            print(len(sentences))
            print("Words const:")
            print(len(words))
            print("Links const:")
            print(len(links))
        else:
            result['title'] = url
            print("分离数据成功 no html")
            print("url:")
            print(result['url'])
            print("status code:")
            print(result['status'])
            print("content type:")
            print(result['contenttype'])
            print("Title:")
            print(result['title'])
    except Exception as error:
        # Still best-effort, but Ctrl-C / SystemExit are no longer swallowed
        # and the actual cause is shown.
        print("连接失败")
        print(repr(error))
        result['status'] = 0
        result['contenttype'] = "Cant connect"
    finally:
        # Release the connection on the failure path as well.
        if response is not None:
            response.close()

    return result
|
||||
|
||||
|
||||
def getandpost(bgurl, timeout=30):
    """Crawl *bgurl* locally and upload the extracted data to the server.

    The dict returned by ``get_page_words`` is posted as form data, together
    with the shared password, to the reptile endpoint. The server reply is
    printed on HTTP 200; otherwise the status code is printed.

    Args:
        bgurl: Address of the page to crawl.
        timeout: Seconds to wait for the upload endpoint before giving up.

    Raises:
        requests.RequestException: If the upload request itself fails (not
            caught here).
    """
    page_data = get_page_words(bgurl)
    # NOTE(review): the shared secret is hard-coded in source; consider
    # loading it from configuration.
    page_data['password'] = "(*&RV^*(&VRH*(V)))"
    print("本地爬取完成,开始发送\n")

    post_url = "https://lmve.net/php/reptile.php"
    r1 = requests.post(post_url, data=page_data, timeout=timeout)
    try:
        if r1.status_code == 200:
            html_text = str(r1.content, 'utf-8')
            print("发送成功:200\n")
            print(html_text)
        else:
            # Was `r.status_code`: `r` is not defined in this function, so
            # every non-200 reply raised NameError instead of being reported.
            print("error" + str(r1.status_code))
    finally:
        # Close on both paths; previously only the success path closed.
        r1.close()
|
||||
|
||||
def getoneurl(timeout=30):
    """Ask the server for the next URL to crawl.

    Args:
        timeout: Seconds to wait for the server before giving up.

    Returns:
        str: The response body decoded as UTF-8 on HTTP 200, or the sentinel
        string ``"-1"`` when the request fails, the status is not 200, or the
        body cannot be decoded.
    """
    # NOTE(review): the shared secret is hard-coded in source; consider
    # loading it from configuration.
    page_data = {'password': "(*&RV^*(&VRH*(V)))"}
    post_url = "https://lmve.net/php/getoneurl.php"

    try:
        r = requests.post(post_url, data=page_data, timeout=timeout)
        try:
            if r.status_code != 200:
                print("获取url失败\n")
                return "-1"
            html_text = str(r.content, 'utf-8')
        finally:
            # Close on every path; previously a non-200 reply was left open.
            r.close()
    except Exception:
        # `except Exception` (not a bare except) so Ctrl-C can still stop a
        # caller that retries in an endless loop.
        print("获取url失败\n")
        return "-1"

    print("获取url成功:")
    print(html_text)
    print("\n")
    return html_text
|
||||
|
||||
|
||||
|
||||
# Run-mode selector for the script body below:
#   0 - loop forever: fetch a URL from the server, crawl it, upload the result
#   1 - crawl and upload one fixed URL
#   2 - fetch one URL from the server, print it, crawl and upload it
#   3 - request a fixed URL and print its Content-Type header
#   4 - call myfunsion.a()
test=1

if test==0:
    while True:
        theurl=getoneurl()
        if theurl=='-1':
            print("获取url失败,自动重试")
            continue
        getandpost(theurl)

elif test==1:
    theurl="https://lmve.net"
    getandpost(theurl)

elif test==2:
    theurl=getoneurl()
    print(theurl)
    getandpost(theurl)

elif test==3:
    headers = {
        'user-agent': 'my-app/0.0.1',
        'Content-Type': 'text/html; charset=utf-8'
    }
    # Two sample targets; only url2 is requested.
    url1="https://git.lmve.net/kevin/um-all-index-web/-/avatar"
    url2="https://yyyyyyounger.com/"
    try:
        r = requests.get(url2,headers=headers,verify=False)
        htmlx_byt=r.content
        print(r.headers['Content-Type'])
    except:
        print("11")

elif test==4:
    myfunsion.a()
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user