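# encoding:utf-8
# Simple web crawler.
#
# requests  - downloads page source (third-party library, install with pip)
# re        - regular expressions
# threading - multithreading
# os, sys   - creating directories
# time      - getting the current date
#
# NOTE: threading, os, sys and time are described above but are not imported
# by the import statements that follow.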
from asyncio.windows_events import NULL
|
||
import requests
|
||
import re
|
||
import jieba
|
||
import html
|
||
import json
|
||
|
||
#from bs4 import BeautifulSoup as bs
|
||
#from lxml import etree
|
||
|
||
def det_image(url):
    """Return True if *url* looks like a link to an image.

    The URL is lower-cased and then searched for any of the known image
    extensions (``.jpg``, ``.jpeg``, ``.png``, ``.gif``).

    NOTE(review): this is a substring test over the whole URL, not a check of
    the path suffix, so the extension may appear anywhere (host, path, query).
    """
    image_extensions = [".jpg", ".jpeg", ".png", ".gif"]
    # Lower-case once so the comparison is case-insensitive.
    lowered = url.lower()
    return any(extension in lowered for extension in image_extensions)


def det_style(url):
    """Return True if *url* looks like a link to a script or stylesheet.

    The URL is lower-cased and then searched for ``.js`` or ``.css``.

    NOTE(review): this is a substring test over the whole URL, so for example
    a ``.json`` URL also counts as a style/script resource.
    """
    style_extensions = [".js", ".css"]
    # Lower-case once so the comparison is case-insensitive.
    lowered = url.lower()
    return any(extension in lowered for extension in style_extensions)


def det_repeat(arr, i):
    """Return the index of the first element of *arr* equal to *i*.

    Returns -1 when no element compares equal (including when *arr* is
    empty).
    """
    for position, item in enumerate(arr):
        if item == i:
            return position
    return -1


# Elements whose whole content is removed before text extraction.
_SCRIPT_RE = re.compile(r'<script[\s\S]*?</script>')
_STYLE_RE = re.compile(r'<style[\s\S]*?</style>')
_TEXTAREA_RE = re.compile(r'<textarea[\s\S]*?</textarea>')
# Absolute http(s) URLs that appear inside double quotes.
_URL_RE = re.compile(r'"(http[s]?://[\S]*)"')
_TITLE_RE = re.compile(r'<title[\s\S]*?>([\s\S]*?)</title>')
_META_TAG_RE = re.compile(r'<meta[\S ]*?>')
_META_NAME_RE = re.compile(r'name="([\S ]*?)"')
_META_CONTENT_RE = re.compile(r'content="([\S\s]*?)"')
# Text found between a '>' and the next '<'.
_TAG_TEXT_RE = re.compile(r'>([\s\S]*?)<')

# Characters deleted from the snapshot and from every text fragment.
_WHITESPACE_STRIP = str.maketrans("", "", "\n\r\t ")

# Punctuation deleted from each word before it is indexed.
_PUNCTUATION = ["\"","\\","^","’","=","/","、","“","”","#","©","|","_","-"," ","*",";","&","$","%","!","?",",",".","(",")","[","]","{","}","<",">","¥","%","!","?",",","。","(",")","【","】","《","》",":"]
_PUNCTUATION_STRIP = str.maketrans("", "", "".join(_PUNCTUATION))


def _number_unique(items):
    """Map "0", "1", ... to the distinct *items*, in first-seen order."""
    return {str(n): item for n, item in enumerate(dict.fromkeys(items))}


def _index_fragments(fragments):
    """Build the sentence, word and word-to-sentence link tables.

    Returns ``(sents, words, links)``; each is a dict keyed by a decimal
    string.  Link values have the form ``"<word index>:<sentence index>"``.
    """
    sents = {}
    words = {}
    links = {}
    seen_sentences = set()
    word_numbers = {}
    for sent_no, fragment in enumerate(fragments):
        # Words are cut from the full fragment, before truncation.
        tokens = jieba.cut_for_search(fragment)
        # The database limits each sentence to 256 characters.
        sentence = fragment[0:256]
        # NOTE(review): a duplicate sentence is not stored but still consumes
        # an index, so keys of `sents` can be sparse and links produced for a
        # duplicate reference an index with no stored sentence.  Confirm this
        # is what the receiving endpoint expects.
        if sentence not in seen_sentences:
            seen_sentences.add(sentence)
            sents[str(sent_no)] = sentence

        for token in tokens:
            # The database limits each word to 8 characters; punctuation of
            # little meaning is dropped after truncation.
            word = token[0:8].translate(_PUNCTUATION_STRIP)
            if word == '':
                continue
            word_no = word_numbers.get(word)
            if word_no is None:
                word_no = len(word_numbers)
                word_numbers[word] = word_no
                words[str(word_no)] = word
            links[str(len(links))] = str(word_no) + ":" + str(sent_no)
    return sents, words, links


def _parse_html(page, url):
    """Extract the indexable data from the decoded HTML text *page*.

    Returns ``(fields, counts)`` where *fields* holds the 'title', 'mate',
    'sents', 'words', 'links', 'images', 'new_url' and 'htmlx' entries and
    *counts* is ``(fragment count, word count, link count)``.
    """
    # Drop scripts first; the snapshot is taken right after that, while
    # styles, textareas and URLs are still present.
    page = _SCRIPT_RE.sub("", page)
    snapshot = html.escape(page).translate(_WHITESPACE_STRIP)

    page = _STYLE_RE.sub("", page)
    # Some sites (e.g. Baidu) carry hidden textareas holding styles and other
    # odd content, so textareas are removed as well.
    page = _TEXTAREA_RE.sub("", page)

    # Collect the quoted URLs and classify them.
    style_urls = []
    image_urls = []
    page_urls = []
    for link in _URL_RE.findall(page):
        if det_style(link):
            style_urls.append(link)
        elif det_image(link):
            image_urls.append(link)
        else:
            page_urls.append(link)

    # Remove the URLs from the text so they are not indexed as words.
    for link in style_urls + image_urls + page_urls:
        page = page.replace(link, "")

    titles = _TITLE_RE.findall(page)

    # Keep only <meta> tags with exactly one non-empty name and one content.
    meta = {}
    meta_values = []
    for tag in _META_TAG_RE.findall(page):
        names = _META_NAME_RE.findall(tag)
        values = _META_CONTENT_RE.findall(tag)
        if len(names) == 1 and len(values) == 1 and names[0] != '':
            meta[names[0]] = values[0]
            meta_values.append(values[0])

    # Text between tags plus the meta contents form the fragments to index.
    # Only whitespace is removed here; other punctuation is kept in the
    # stored sentences.
    fragments = []
    for raw in _TAG_TEXT_RE.findall(page) + meta_values:
        cleaned = raw.translate(_WHITESPACE_STRIP).strip()
        if cleaned != '':
            fragments.append(cleaned)

    sents, words, links = _index_fragments(fragments)

    fields = {
        'title': titles[0] if titles else url,
        'mate': json.dumps(meta),
        'sents': json.dumps(sents),
        'words': json.dumps(words),
        'links': json.dumps(links),
        'images': json.dumps(_number_unique(image_urls)),
        'new_url': json.dumps(_number_unique(page_urls)),
        'htmlx': snapshot,
    }
    return fields, (len(fragments), len(words), len(links))


def _print_page_summary(result):
    """Print the url, status code, content type and title of *result*."""
    print("url:")
    print(result['url'])
    print("status code:")
    print(result['status'])
    print("content type:")
    print(result['contenttype'])
    print("Title:")
    print(result['title'])


def get_page_words(url, timeout=30):
    """Download *url* and extract its title, meta data, text and links.

    Args:
        url: address of the page to fetch.
        timeout: seconds passed to ``requests.get`` so that an unresponsive
            server cannot block the crawler forever.

    Returns:
        A dict that always contains 'url', 'status' and 'contenttype'.
        For any response 'title' is added; for ``text/html`` responses the
        JSON strings 'mate', 'sents', 'words', 'links', 'images', 'new_url'
        and the escaped, whitespace-stripped snapshot 'htmlx' are added too.
        If anything fails, 'status' is 0 and 'contenttype' is
        "Cant connect"; the function does not raise for ordinary errors.
    """
    result = {'url': url}
    request_headers = {
        'user-agent': 'my-app/0.0.1',
        'Content-Type': 'text/html; charset=utf-8'
    }

    response = None
    try:
        # NOTE(review): verify=False disables TLS certificate verification.
        response = requests.get(url, headers=request_headers, verify=False,
                                timeout=timeout)
        result['status'] = response.status_code

        # A missing header is treated as "not HTML" instead of an error.
        content_type = response.headers.get('Content-Type', '')
        content_type_lc = content_type.lower()
        if 'text/html' in content_type_lc:
            result['contenttype'] = 'text/html'
        else:
            result['contenttype'] = content_type

        # Only gb2312 is recognised; everything else is read as utf-8.
        charset = 'gb2312' if 'gb2312' in content_type_lc else 'utf-8'

        if result['contenttype'] == 'text/html':
            # Undecodable bytes are replaced rather than failing the page.
            page = response.content.decode(charset, errors='replace')
            fields, counts = _parse_html(page, url)
            result.update(fields)

            print("分离数据成功")
            _print_page_summary(result)
            print("Sents const:")
            print(counts[0])
            print("Words const:")
            print(counts[1])
            print("Links const:")
            print(counts[2])
        else:
            result['title'] = url
            print("分离数据成功 no html")
            _print_page_summary(result)
    except Exception:
        # Best effort: any failure (network or parsing) is reported as a
        # failed connection so callers always receive a dict.
        print("连接失败")
        result['status'] = 0
        result['contenttype'] = "Cant connect"
    finally:
        if response is not None:
            response.close()

    return result


def getandpost(bgurl):
    """Crawl *bgurl* and POST the extracted data to the collection endpoint.

    The dict returned by ``get_page_words`` is sent as form data together
    with a 'password' field.  The server reply is printed on HTTP 200;
    otherwise the status code is printed.  Returns None.  Exceptions raised
    by ``requests.post`` are not caught here.
    """
    page_data = get_page_words(bgurl)
    # NOTE(review): the shared secret is hard-coded in the source.
    page_data['password'] = "(*&RV^*(&VRH*(V)))"
    print("本地爬取完成,开始发送\n")
    post_url = "https://lmve.net/php/reptile.php"
    r1 = requests.post(post_url, data=page_data)
    try:
        if r1.status_code == 200:
            html_text = str(r1.content, 'utf-8')
            print("发送成功:200\n")
            print(html_text)
        else:
            # Report the status of this response (r1).
            print("error" + str(r1.status_code))
    finally:
        # Release the connection on both the success and the error path.
        r1.close()


def getoneurl():
    """Ask the server for one URL to crawl.

    POSTs the 'password' field to the getoneurl endpoint.  Returns the
    response body decoded as UTF-8 on HTTP 200, or the string "-1" when the
    status is not 200 or the request/decoding fails.
    """
    # NOTE(review): the shared secret is hard-coded in the source.
    page_data = {'password': "(*&RV^*(&VRH*(V)))"}
    post_url = "https://lmve.net/php/getoneurl.php"
    try:
        r = requests.post(post_url, data=page_data)
        try:
            succeeded = r.status_code == 200
            html_text = str(r.content, 'utf-8') if succeeded else None
        finally:
            # Close the response whatever the status was.
            r.close()
    except Exception:
        # Best effort: callers receive "-1" instead of an exception.
        print("获取url失败\n")
        return "-1"

    if not succeeded:
        print("获取url失败\n")
        return "-1"

    print("获取url成功:")
    print(html_text)
    print("\n")
    return html_text


# Crawl one page when the module is executed and show the extracted words.
theurl = "https://lmve.net"
redata = get_page_words(theurl)
# 'words' is only present when an HTML page was fetched and parsed, so guard
# the lookup instead of failing with KeyError after a failed request.
if 'words' in redata:
    print(redata['words'])