# Files
# py_reptile/reptile.py
# T

# 331 lines
# 11 KiB
# Python
# Raw Blame History

# This file contains ambiguous Unicode characters
# This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# encoding:utf-8
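# Simple web crawler
# requests: downloads the page source (third-party library, install with pip)
# re: regular expressions
# threading: for running multiple threads
# os, sys: for creating directories
# time: for getting the current date
# NOTE(review): threading, os, sys and time are listed above but are not imported in the visible code.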
# NOTE(review): asyncio.windows_events only exists on Windows and NULL is not
# used in the visible code; confirm whether this import is still needed.
import html
import json
import re
from asyncio.windows_events import NULL
from urllib.parse import urlsplit

import jieba
import requests
#from bs4 import BeautifulSoup as bs
#from lxml import etree
def det_image(url):
    """Return True if *url* refers to a .jpg/.jpeg/.png/.gif resource.

    The check is case-insensitive. The extension is searched for only after
    the host part of the URL (path, query and fragment) and must not be
    followed by another ASCII letter or digit, so a host such as
    ``www.gifs.com`` is no longer mistaken for an image link.
    """
    lowered = url.lower()
    try:
        parts = urlsplit(lowered)
    except ValueError:
        # urlsplit rejects some malformed hosts (e.g. an unbalanced "[");
        # fall back to scanning the raw string rather than failing the page.
        target = lowered
    else:
        target = parts.path + "?" + parts.query + "#" + parts.fragment
    return re.search(r"\.(?:jpe?g|png|gif)(?![a-z0-9])", target) is not None
def det_style(url):
    """Return True if *url* refers to a .js or .css resource.

    The check is case-insensitive. The extension is searched for only after
    the host part of the URL (path, query and fragment) and must not be
    followed by another ASCII letter or digit, so ``.json``/``.jsp`` links
    and hosts such as ``cdn.jsdelivr.net`` are not classified as style
    resources.
    """
    lowered = url.lower()
    try:
        parts = urlsplit(lowered)
    except ValueError:
        # urlsplit rejects some malformed hosts (e.g. an unbalanced "[");
        # fall back to scanning the raw string rather than failing the page.
        target = lowered
    else:
        target = parts.path + "?" + parts.query + "#" + parts.fragment
    return re.search(r"\.(?:js|css)(?![a-z0-9])", target) is not None
def det_repeat(arr, i):
    """Return the position of the first element of *arr* equal to *i*.

    Returns -1 when no element compares equal to *i* (including when *arr*
    is empty).
    """
    return next((pos for pos, item in enumerate(arr) if item == i), -1)
# Characters removed from every token before it is indexed as a word.
# NOTE(review): several entries are empty strings in this copy of the file;
# they may have been invisible Unicode characters originally - confirm
# against the repository version.
_NOISE_CHARS = ["\"","\\","^","","=","/","、","“","”","#","©","|","_","-"," ","*",";","&","$","%","!","?",",",".","(",")","[","]","{","}","<",">","¥","%","","","","。","","","【","】","《","》",""]

_META_NAME_RE = re.compile(r'name="([\S ]*?)"')
_META_CONTENT_RE = re.compile(r'content="([\S\s]*?)"')


def _number_unique(items):
    """Map "0", "1", ... to the distinct values of *items* in first-seen order."""
    return {str(pos): value for pos, value in enumerate(dict.fromkeys(items))}


def _collect_meta(page):
    """Return ({name: content}, [content, ...]) for the usable <meta> tags of *page*.

    A tag is used only when it carries exactly one name="..." and exactly one
    content="..." attribute and the name is not empty.
    """
    meta_by_name = {}
    meta_values = []
    for tag in re.findall(r'<meta[\S ]*?>', page):
        names = _META_NAME_RE.findall(tag)
        if len(names) != 1:
            continue
        contents = _META_CONTENT_RE.findall(tag)
        if len(contents) != 1 or names[0] == '':
            continue
        meta_by_name[names[0]] = contents[0]
        meta_values.append(contents[0])
    return meta_by_name, meta_values


def _index_words(fragments):
    """Build the sentence, word and link tables for the text *fragments*.

    Returns three dicts keyed by "0", "1", ...: the distinct sentences (each
    cut to 256 characters), the distinct words (each cut to 8 characters and
    stripped of _NOISE_CHARS), and links of the form "wordIndex:sentenceIndex",
    one per word occurrence in a distinct sentence.
    """
    sentences = {}
    words = {}
    links = {}
    seen_sentences = set()
    word_numbers = {}  # word -> its index in `words`
    for fragment in fragments:
        # The storage side limits a sentence to 256 characters; duplicates
        # are detected on the truncated form.
        sentence = fragment[:256]
        if sentence in seen_sentences:
            continue
        seen_sentences.add(sentence)
        sentence_no = len(sentences)
        sentences[str(sentence_no)] = sentence
        # Tokens come from the full, untruncated fragment.
        for token in jieba.cut_for_search(fragment):
            # The storage side limits a word to 8 characters.
            word = token[:8]
            for junk in _NOISE_CHARS:
                word = word.replace(junk, "")
            if not word:
                continue
            word_no = word_numbers.get(word)
            if word_no is None:
                word_no = len(word_numbers)
                word_numbers[word] = word_no
                words[str(word_no)] = word
            links[str(len(links))] = str(word_no) + ":" + str(sentence_no)
    return sentences, words, links


def get_page_words(url, timeout=30):
    """Download *url* and split the page into title, meta, sentences, words and links.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up; passed to
            requests.get so that a stalled server cannot block forever.

    Returns:
        A dict that always contains 'url', 'status' and 'contenttype'.
        For a text/html response it also contains 'title' and the
        JSON-encoded strings 'mate', 'sents', 'words', 'links', 'images' and
        'new_url', plus 'htmlx' (an HTML-escaped, whitespace-stripped
        snapshot of the page without scripts). For other content types only
        'title' (set to *url*) is added. When anything fails, 'status' is 0
        and 'contenttype' is "Cant connect".
    """
    result = {'url': url}
    headers = {
        'user-agent': 'my-app/0.0.1',
        'Content-Type': 'text/html; charset=utf-8'
    }
    try:
        # NOTE(review): verify=False disables TLS certificate checking; kept
        # because the crawler may rely on it for badly configured sites.
        r = requests.get(url, headers=headers, verify=False, timeout=timeout)
        try:
            result['status'] = r.status_code
            # A missing Content-Type header is treated as "not HTML" instead
            # of raising.
            raw_type = r.headers.get('Content-Type', '')
            lowered_type = raw_type.lower()
            if 'text/html' in lowered_type:
                result['contenttype'] = 'text/html'
            else:
                result['contenttype'] = raw_type
            charset_type = 'gb2312' if 'gb2312' in lowered_type else 'utf-8'

            if result['contenttype'] == 'text/html':
                # Undecodable bytes are replaced so one bad byte does not
                # discard the whole page.
                page = r.content.decode(charset_type, errors='replace')

                # Drop scripts, then take the snapshot before further stripping.
                page = re.sub(r'<script[\s\S]*?</script>', '', page)
                snapshot = html.escape(page)
                for filler in ["\n","\r","\t"," "]:
                    snapshot = snapshot.replace(filler, "")

                # Drop inline styles and textareas (some sites hide markup
                # inside textareas).
                page = re.sub(r'<style[\s\S]*?</style>', '', page)
                page = re.sub(r'<textarea[\s\S]*?</textarea>', '', page)

                # Collect quoted absolute URLs and sort them into categories.
                # The pattern stops at the closing quote so that adjacent
                # attributes are not swallowed into the URL.
                style_urls = []
                image_urls = []
                page_urls = []
                for link in re.findall(r'"(https?://[^\s"]*)"', page):
                    if det_style(link):
                        style_urls.append(link)
                    elif det_image(link):
                        image_urls.append(link)
                    else:
                        page_urls.append(link)

                # Remove the URLs from the text so they are not indexed as words.
                for link in style_urls + image_urls + page_urls:
                    page = page.replace(link, "")
                image_url_str = json.dumps(_number_unique(image_urls))
                new_url_str = json.dumps(_number_unique(page_urls))

                titles = re.findall(r'<title[\s\S]*?>([\s\S]*?)</title>', page)
                meta_by_name, meta_values = _collect_meta(page)
                meta_str = json.dumps(meta_by_name)

                # Text between tags, followed by the meta contents.
                fragments = re.findall(r'>([\s\S]*?)<', page)
                fragments.extend(meta_values)
                cleaned = []
                for fragment in fragments:
                    # Line breaks and tabs go; other punctuation is kept for
                    # the tokenizer.
                    for filler in ['\r','\n','\t',' ']:
                        fragment = fragment.replace(filler, "")
                    fragment = fragment.strip()
                    if fragment:
                        cleaned.append(fragment)

                sentences, words, links = _index_words(cleaned)

                result['title'] = titles[0] if titles else url
                result['mate'] = meta_str
                result['sents'] = json.dumps(sentences)
                result['words'] = json.dumps(words)
                result['links'] = json.dumps(links)
                result['images'] = image_url_str
                result['new_url'] = new_url_str
                result['htmlx'] = snapshot
                print("分离数据成功")
                print("url:")
                print(result['url'])
                print("status code:")
                print(result['status'])
                print("content type:")
                print(result['contenttype'])
                print("Title:")
                print(result['title'])
                print("Sents const:")
                print(len(sentences))
                print("Words const:")
                print(len(words))
                print("Links const:")
                print(len(links))
            else:
                result['title'] = url
                print("分离数据成功 no html")
                print("url:")
                print(result['url'])
                print("status code:")
                print(result['status'])
                print("content type:")
                print(result['contenttype'])
                print("Title:")
                print(result['title'])
        finally:
            # Release the connection whether or not processing succeeded.
            r.close()
    except Exception:
        # Best effort: any failure (network or processing) is reported to the
        # caller as an unreachable page. KeyboardInterrupt/SystemExit are no
        # longer swallowed.
        print("连接失败")
        result['status'] = 0
        result['contenttype'] = "Cant connect"
    return result
def getandpost(bgurl):
    """Crawl *bgurl* with get_page_words and POST the result to the collector.

    The server reply is printed on HTTP 200; for any other status the status
    code is printed. Nothing is returned.
    """
    page_data = get_page_words(bgurl)
    # NOTE(review): the shared secret is hard-coded in source; consider
    # loading it from configuration.
    page_data['password'] = "(*&RV^*(&VRH*(V)))"
    print("本地爬取完成,开始发送\n")
    post_url = "https://lmve.net/php/reptile.php"
    r1 = requests.post(post_url, data=page_data)
    try:
        if r1.status_code == 200:
            html_text = str(r1.content, 'utf-8')
            print("发送成功:200\n")
            print(html_text)
        else:
            # The original referenced an undefined name `r` here, which
            # raised NameError on every non-200 reply.
            print("error" + str(r1.status_code))
    finally:
        # Close the response on both the success and the error path.
        r1.close()
def getoneurl():
    """Ask the server for the next URL to crawl.

    Returns:
        The response body decoded as UTF-8 on HTTP 200, otherwise the string
        "-1" (non-200 reply, network error, or a body/console encoding error).
    """
    # NOTE(review): the shared secret is hard-coded in source; consider
    # loading it from configuration.
    page_data = {'password': "(*&RV^*(&VRH*(V)))"}
    post_url = "https://lmve.net/php/getoneurl.php"
    try:
        r = requests.post(post_url, data=page_data)
        try:
            if r.status_code == 200:
                html_text = str(r.content, 'utf-8')
                print("获取url成功:")
                print(html_text)
                print("\n")
                return html_text
        finally:
            # Release the connection on every path, including non-200 replies.
            r.close()
    except (requests.RequestException, UnicodeError):
        # Network failures and encoding problems are reported exactly like a
        # non-200 reply below; KeyboardInterrupt is no longer swallowed.
        pass
    print("获取url失败\n")
    return "-1"
# Smoke run executed when the module is loaded: crawl one page and show its
# word table.
theurl = "https://lmve.net"
redata = get_page_words(theurl)
# 'words' is only present when the page was fetched and parsed as text/html;
# fall back to an empty JSON object instead of raising KeyError.
print(redata.get('words', "{}"))