From ab81ca865c34a12a172aec81bee36486edc790a8 Mon Sep 17 00:00:00 2001 From: kevin Date: Wed, 31 Jan 2024 23:09:11 +0800 Subject: [PATCH] init Signed-off-by: kevin --- .gitignore | 2 + data_demo.txt | 96 ++++++++++ myfunsion.py | 417 ++++++++++++++++++++++++++++++++++++++++++++ post.py | 19 ++ reptile.py | 374 +++++++++++++++++++++++++++++++++++++++ test.txt | 6 + 搜索引擎工作思路.md | 29 +++ 7 files changed, 943 insertions(+) create mode 100644 .gitignore create mode 100644 data_demo.txt create mode 100644 myfunsion.py create mode 100644 post.py create mode 100644 reptile.py create mode 100644 test.txt create mode 100644 搜索引擎工作思路.md diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..3d200cb --- /dev/null +++ b/.gitignore @@ -0,0 +1,2 @@ +__pycache__ +.vs diff --git a/data_demo.txt b/data_demo.txt new file mode 100644 index 0000000..c26e88e --- /dev/null +++ b/data_demo.txt @@ -0,0 +1,96 @@ +begin_url:https://wnfed.com +image: +['https://wnfed.com/usr/themes/handsome/assets/img/sj2/1.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/10.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/4.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/9.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/2.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/1.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/10.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/4.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/9.jpg', 'https://wnfed.com/usr/themes/handsome/assets/img/sj2/2.jpg'] +new_url: +['https://wnfed.com/action/xmlrpc', 'https://wnfed.com/action/xmlrpc?rsd', 'https://wnfed.com/action/xmlrpc?wlw', 'https://wnfed.com/feed/', 'https://wnfed.com/feed/rss/', 'https://wnfed.com/feed/atom/', 'https://wnfed.com/', 'https://wnfed.com/cross.html', +'https://wnfed.com/cross.html', 'https://wnfed.com/cross.html', 'https://wnfed.com/index.php/action/login?_=0b839b99f2d1bd12725b159914ec85d3', 'https://wnfed.com/', 'https://wnfed.com/cross.html', 'https://wnfed.com/', 'https://wnfed.com/index.php/category/image/', 'https://wnfed.com/index.php/cross.html', 'https://wnfed.com/index.php/8.html', 'https://wnfed.com/index.php/7.html', 'https://wnfed.com/index.php/12.html', 'https://wnfed.com/index.php/838.html', 'https://wnfed.com/index.php/archives/758/', 'https://www.ultralibrarian.com/', 'https://www.51zxw.net/', 'https://www.foreverblog.cn/', 'https://www.foreverblog.cn/feeds.html', 'https://www.foreverblog.cn/go.html', 'https://wnfed.com/category/default/', 'https://wnfed.com/category/image/', 'https://wnfed.com/category/DATA/', 'https://wnfed.com/category/A/', 'https://wnfed.com/category/B/', 'https://wnfed.com/category/C/', 'https://wnfed.com/category/D/', 'https://wnfed.com/category/E/', 'https://wnfed.com/category/F/', 'https://wnfed.com/category/EN/', 'https://wnfed.com/category/G/', 'https://wnfed.com/category/H/', 'https://wnfed.com/840.html', 'https://wnfed.com/838.html', 'https://wnfed.com/12.html', 'https://wnfed.com/cross.html', 'https://wnfed.com/7.html', 'https://wnfed.com/8.html', 'https://lmve.net', 'http://aheboke.com/', 'https://www.ujslxw.com/', 'https://yyyyyyounger.com/', 'https://www.57blog.cn/', 'https://wnfed.com/admin/', +'https://wnfed.com/feed/', 'https://wnfed.com/feed/comments/', 'https://wnfed.com/archives/1071/', 'https://wnfed.com/archives/1071/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/1071/#comments', 'https://wnfed.com/archives/1066/', 'https://wnfed.com/archives/1066/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/1066/#comments', 'https://wnfed.com/archives/1031/', 'https://wnfed.com/archives/1031/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/1031/#comments', 'https://wnfed.com/archives/1018/', 'https://wnfed.com/archives/1018/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/1018/#comments', 'https://wnfed.com/archives/959/', 'https://wnfed.com/archives/959/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/959/#comments', 'https://wnfed.com/archives/945/', 'https://wnfed.com/archives/945/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/945/#comments', 'https://wnfed.com/archives/941/', 'https://wnfed.com/archives/941/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/941/#comments', 'https://wnfed.com/archives/933/', 'https://wnfed.com/archives/933/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/933/#comments', 'https://wnfed.com/archives/924/', 'https://wnfed.com/archives/924/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/924/#comments', 'https://wnfed.com/archives/899/', 'https://wnfed.com/archives/899/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/899/#comments', 'https://wnfed.com/archives/874/', 'https://wnfed.com/archives/874/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/874/#comments', 'https://wnfed.com/archives/871/', 'https://wnfed.com/archives/871/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/871/#comments', 'https://wnfed.com/archives/851/', 'https://wnfed.com/archives/851/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/851/#comments', 'https://wnfed.com/archives/846/', 'https://wnfed.com/archives/846/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/846/#comments', 'https://wnfed.com/archives/811/', 'https://wnfed.com/archives/811/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/811/#comments', 'https://wnfed.com/archives/808/', 'https://wnfed.com/archives/808/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/808/#comments', 'https://wnfed.com/archives/789/', 'https://wnfed.com/archives/789/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/789/#comments', 'https://wnfed.com/archives/778/', 'https://wnfed.com/archives/778/', 'https://wnfed.com/archives/778/', 'https://wnfed.com/archives/762/', 'https://wnfed.com/archives/762/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/762/#comments', 'https://wnfed.com/archives/757/', 'https://wnfed.com/archives/757/', 'https://wnfed.com/author/1/', 'https://wnfed.com/archives/757/#comments', 'https://wnfed.com/page/1/', 'https://wnfed.com/page/2/', 'https://wnfed.com/page/3/', 'https://wnfed.com/page/4/', 'https://wnfed.com/page/6/', 'https://wnfed.com/page/2/', 'https://wnfed.com/archives/959/', 'https://wnfed.com/archives/959/', 'https://wnfed.com/archives/945/', 'https://wnfed.com/archives/945/', 'https://wnfed.com/archives/874/', 'https://wnfed.com/archives/874/', 'https://wnfed.com/archives/899/', +'https://wnfed.com/archives/899/', 'https://wnfed.com/archives/924/', 'https://wnfed.com/archives/924/', 'https://wnfed.com/archives/1066/#comment-190', 'https://secure.gravatar.com/avatar/9007c32dd9dd47971f12f111809e4e50?s=65&r=G&d=', 'https://wnfed.com/archives/1066/#comment-190', 'https://wnfed.com/archives/1066/#comment-190', 'https://wnfed.com/archives/1071/#comment-188', 'https://secure.gravatar.com/avatar/05adf5179ce2ed8b675ce0949495b621?s=65&r=G&d=', 'https://wnfed.com/archives/1071/#comment-188', 'https://wnfed.com/archives/1071/#comment-188', 'https://wnfed.com/archives/1018/#comment-183', 'https://q2.qlogo.cn/g?b=qq&nk=3320870795&s=100', 'https://wnfed.com/archives/1018/#comment-183', 'https://wnfed.com/archives/1018/#comment-183', 'https://wnfed.com/840.html#comment-179', 'https://q2.qlogo.cn/g?b=qq&nk=3320870795&s=100', 'https://wnfed.com/840.html#comment-179', 'https://wnfed.com/840.html#comment-179', 'https://wnfed.com/840.html#comment-176', 'https://q2.qlogo.cn/g?b=qq&nk=3320870795&s=100', 'https://wnfed.com/840.html#comment-176', 'https://wnfed.com/840.html#comment-176', 'https://wnfed.com/archives/1013/', 'https://wnfed.com/archives/1013/', 'https://wnfed.com/archives/233/', 'https://wnfed.com/archives/233/', 'https://wnfed.com/archives/289/', 'https://wnfed.com/archives/289/', 'https://wnfed.com/archives/268/', 'https://wnfed.com/archives/268/', 'https://wnfed.com/archives/212/', 'https://wnfed.com/archives/212/', 'https://wnfed.com/tag/10/', 'https://wnfed.com/tag/C/', 'https://wnfed.com/tag/%E6%8E%A7%E5%88%B6%E5%8F%B0/', 'https://wnfed.com/tag/8/', 'https://wnfed.com/tag/7/', 'https://wnfed.com/tag/QT/', 'https://wnfed.com/tag/5/', 'https://wnfed.com/tag/6/', 'https://wnfed.com/tag/BB/', 'https://wnfed.com/tag/stm32/', 'https://wnfed.com/tag/HAL/', 'https://wnfed.com/tag/arduino/', 'https://wnfed.com/tag/16/', 'https://wnfed.com/tag/15/', 'https://wnfed.com/tag/world-skills/', 'https://wnfed.com/tag/13/', 'https://wnfed.com/tag/2021/', 'https://wnfed.com/tag/12/', 'https://wnfed.com/tag/11/', 'https://wnfed.com/tag/9/', 'https://wnfed.com/tag/DFU/', 'https://wnfed.com/tag/PHP/', 'https://wnfed.com/tag/MYSQL/', 'https://wnfed.com/tag/OpenCV/', 'https://wnfed.com/tag/Linux/', 'https://wnfed.com/tag/LCD/', 'https://wnfed.com/tag/4/', 'https://wnfed.com/tag/FSMC/', 'https://wnfed.com/tag/FAT32/', 'https://wnfed.com/tag/Fatfs/', 'https://www.ihewro.com/archives/489/', 'https://www.ddg.ink/', +'http://www.typecho.org', 'https://www.ihewro.com/archives/489/'] +title: +['无闻风 - Blog'] +mate: +['爱生活更爱技术,文能放下键盘敲代码,武能拿起烙铁焊电路。', '吳文峰,无闻风,博客', 'Typecho 1.1/17.10.30', 'handsome'] +html: +无闻风 - Blog

无闻风

加载中……

World skills 2021 澳门区宣传片

© 2022 Copyright Powered by our story...
+words: +['无', '闻风', 'Blog', 'WNFED', '动态', '日历', 'Loading', '分类', '雷达', '图', '发布', '统计', '统计图', '标签', '碎语', '闲言 +碎语', '邮箱', '炸', '了', 'March', '23rd', '2022', 'at', '03', ':', '20', 'pm', '新', '版本', 'Kicad', '依然', '无法', '在', '原 +生', 'ARM', '运行', 'December', '26th', '2021', '12', '57', '吵死', '人', '隔壁', '又', '装修', '11th', '11', '34', 'am', '用户', '户名', '用户名', '密码', '登录', '中', '刷新', '页面', '后', 'Kevin', '爱', '生活', '更爱', '技术', '文能', '放下', '键盘', '敲 +', '代码', '武能', '拿', '起', '烙铁', '焊', '电路', '下午', '好', '是', '时候', '打个', '盹', '导航', '首页', '相册', '闲言', ' +留言', '归档', '观影', '仓库', '记事', '推荐', 'ultralibrarian', '我要', '自学', '网', '十年', '之约', '聚合', '虫', '洞', '组成', '18', '瞎', '写', '14', '资源', '21', '单片', '单片机', '编程', '29', '计算', '算机', '计算机', '19', '杂谈', '4', '折腾', '瞎 +折腾', '1', '音乐', '5', '视频', '3', 'EN', '2', '啊', '8', '资讯', '友情', '链接', '友情链接', '走马', '走马观花', '友链', '电子 +', '硬件', '社区', '阿', '和', '博客', '刘', '希望', 'Y', '哥', 'の', '小站', '无心', '管理', '文章', '评论', '加载', '…', 'Git', '服务', '务器', '服务器', '从', 'gitblit', '升级', '到', 'gitlab', '其实', '挺好用', '的', '就是', '功能', '有点', '少', '玩玩', 'Gitlab', 'lt', '点击', '进入', '安装', '方法', '去', '官网', '参考', '命令', '主要', '就', '两条', 'curl', 'https', '/', 'packages', 'com', 'install', 'repositories', 'ee', 'script', 'rpm', 'sh', 'sudo', 'bashsudo', 'EXTERNAL', 'URL', '=', 'qu', '年', '月', '24', '日', '条', 'QT', '学习', '作品', '多线', '线程', '多线程', 'IP', '扫描', '扫描器', '摸', '很', '时间', '长时间', '把', '捡起', '起来', '捡起来', '突然', '想到', '一个', '项目', '想', '一下', '网上', '有', '多少', '隐藏', 'Minecraft', '也', '很多', '我', '用', '过来', '01', '08', '小', '装', '个', 'OpenWrt', '做', '端口', '转发', '起因', '最新', '发现', '即使', '局域', '局域网 +', '网站', '大量', '图片', '会', '延时', '有时', '有时候', '长达', '秒', '不止', '其他', '比如', '猜测', '辣鸡', '路由', '路由器', '暂无', '使用', 'Gitblit', '搭建', '个人', '小型', 'git', 'lmve', '入口', '这个', '很久', '以前', '很久以前', '已', '将', '一直 +', '效果', '还', '不错', '足够', '给力', '先去', '下载', '包', 'JAVA', '解压', '里面', '应该', '这些', '文件', '修改', '先', 'cd', '执行', '路径', '改成', '当前', '这里', 'java', 'cp', 'quot', 'jar', 'ext', 'gitb', '04', '可', '什么', '没什么', '开服', '经验 +', '只是', 'Spigot', 'ubuntu', '上开', '插件', '服', '117', '可以', '通过', 'net', 'wnfed', '这', '两个', '域名', '测试', '过', '性能', '可好', 'aru', 'thumb', '鞘翅', '到处', '飞', '都', '不卡', '群辉', '搭', '服用', '飞会', '卡', '换', '机箱', '眼馋', '小 +贵', '最近', '才', '狠下', '心', '放血', '购买', '旧', '原因', '太', '‘', '赛博', '朋克', '’', '摇摇', '摇晃', '晃晃', '摇摇晃晃', '经过', '几次', '丢失', '数据', '真的', '怕', '蜗牛', '回档', '前', '几天', '公司', '摸鱼', '服务器上安装', '替代', '现在', '试 +', '多次', '没', '成功', '好像', '自带', 'nginx', '宝塔', '冲突', '重装', '注意', 'Esxi', '硬盘', '满', '然后', '哪里', '没有', '办法', '只能', '重启', '以后', '系统', '进不去', '挂', '问题', '不', '大', '最', '多', '重装系统', '麻烦', '一点', '等', '开始', +'不对', '不对经', '先是', '恢复', 'MySQL', '各种', '报错', '忽略', '错误', '装好', '东', 'MAX30102', '血氧', '心率', '传感', '感 +器', '传感器', '开发', '研究', '文档', 'pdf', '平台', 'stm32f103ze', 'm3s', '开发板', '连接', '方式', 'MAX3012', 'IICLCD', 'sfmcLCD', 'touc', '31', '常用', '常用命令', '默认', '名称', 'master', '分支', 'origin', '远程', '库', 'Head', '^', '父', '提交', '创建 +', 'clone', 'url', 'gt', '克隆', 'init', '初始', '初始化', '本地', 'status', '查看', '状态', 'diff', '07', '05', '黑群晖', '记', +'自从', '开通', '公网', '我用', '一台', '退役', '游戏', '本', 'winserver', '而且', '已经', '开机', '不多', '差不多', '两年', '这 +台', '经历', '大大', '大小', '小小', '大大小小', '灾难', '存活', '至今', '幸好', '30', 'C', '控制', '控制台', '华容', '华容道', '小游戏', '去年', '年底', '玩法', '方向', '方向键', '空格', '随机', '打乱', '按下', '字母', 'a', '自动', '还原', '介绍', '递归', '实现', '搜索', '简单', '驱动', '某', '坐标', '写入', '字符', 'HANDLE', 'hOu', '2020', '06', '2048', '意思', '一些', '函数', 'hOut', 'GetStdHandle', 'STD', 'OUTPUT', 'CONSOLE', 'CU', '推', '箱子', '里', '躺', '久', '贴出', '出来', '贴出来', '分享', '自定', ' +定义', '自定义', '关卡', 'RGB', '加持', 'RBG', 'envious', 'and', '获取', '按键', 'HAN', '一', '笔画', '完', '2019', '某月', '程序 +', '角色', '一次', '一次性', '走', '所有', '格子', '重复', '操作', '按', '蜂鸣器', '以', '链表', '播放', '下', '而已', '建议', ' +容易', '爆', '内存', 'struct', 'notes', '音符', 'uint16', 't', 'freq', '频率', 'uint8', 'duty', '占空比', 'deley', '时长', 'next', 'note', '储存', '结构', '体', 'void', '10', '更', '编码', '码器', '编码器', 'encoder', 'c#', 'include', 'h', 'en', '#', 'define', 'BU', 'stm32', 'DFU', 'HAL', '下载方式', '多种', 'SW', '不过', '这种', '占用', 'IO', '口', '虽然', '复用', '但是', 'pcb', '设 +计', '、', '后期', '一定', '不便', '所以', '加入', 'D', 'World', 'skills', '澳门', '区', '宣传', '宣传片', 'Windows', 'server', '2016', '评估', '板', '完整', '整版', '完整版', '版', '限制', '控制器', '域控制器', '最好', '第一', 'Server', 'Eval', 'Datacenter', 'Standard', 'Full', '转换', '为', '许可', '要', 'Evaluation', '您', '需要', '公共', 'KMS', '23', 'STM32', '+', 'MPU6050', 'MXcube', '启动', 'IIC', '接口', 'extern', 'I2C', 'HandleTypeDef', 'hi2c1', '字节', 'reg', '寄存', '寄存器', '地址', 'da', '6', '热门', '数', '芭比', '看不懂', '萧瑟', '冒昧', '问', '一句', '私服', '云', '上', '还是', '自己', '家里', 'v', '哇', '更新', '头像', 'cdn', 'jsdeli', '您好', '介于', '未', '看到', '本站', '故', '决定', '取消', '贵', '站友', '链', '请悉', '知', '中山', '中山市', ' +技师', '学院', '0', 'eagle', '仿真', '电容', '属性', '简易', 'P', '站', '爬虫', '脚本', '中文', '英文', '手册', '使用手册', 'DS3231', '信息', '130', '数目', '153', '192', '天', '天数', '最后', '活动', '算法', '笔记', 'C++', 'BB', 'arduino', '无人', '人机', '无人机', '四', '旋翼', '飞行', '飞行器', 'world', '世界', '职业', '技能', '职业技能', '大赛', '语言', 'C语言', 'PHP', 'MYSQL', 'OpenCV', 'Linux', 'LCD', '彩屏', 'FSMC', 'FAT32', 'Fatfs', '目录', 'Powered', 'by', 'typecho', 'Theme', 'handsome', 'Copyright', 'Typecho', 'nbsp', 'copy', 'our', 'story'] \ No newline at end of file diff --git a/myfunsion.py b/myfunsion.py new file mode 100644 index 0000000..5e30fb9 --- /dev/null +++ b/myfunsion.py @@ -0,0 +1,417 @@ +import requests +import re +import html +import json +import jieba +from urllib.parse import urlparse +class myfunsion: + def a(): + print("test") + +class det:#测试url类型 + def image(url): + style_type_list=[".jpg",".jpeg",".png",".gif",".svg"] + #将url转小写 + url=url.lower() + for i in style_type_list: + if i in url: + return True + return False + + def style(url): + style_type_list=[".js",".css"] + url=url.lower() + for i in style_type_list: + if i in url: + return True + return False + + #检查是否有重复 + # arr 数组 + # i 数据 + def repeat(arr,i): + b=0 + for a in arr: + if a==i: + return b + b+=1 + return -1 + + #检测url域名 + def domin(url): + domin=urlparse(url) + return domin + +class net: + headers = { + 'user-agent': 'my-app/0.0.1', + 'Content-Type': 'text/html; charset=utf-8' + } + def get_html(url): + return_js={} + return_js['url']=url + try : + r = requests.get(url,headers=net.headers,verify=False,timeout=(60, 60)) + return_js['status']=r.status_code + get_content_type='text/html' + content_type=re.findall(get_content_type,r.headers['Content-Type']) + if(len(content_type)==0): + return_js['contenttype']=r.headers['Content-Type'] + else: + return_js['contenttype']=get_content_type + + if(return_js['contenttype']=='text/html'): + get_gb2312='gb2312' + charset=re.findall(get_gb2312,r.headers['Content-Type']) + if(len(charset)==0): + return_js['charset_type']='utf-8' + else: + return_js['charset_type']='gb2312' + htmlx_byt=r.content + htmlx=str(htmlx_byt,return_js['charset_type']) + return_js['html']=htmlx + except: + return_js['status']=0 + + return return_js + + def ana_html(webdata): + return_js={} + return_js['status']=webdata['status'] + return_js['url']=webdata['url'] + if webdata['status']==0: + return_js['score']="0" + return_js['contenttype']="none" + return return_js + return_js['contenttype']=webdata['contenttype'] + #return_js['charset_type']=webdata['charset_type'] + if(return_js['contenttype']=='text/html'): + thisurlscore=100 + htmlx=webdata['html'] + ur=urlparse(return_js['url']) + domain=ur.netloc + return_js['domain']=domain + + + #根url + if(ur.path==''): + thisurlscore+=50 + if(ur.path=='/'): + thisurlscore+=50 + #是否带参数 + if(ur.query!=''): + thisurlscore-=10 + #是否带位置 + if(ur.fragment!=''): + thisurlscore-=50 + #获取js + get_htmlx_js='' + htmlx_js=re.findall(get_htmlx_js,htmlx) + #清除htmlx js + for i in htmlx_js: + htmlx=htmlx.replace(i,"") + + htmlx_kuaizhao=html.escape(htmlx) #干掉js后截取快照 + htmlx_yasuo=["\n","\r","\t"," "] + for i in htmlx_yasuo: + htmlx_kuaizhao=htmlx_kuaizhao.replace(i,"")#压缩一下 + + #获取style + get_htmlx_style='' + htmlx_style=re.findall(get_htmlx_style,htmlx) + #print(htmlx_style) + #清除htmlx style + for i in htmlx_style: + htmlx=htmlx.replace(i,"") + + #获取textarea ,像百度这样的阴间网站有隐藏的输入框里面会有样式什么的奇怪东西 + get_htmlx_textarea='' + htmlx_textarea=re.findall(get_htmlx_textarea,htmlx) + for i in htmlx_textarea: + htmlx=htmlx.replace(i,"") + + #获取页面中的url并分类 + get_url=r'\"(http[s]?://[\S]*)\"' + all_url=re.findall(get_url,htmlx) + style_url_list=[] + image_url_list=[] + new_url_list=[] + for i in all_url: + if det.style(i): + style_url_list.append(i) + else: + if det.image(i): + image_url_list.append(i) + else: + new_url_list.append(i) + + #干掉url + image_url=[] + new_url=[] + image_url_js={} + new_url_js={} + image_url_int=0 + new_url_int=0 + for i in style_url_list: + htmlx=htmlx.replace(i,"") + for i in image_url_list: + htmlx=htmlx.replace(i,"") + if det.repeat(image_url,i)==-1: + image_url.append(i) + image_url_js[str(image_url_int)]=i + image_url_int+=1 + thisurlscore+=2#图片加分 + for i in new_url_list: + htmlx=htmlx.replace(i,"") + if det.repeat(new_url,i)==-1: + new_url.append(i) + new_url_js[str(new_url_int)]=i + new_url_int+=1 + + #pending + image_url_str=json.dumps(image_url_js) + new_url_str=json.dumps(new_url_js) + return_js['images']=image_url_str + return_js['newurls']=new_url_str + + #对新url评分 + new_urlscore_js={} + new_urldomain_js={} + new_url_int=0 + for i in new_url: + score=100 + try : + te=urlparse(i) + + #根url + if(te.path==''): + score+=10 + if(te.path=='/'): + score+=10 + + #是否站内url + if(te.netloc==domain): + score-=10 + else: + score+=10 + thisurlscore+=5#站外url加分 + + #是否带参数 + if(te.query!=''): + score-=10 + + #是否带位置 + if(te.fragment!=''): + score-=50 + except: + score=0 + + new_urlscore_js[str(new_url_int)]=score + new_urldomain_js[str(new_url_int)]=te.netloc + new_url_int+=1 + + new_urlscore_js_str=json.dumps(new_urlscore_js) + new_urldomain_js_str=json.dumps(new_urldomain_js) + return_js['newurlscore']=new_urlscore_js_str + return_js['newurldomain']=new_urldomain_js_str + + + + #获取标题 + get_htmlx_title='([\s\S]*?)' + htmlx_title=re.findall(get_htmlx_title,htmlx) + if(len(htmlx_title)>0): + return_js['title']=htmlx_title[0] + print(htmlx_title) + else: + return_js['title']=return_js['url'] + + #get all mate + get_htmlx_mate_lab='' + htmlx_mate_lab=re.findall(get_htmlx_mate_lab,htmlx) + htmlx_mate=[] + htmlx_mate_str="" + htmlx_mate_js={} + for i in htmlx_mate_lab: + get_htmlx_mate_name='name="([\S ]*?)"' + htmlx_mate_name=re.findall(get_htmlx_mate_name,i) + if len(htmlx_mate_name)==1:#只允许有1个名字 其他阴间标签不要 + get_htmlx_mate_val='content="([\S\s]*?)"' + htmlx_mate_val=re.findall(get_htmlx_mate_val,i) + if len(htmlx_mate_val)==1: + if htmlx_mate_name[0]!='': + #print(htmlx_mate_name[0]) + #print(htmlx_mate_val[0]) + #htmlx_mate_str+="\""+htmlx_mate_name[0]+"\":\""+htmlx_mate_val[0]+"\"," + htmlx_mate_js[htmlx_mate_name[0]]=htmlx_mate_val[0] + htmlx_mate.append(htmlx_mate_val[0]) + htmlx_mate_str=json.dumps(htmlx_mate_js) + return_js['mate']=htmlx_mate_str + + #获取所有标签内容 + get_htmlx_all_tab='>([\s\S]*?)<' + htmlx_tab=re.findall(get_htmlx_all_tab,htmlx) + #print(htmlx_tab) + + #inpute mate + for i in htmlx_mate: + htmlx_tab.append(i) + + #先干掉\r\n\t 保留其他标点符号做语义识别 + get_text_rnt=['\r','\n','\t',' '] + htmlx_tab_len=len(htmlx_tab) + for i in range(htmlx_tab_len-1,-1,-1):#从后往前数 + for t in get_text_rnt:htmlx_tab[i]=htmlx_tab[i].replace(t,"") + htmlx_tab[i]=htmlx_tab[i].strip() + if htmlx_tab[i]=='':del htmlx_tab[i]#删除空 + #print(htmlx_tab) + + get_htmlx_BDFH=["\"","\\","^","’","=","/","、","“","”","#","©","|","_","-"," ","*",";","&","$","%","!","?",",",".","(",")","[","]","{","}","<",">","¥","%","!","?",",","。","(",")","【","】","《","》",":"] + #提取关键词 + htmlx_sents_js={} + htmlx_words_js={} + sents=[] + words=[] + words_int=0 + links_js={} + links_int=0 + sents_int=0 + for i in range(len(htmlx_tab)): + sent=htmlx_tab[i] #获取每个句子 + word=jieba.cut_for_search(sent)#从每个句子获取单词 + #数据库限制每个句子256字符 + sent=sent[0:256] + if det.repeat(sents,sent)==-1: #查重 + sents.append(sent) + htmlx_sents_js[str(sents_int)]=sent + + for t in word: + #删除一些意义不大的符号 + #数据库限制每个单词8字符 + w=t[0:8] + for n in get_htmlx_BDFH: + w=w.replace(n,"") + if w!='': + words_det_int=-1 + words_det_int=det.repeat(words,w) + if words_det_int==-1: + words.append(w) + words_det_int=words_int + htmlx_words_js[str(words_int)]=w + words_int+=1 + links_js[str(links_int)]=str(words_det_int)+":"+str(sents_int) + links_int+=1 + sents_int+=1 + else: + #重复的内容 + thisurlscore-=1 + + return_js['sents_int']=sents_int + return_js['words_int']=words_int + return_js['links_int']=links_int + + htmlx_sents_str=json.dumps(htmlx_sents_js) + htmlx_words_str=json.dumps(htmlx_words_js) + links_str=json.dumps(links_js) + + return_js['sents']=htmlx_sents_str + return_js['words']=htmlx_words_str + return_js['links']=links_str + + return_js['htmlx']=htmlx_kuaizhao + return_js['score']=thisurlscore#对当前url评分 + else: + return_js['title']=return_js['url'] + return return_js + + + + + + + + +test=0 + +getone={} +getone['password']="(*&RV^*(&VRH*(V)))" +getone_post_url="https://lmve.net/php/getoneurl.php" +post_url="https://lmve.net/php/newurlreptile.php" + +if test==0: + while 1: + print("**本地消息**\n获取url..") + try: + r = requests.post(getone_post_url,data=getone) + if r.status_code==200: + html_byt=r.content + html_text=str(html_byt,'utf-8') + print("获取url成功:") + print(html_text) + print("\n") + r.close() + if html_text!='': + print("获取text..") + text=net.get_html(html_text) + print("开始分析..") + postdata=net.ana_html(text) + print("分析完成") + postdata['password']="(*&RV^*(&VRH*(V)))" + if postdata['contenttype']=='text/html': + print("分离数据成功") + print("url:") + print(postdata['url']) + print("status code:") + print(postdata['status']) + print("content type:") + print(postdata['contenttype']) + print("Title:") + print(postdata['title']) + print("Sents const:") + print(postdata['sents_int']) + print("Words const:") + print(postdata['words_int']) + print("Links const:") + print(postdata['links_int']) + else : + print("分离数据成功 no html") + print("url:") + print(postdata['url']) + print("status code:") + print(postdata['status']) + print("content type:") + print(postdata['contenttype']) + print("开始发送..") + try: + r1 = requests.post(post_url,data=postdata) + if r1.status_code==200: + html_byt=r1.content + html_text=str(html_byt,'utf-8') + print("发送成功:200") + print(html_text) + r1.close() + else: + print("error"+str(r1.status_code)) + except: + print("try aga") + else: + print("获取url失败") + except: + print("tyr agin") + + + + + +if test==1: + postdata=net.ana_html(net.get_html("https://www.foreverblog.cn/blog/1958.html")) + postdata['password']="(*&RV^*(&VRH*(V)))" + r1 = requests.post(post_url,data=postdata) + if r1.status_code==200: + html_byt=r1.content + html_text=str(html_byt,'utf-8') + print("发送成功:200\n") + print(html_text) + r1.close() + else: + print("error"+str(r1.status_code)) diff --git a/post.py b/post.py new file mode 100644 index 0000000..218e9c2 --- /dev/null +++ b/post.py @@ -0,0 +1,19 @@ +# encoding:utf-8 +import requests +def main(): + post_url="https://lmve.net/php/reptile.php" + headers = { + 'user-agent': 'my-app/0.0.1', + 'Content-Type': 'text/html; charset=utf-8' + } + data={} + data['password']="adswewd" + + r = requests.post(post_url,data=data) + if r.status_code!=200: + return r.status_code + html_byt=r.content + html=str(html_byt,'utf-8') + print(html) + +main() \ No newline at end of file diff --git a/reptile.py b/reptile.py new file mode 100644 index 0000000..93b3fea --- /dev/null +++ b/reptile.py @@ -0,0 +1,374 @@ +# encoding:utf-8 +#简易爬虫 + +# requests 用于下载网页源码 第三方库 通过pip安装 +# re 正则表达式要用到 +# threading 启用多线程 +# os,sys 用于创建目录 +# time 用于获得当天时间 +from asyncio.windows_events import NULL +import requests +import re +import jieba +import html +import json +from myfunsion import myfunsion +#from bs4 import BeautifulSoup as bs +#from lxml import etree + +def det_image(url): + style_type_list=[".jpg",".jpeg",".png",".gif"] + #将url转小写 + url=url.lower() + for i in style_type_list: + if i in url: + return True + return False + +def det_style(url): + style_type_list=[".js",".css"] + url=url.lower() + for i in style_type_list: + if i in url: + return True + return False + +def det_repeat(arr,i): + b=0 + for a in arr: + if a==i: + return b + b+=1 + return -1 + +def get_page_words(url): + + retunrn_js={} + begin_url=url + headers = { + 'user-agent': 'my-app/0.0.1', + 'Content-Type': 'text/html; charset=utf-8' + } + + retunrn_js['url']=begin_url + #print(begin_url) + #获取整个页面 + try: + r = requests.get(begin_url,headers=headers,verify=False) + retunrn_js['status']=r.status_code + get_content_type='text/html' + content_type=re.findall(get_content_type,r.headers['Content-Type']) + if(len(content_type)==0): + retunrn_js['contenttype']=r.headers['Content-Type'] + else: + retunrn_js['contenttype']=get_content_type + + get_gb2312='gb2312' + charset=re.findall(get_gb2312,r.headers['Content-Type']) + if(len(charset)==0): + charset_type='utf-8' + else: + charset_type='gb2312' + + if(retunrn_js['contenttype']=='text/html'): + + htmlx_byt=r.content + htmlx=str(htmlx_byt,charset_type) + #print(htmlx) + + #获取js + get_htmlx_js='' + htmlx_js=re.findall(get_htmlx_js,htmlx) + #清除htmlx js + for i in htmlx_js: + htmlx=htmlx.replace(i,"") + + htmlx_kuaizhao=html.escape(htmlx) #干掉js后截取快照 + htmlx_yasuo=["\n","\r","\t"," "] + for i in htmlx_yasuo: + htmlx_kuaizhao=htmlx_kuaizhao.replace(i,"")#压缩一下 + + #获取style + get_htmlx_style='' + htmlx_style=re.findall(get_htmlx_style,htmlx) + #print(htmlx_style) + #清除htmlx style + for i in htmlx_style: + htmlx=htmlx.replace(i,"") + + #print(htmlx) + + #获取textarea ,像百度这样的阴间网站有隐藏的输入框里面会有样式什么的奇怪东西 + get_htmlx_textarea='' + htmlx_textarea=re.findall(get_htmlx_textarea,htmlx) + for i in htmlx_textarea: + htmlx=htmlx.replace(i,"") + + #获取页面中的url并分类 + get_url=r'"(http[s]?://[\S]*)"' + all_url=re.findall(get_url,htmlx) + style_url_list=[] + image_url_list=[] + new_url_list=[] + for i in all_url: + if det_style(i): + style_url_list.append(i) + else: + if det_image(i): + image_url_list.append(i) + else: + new_url_list.append(i) + + #干掉url + image_url=[] + new_url=[] + image_url_js={} + new_url_js={} + image_url_int=0 + new_url_int=0 + for i in style_url_list: + htmlx=htmlx.replace(i,"") + for i in image_url_list: + htmlx=htmlx.replace(i,"") + if det_repeat(image_url,i)==-1: + image_url.append(i) + image_url_js[str(image_url_int)]=i + image_url_int+=1 + for i in new_url_list: + htmlx=htmlx.replace(i,"") + if det_repeat(new_url,i)==-1: + new_url.append(i) + new_url_js[str(new_url_int)]=i + new_url_int+=1 + + image_url_str=json.dumps(image_url_js) + new_url_str=json.dumps(new_url_js) + + #获取标签内容 + #soup=bs(htmlx,'htmlx.parser') + #dom = etree.htmlx(htmlx) + #print(soup.body) + + #获取标题 + get_htmlx_title='([\s\S]*?)' + htmlx_title=re.findall(get_htmlx_title,htmlx) + #print(htmlx_title) + + #get all mate + get_htmlx_mate_lab='' + htmlx_mate_lab=re.findall(get_htmlx_mate_lab,htmlx) + htmlx_mate=[] + htmlx_mate_str="" + htmlx_mate_js={} + for i in htmlx_mate_lab: + get_htmlx_mate_name='name="([\S ]*?)"' + htmlx_mate_name=re.findall(get_htmlx_mate_name,i) + if len(htmlx_mate_name)==1:#只允许有1个名字 其他阴间标签不要 + get_htmlx_mate_val='content="([\S\s]*?)"' + htmlx_mate_val=re.findall(get_htmlx_mate_val,i) + if len(htmlx_mate_val)==1: + if htmlx_mate_name[0]!='': + #print(htmlx_mate_name[0]) + #print(htmlx_mate_val[0]) + #htmlx_mate_str+="\""+htmlx_mate_name[0]+"\":\""+htmlx_mate_val[0]+"\"," + htmlx_mate_js[htmlx_mate_name[0]]=htmlx_mate_val[0] + htmlx_mate.append(htmlx_mate_val[0]) + htmlx_mate_str=json.dumps(htmlx_mate_js) + + #获取所有标签内容 + get_htmlx_all_tab='>([\s\S]*?)<' + htmlx_tab=re.findall(get_htmlx_all_tab,htmlx) + #print(htmlx_tab) + + #inpute mate + for i in htmlx_mate: + htmlx_tab.append(i) + + #先干掉\r\n\t 保留其他标点符号做语义识别 + get_text_rnt=['\r','\n','\t',' '] + htmlx_tab_len=len(htmlx_tab) + for i in range(htmlx_tab_len-1,-1,-1):#从后往前数 + for t in get_text_rnt:htmlx_tab[i]=htmlx_tab[i].replace(t,"") + htmlx_tab[i]=htmlx_tab[i].strip() + if htmlx_tab[i]=='':del htmlx_tab[i]#删除空 + #print(htmlx_tab) + + get_htmlx_BDFH=["\"","\\","^","’","=","/","、","“","”","#","©","|","_","-"," ","*",";","&","$","%","!","?",",",".","(",")","[","]","{","}","<",">","¥","%","!","?",",","。","(",")","【","】","《","》",":"] + #提取关键词 + htmlx_sents_js={} + htmlx_words_js={} + sents=[] + words=[] + words_int=0 + links_js={} + links_int=0 + sents_int=0 + for i in range(len(htmlx_tab)): + sent=htmlx_tab[i] #获取每个句子 + word=jieba.cut_for_search(sent)#从每个句子获取单词 + #数据库限制每个句子256字符 + sent=sent[0:256] + if det_repeat(sents,sent)==-1: #查重 + sents.append(sent) + htmlx_sents_js[str(sents_int)]=sent + + for t in word: + #删除一些意义不大的符号 + #数据库限制每个单词8字符 + w=t[0:8] + for n in get_htmlx_BDFH: + w=w.replace(n,"") + if w!='': + words_det_int=-1 + words_det_int=det_repeat(words,w) + if words_det_int==-1: + words.append(w) + words_det_int=words_int + htmlx_words_js[str(words_int)]=w + words_int+=1 + links_js[str(links_int)]=str(words_det_int)+":"+str(sents_int) + links_int+=1 + sents_int+=1 + + htmlx_sents_str=json.dumps(htmlx_sents_js) + htmlx_words_str=json.dumps(htmlx_words_js) + links_str=json.dumps(links_js) + + + + if len(htmlx_title)==0: + retunrn_js['title']=begin_url + else: + retunrn_js['title']=htmlx_title[0] + retunrn_js['mate']=htmlx_mate_str + retunrn_js['sents']=htmlx_sents_str + retunrn_js['words']=htmlx_words_str + retunrn_js['links']=links_str + retunrn_js['images']=image_url_str + retunrn_js['new_url']=new_url_str + retunrn_js['htmlx']=htmlx_kuaizhao + + print("分离数据成功") + print("url:") + print(retunrn_js['url']) + print("status code:") + print(retunrn_js['status']) + print("content type:") + print(retunrn_js['contenttype']) + print("Title:") + print(retunrn_js['title']) + print("Sents const:") + print(sents_int) + print("Words const:") + print(words_int) + print("Links const:") + print(links_int) + else: + retunrn_js['title']=begin_url + print("分离数据成功 no html") + print("url:") + print(retunrn_js['url']) + print("status code:") + print(retunrn_js['status']) + print("content type:") + print(retunrn_js['contenttype']) + print("Title:") + print(retunrn_js['title']) + r.close() + except: + print("连接失败") + retunrn_js['status']=0 + retunrn_js['contenttype']="Cant connect" + + + return retunrn_js + + +def getandpost(bgurl): + page_data={} + page_data=get_page_words(bgurl) + page_data['password']="(*&RV^*(&VRH*(V)))" + print("本地爬取完成,开始发送\n") + post_url="https://lmve.net/php/reptile.php" + r1 = requests.post(post_url,data=page_data) + if r1.status_code==200: + html_byt=r1.content + html_text=str(html_byt,'utf-8') + print("发送成功:200\n") + print(html_text) + r1.close() + else: + print("error"+str(r.status_code)) + +def getoneurl(): + page_data={} + page_data['password']="(*&RV^*(&VRH*(V)))" + post_url="https://lmve.net/php/getoneurl.php" + try : + r = requests.post(post_url,data=page_data) + if r.status_code==200: + html_byt=r.content + html_text=str(html_byt,'utf-8') + print("获取url成功:") + print(html_text) + print("\n") + r.close() + return html_text + else: + print("获取url失败\n") + return "-1" + except: + print("获取url失败\n") + return "-1" + + + +test=1 + +if test==0: + while 1: + theurl=getoneurl() + if theurl!='-1': + getandpost(theurl) + else : + print("获取url失败,自动重试") + +if test==1: + theurl="https://lmve.net" + getandpost(theurl) + +if test==2: + theurl=getoneurl() + print(theurl) + getandpost(theurl) + +if test==3: + headers = { + 'user-agent': 'my-app/0.0.1', + 'Content-Type': 'text/html; charset=utf-8' + } + + #print(begin_url) + #获取整个页面 + url1="https://git.lmve.net/kevin/um-all-index-web/-/avatar" + url2="https://yyyyyyounger.com/" + try : + r = requests.get(url2,headers=headers,verify=False) + htmlx_byt=r.content + print(r.headers['Content-Type']) + except: + print("11") +if test==4: + #js=get_page_words("https://lmve.net") + #print(js) + myfunsion.a() + + + + + + + + + + diff --git a/test.txt b/test.txt new file mode 100644 index 0000000..16b0a2e --- /dev/null +++ b/test.txt @@ -0,0 +1,6 @@ +百度一下,你就知道