From f6d0da4ff15e8a5ff2a4c2e64f93b894d367f56e Mon Sep 17 00:00:00 2001
From: fong
Date: Mon, 29 Apr 2024 10:28:06 +0800
Subject: [PATCH] php

Signed-off-by: fong
---
 php/getoneurl.php     |  30 ++++
 php/lmve_net.sql      | 339 ++++++++++++++++++++++++++++++++++++++++++
 php/newurlreptile.php | 236 +++++++++++++++++++++++++++++
 php/search.php        |  31 ++++
 reptile.py            |  52 ++++++-
 5 files changed, 684 insertions(+), 4 deletions(-)
 create mode 100644 php/getoneurl.php
 create mode 100644 php/lmve_net.sql
 create mode 100644 php/newurlreptile.php
 create mode 100644 php/search.php

diff --git a/php/getoneurl.php b/php/getoneurl.php
new file mode 100644
index 0000000..e770777
--- /dev/null
+++ b/php/getoneurl.php
@@ -0,0 +1,30 @@
+conetdb(MYSQL_USERNAME,MYSQL_PASSWORD,MYSQL_DBNAME);
+$nowtime=date('Y-m-d H:i:s'); // timestamp written to lastupdata whenever a URL is handed out
+$re=$db1->read_data_on_tab("lmve_newurls",array("flag"=>"1"),array("lastupdata"=>"ASC","score"=>"DESC"),"0,1"); // one flag=1 (already handed out) row, presumably the least recently updated
+if(empty($re))
+{
+    // Nothing is handed out yet: issue one flag=0 URL and mark it flag=1. An empty queue prints nothing.
+    $re=$db1->read_data_on_tab("lmve_newurls",array("flag"=>"0"),array("lastupdata"=>"ASC","score"=>"DESC"),"0,1");
+    if(!empty($re)) $db1->updata_on_tab("lmve_newurls",array("id"=>$re[0]['id']),array("flag"=>"1","lastupdata"=>$nowtime));
+    print_r(empty($re)?"":$re[0]['url']);
+}else
+{
+    if(strtotime($nowtime)-strtotime($re[0]['lastupdata'])>600)// this handed-out URL has gone 10 minutes without an update: issue it again
+    {
+        $db1->updata_on_tab("lmve_newurls",array("id"=>$re[0]['id']),array("lastupdata"=>$nowtime));
+        print_r($re[0]['url']);
+    }else
+    {
+        $re=$db1->read_data_on_tab("lmve_newurls",array("flag"=>"0"),array("lastupdata"=>"ASC","score"=>"DESC"),"0,1");
+        if(!empty($re)) $db1->updata_on_tab("lmve_newurls",array("id"=>$re[0]['id']),array("flag"=>"1","lastupdata"=>$nowtime));
+        print_r(empty($re)?"":$re[0]['url']);
+    }
+}
+// NOTE(review): reptile.py's crawl loop compares the fetched URL against '-1'; confirm whether an empty queue should print that sentinel instead of an empty body.
+?>
\ No newline at end of file
diff --git a/php/lmve_net.sql b/php/lmve_net.sql
new file mode 100644
index 0000000..d4c6b04
--- /dev/null
+++ b/php/lmve_net.sql
@@ -0,0 +1,339 @@
+-- phpMyAdmin SQL Dump
+-- version 5.2.0
+-- https://www.phpmyadmin.net/
+--
+-- Host: localhost
+-- Generation Time: 2022-09-11 16:46:13
+-- Server version: 5.6.50-log
+-- PHP
Version: 7.4.28
+
+SET SQL_MODE = "NO_AUTO_VALUE_ON_ZERO";
+START TRANSACTION;
+SET time_zone = "+00:00";
+
+
+/*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
+/*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
+/*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
+/*!40101 SET NAMES utf8mb4 */;
+
+--
+-- Database: `lmve_net`
+--
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_domainlinksents` (links a domain to a sentence; used by newurlreptile.php to count repeats per domain)
+--
+
+CREATE TABLE `lmve_domainlinksents` (
+  `id` int(11) NOT NULL,
+  `domainid` int(11) NOT NULL,
+  `sentid` int(11) NOT NULL,
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_domains`
+--
+
+CREATE TABLE `lmve_domains` (
+  `id` int(11) NOT NULL,
+  `domain` varchar(64) NOT NULL,
+  `score` int(8) NOT NULL DEFAULT '0',
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_imgs`
+--
+
+CREATE TABLE `lmve_imgs` (
+  `id` int(11) NOT NULL,
+  `imgurl` varchar(256) NOT NULL,
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_imgslinkurls`
+--
+
+CREATE TABLE `lmve_imgslinkurls` (
+  `id` int(11) NOT NULL,
+  `imgurlid` int(11) NOT NULL,
+  `urlid` int(11) NOT NULL,
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_newurls` (crawl queue: getoneurl.php hands out flag 0 rows and sets flag 1; newurlreptile.php sets flag 3 after a crawl report)
+--
+
+CREATE TABLE `lmve_newurls` (
+  `id` int(11) NOT NULL,
+  `domainid` int(11) NOT NULL,
+  `url` varchar(256) NOT NULL,
+  `score` int(4) NOT NULL DEFAULT '100',
+  `flag` int(1) DEFAULT '0',
+  `lastupdata` datetime NOT NULL DEFAULT '1998-09-12 00:00:00',
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_sents`
+--
+
+CREATE TABLE `lmve_sents` (
+  `id` int(11) NOT NULL,
+  `sent` varchar(256) NOT NULL,
+  `creatdata` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_sentslinkurls`
+--
+
+CREATE TABLE `lmve_sentslinkurls` (
+  `id` int(11) NOT NULL,
+  `sentid` int(11) NOT NULL,
+  `urlid` int(11) NOT NULL,
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_snapshot` (page HTML posted to newurlreptile.php; MEDIUMTEXT because a full page can exceed TEXT's 65,535-byte limit)
+--
+
+CREATE TABLE `lmve_snapshot` (
+  `id` int(11) NOT NULL,
+  `urlid` int(11) NOT NULL,
+  `url` varchar(256) NOT NULL,
+  `snapshot` mediumtext NOT NULL,
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_urls`
+--
+
+CREATE TABLE `lmve_urls` (
+  `id` int(8) NOT NULL COMMENT '序号',
+  `domainid` int(11) NOT NULL,
+  `url` varchar(256) NOT NULL COMMENT '链接',
+  `title` varchar(64) DEFAULT NULL COMMENT '标题',
+  `mate` varchar(512) DEFAULT NULL COMMENT '标记',
+  `score` int(4) NOT NULL DEFAULT '100' COMMENT '分值',
+  `laststatus` int(4) DEFAULT NULL,
+  `contenttype` varchar(32) DEFAULT NULL,
+  `lastupdata` datetime NOT NULL DEFAULT '1998-09-12 00:00:00' COMMENT '最后更新时间',
+  `creatdata` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP COMMENT '创建时间'
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_words`
+--
+
+CREATE TABLE `lmve_words` (
+  `id` int(11) NOT NULL,
+  `word` varchar(8) NOT NULL,
+  `creatdata` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+-- --------------------------------------------------------
+
+--
+-- Table structure for table `lmve_wordslinksents`
+--
+
+CREATE TABLE `lmve_wordslinksents` (
+  `id` int(11) NOT NULL,
+  `wordid` int(11) NOT NULL,
+  `sentid` int(11) NOT NULL,
+  `time` datetime NOT NULL DEFAULT CURRENT_TIMESTAMP
+) ENGINE=InnoDB DEFAULT CHARSET=utf8;
+
+--
+-- Indexes for dumped tables
+--
+
+--
+-- Indexes for table `lmve_domainlinksents`
+--
+ALTER TABLE `lmve_domainlinksents`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `domainid` (`domainid`),
+  ADD KEY `sentid` (`sentid`);
+
+--
+-- Indexes for table `lmve_domains`
+--
+ALTER TABLE `lmve_domains`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `domain` (`domain`),
+  ADD KEY `score` (`score`);
+
+--
+-- Indexes for table `lmve_imgs`
+--
+ALTER TABLE `lmve_imgs`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `imgurl` (`imgurl`(255));
+
+--
+-- Indexes for table `lmve_imgslinkurls`
+--
+ALTER TABLE `lmve_imgslinkurls`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `imgurlid` (`imgurlid`),
+  ADD KEY `urlid` (`urlid`);
+
+--
+-- Indexes for table `lmve_newurls`
+--
+ALTER TABLE `lmve_newurls`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `url` (`url`(255)),
+  ADD KEY `score` (`score`),
+  ADD KEY `flag` (`flag`),
+  ADD KEY `lastupdata` (`lastupdata`),
+  ADD KEY `domainid` (`domainid`);
+
+--
+-- Indexes for table `lmve_sents`
+--
+ALTER TABLE `lmve_sents`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `sent` (`sent`(255));
+
+--
+-- Indexes for table `lmve_sentslinkurls`
+--
+ALTER TABLE `lmve_sentslinkurls`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `sentid` (`sentid`),
+  ADD KEY `urlid` (`urlid`);
+
+--
+-- Indexes for table `lmve_snapshot`
+--
+ALTER TABLE `lmve_snapshot`
+  ADD PRIMARY KEY (`id`);
+
+--
+-- Indexes for table `lmve_urls`
+--
+ALTER TABLE `lmve_urls`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `url` (`url`(255)),
+  ADD KEY `lastupdata` (`lastupdata`),
+  ADD KEY `score` (`score`),
+  ADD KEY `domainid` (`domainid`);
+
+--
+-- Indexes for table `lmve_words`
+--
+ALTER TABLE `lmve_words`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `word` (`word`);
+
+--
+-- Indexes for table `lmve_wordslinksents`
+--
+ALTER TABLE `lmve_wordslinksents`
+  ADD PRIMARY KEY (`id`),
+  ADD KEY `wordid` (`wordid`),
+  ADD KEY `sentid` (`sentid`);
+
+--
+--
AUTO_INCREMENT for dumped tables
+--
+
+--
+-- AUTO_INCREMENT for table `lmve_domainlinksents`
+--
+ALTER TABLE `lmve_domainlinksents`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_domains`
+--
+ALTER TABLE `lmve_domains`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_imgs`
+--
+ALTER TABLE `lmve_imgs`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_imgslinkurls`
+--
+ALTER TABLE `lmve_imgslinkurls`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_newurls`
+--
+ALTER TABLE `lmve_newurls`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_sents`
+--
+ALTER TABLE `lmve_sents`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_sentslinkurls`
+--
+ALTER TABLE `lmve_sentslinkurls`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_snapshot`
+--
+ALTER TABLE `lmve_snapshot`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_urls`
+--
+ALTER TABLE `lmve_urls`
+  MODIFY `id` int(8) NOT NULL AUTO_INCREMENT COMMENT '序号';
+
+--
+-- AUTO_INCREMENT for table `lmve_words`
+--
+ALTER TABLE `lmve_words`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+
+--
+-- AUTO_INCREMENT for table `lmve_wordslinksents`
+--
+ALTER TABLE `lmve_wordslinksents`
+  MODIFY `id` int(11) NOT NULL AUTO_INCREMENT;
+COMMIT;
+
+/*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
+/*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
+/*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
diff --git a/php/newurlreptile.php b/php/newurlreptile.php
new file mode 100644
index 0000000..b561b9b
--- /dev/null
+++ b/php/newurlreptile.php
@@ -0,0 +1,236 @@
+conetdb(MYSQL_USERNAME,MYSQL_PASSWORD,MYSQL_DBNAME);
+
+$nowtime=date('Y-m-d H:i:s');
+// Look up the row matching $da in $tab, inserting it when absent; returns $da extended with "id" and "isnew".
+function lmve_get_dataid_at_tab($tab,$da)
+{
+    global $db1;
+    $re=$db1->read_data_on_tab($tab,$da);
+    if(empty($re))
+    {
+        $rt_id=$db1->addend_data_on_tab($tab,$da);
+        $da["id"]=$rt_id;
+        $da["isnew"]=true;
+    }else
+    {
+        $rt_id=$re[0]['id'];
+        $da["id"]=$rt_id;
+        $da["isnew"]=false;
+
+    }
+    return $da;
+}
+// Same lookup-or-insert for lmve_domains by domain name; also returns the domain "score" ("0" for a newly inserted domain).
+function lmve_get_domainscore_at_tab($domain)
+{
+    global $db1;
+    $da=array("domain"=>$domain);
+    $re=$db1->read_data_on_tab("lmve_domains",$da);
+    if(empty($re))
+    {
+        $rt_id=$db1->addend_data_on_tab("lmve_domains",$da);
+        $da["id"]=$rt_id;
+        $da["isnew"]=true;
+        $da["score"]="0";
+    }else
+    {
+        $rt_id=$re[0]['id'];
+        $da["id"]=$rt_id;
+        $da["isnew"]=false;
+        $da["score"]=$re[0]['score'];
+
+    }
+    return $da;
+}
+
+// Pre-process the input: escape every scalar POST field that is later handed to the DB layer (a missing field becomes '')
+foreach(array('url','title','mate','domain','status','contenttype') as $lmve_field)
+{
+    $_POST[$lmve_field]=addslashes(isset($_POST[$lmve_field])?$_POST[$lmve_field]:'');
+}
+echo "**远程消息**\n";
+echo "url:".$_POST['url']."\n";
+
+// Get (or create) the id of this url in the database
+$url_id=lmve_get_dataid_at_tab("lmve_urls",array("url"=>$_POST['url']));
+// Get (or create) the id and score of this domain in the database
+$url_domain=lmve_get_domainscore_at_tab($_POST['domain']);
+
+if($_POST['contenttype']=='text/html')// the page is an HTML page
+{
+    // Store the snapshot
+    $_POST['htmlx']=addslashes($_POST['htmlx']);
+    lmve_get_dataid_at_tab("lmve_snapshot",array("urlid"=>$url_id['id'],"url"=>$_POST['url'],"snapshot"=>$_POST['htmlx']));
+    // Also add the newly discovered links to the database
+
+    // Unpack the reference scores
+    $new_urls_score=json_decode($_POST['newurlscore']);
+    $new_urls_score_arr=array();
+    foreach($new_urls_score as $val)
+    {
+        array_push($new_urls_score_arr,$val);
+    }
+
+    // Unpack the domains
+    $new_urls_domain=json_decode($_POST['newurldomain']);
+    $new_urls_domain_arr=array();
+    foreach($new_urls_domain as $val)
+    {
+        array_push($new_urls_domain_arr,$val);
+    }
+
+    // Unpack the new urls
+    $new_urls=json_decode($_POST['newurls']);
+    $new_urls_int=0;
+    foreach($new_urls as $val)
+    {
+        $new_url=addslashes($val);
+        $nwe_domain=addslashes($new_urls_domain_arr[$new_urls_int]);
+        $nwe_domain_info=lmve_get_domainscore_at_tab($nwe_domain);
+        $score=intval($new_urls_score_arr[$new_urls_int])+intval($nwe_domain_info['score']);
+        // First check whether the url is already in the database
+        $new_url_db=$db1->read_data_on_tab("lmve_newurls",array("url"=>$new_url));
+        if(empty($new_url_db))
+        {
+            lmve_get_dataid_at_tab("lmve_newurls",array("domainid"=>$nwe_domain_info['id'],"url"=>$new_url,"score"=>$score));
+        }else
+        {
+            $db1->updata_on_tab("lmve_newurls",array("id"=>$new_url_db[0]['id']),array("score"=>$score));
+        }
+        $new_urls_int+=1;
+    }
+
+    $imgs=json_decode($_POST['images']);
+    // Record the images of the page
+    // Delete the old links first, then create the new ones
+    $db1->deldata_on_tab("lmve_imgslinkurls",array("urlid"=>$url_id['id']));
+    foreach($imgs as $val)
+    {
+        $img_url=addslashes($val);
+        $imgs_id=lmve_get_dataid_at_tab("lmve_imgs",array("imgurl"=>$img_url));
+        lmve_get_dataid_at_tab("lmve_imgslinkurls",array("imgurlid"=>$imgs_id['id'],"urlid"=>$url_id['id']));
+    }
+
+    $words=json_decode($_POST['words']);
+    $sents=json_decode($_POST['sents']);
+    $links=json_decode($_POST['links']);
+
+    // Unpack the words
+    $words_id_at_db=array();
+    $wordio_int=0;
+    foreach($words as $val)
+    {
+        $word=addslashes($val);
+        $wtemp=lmve_get_dataid_at_tab("lmve_words",array("word"=>$word));// get the id of the word in the database
+        array_push($words_id_at_db,$wtemp['id']);
+        $wordio_int+=1;
+    }
+    // Split each "wordIndex:sentIndex" link into two parallel arrays
+    $wordid=array();
+    $sentid=array();
+    foreach($links as $val)
+    {
+        $temp=explode(":",$val);
+        array_push($wordid,$temp[0]);
+        array_push($sentid,$temp[1]);
+    }
+
+
+    // The database ids of the words are already available in $words_id_at_db
+
+    // Process the sentences
+    // Delete the old links first, then create the new ones
+    $db1->deldata_on_tab("lmve_sentslinkurls",array("urlid"=>$url_id['id']));
+    $sents_int=0;
+    $repeat_int=0;
+    $newsents_int=0;
+
+    foreach($sents as $val)
+    {
+        // Get the sentence id from the database and check whether it is a repeat
+        $sent=addslashes($val);
+        $stemp=lmve_get_dataid_at_tab("lmve_sents",array("sent"=>$sent));
+        if($stemp['isnew'])// the sentence is new to the database
+        {
+            lmve_get_dataid_at_tab("lmve_domainlinksents",array("domainid"=>$url_domain['id'],"sentid"=>$stemp['id']));// link the sentence to the domain
+            $sentsid_int=0;
+            foreach($sentid as $val1)// link every word that belongs to this sentence
+            {
+                if($val1==$sents_int)
+                {
+                    $word_id=$words_id_at_db[$wordid[$sentsid_int]];
+                    lmve_get_dataid_at_tab("lmve_wordslinksents",array("wordid"=>$word_id,"sentid"=>$stemp['id']));// link the word to the sentence
+
+                }
+
+                $sentsid_int+=1;
+
+            }
+            $newsents_int+=1;
+        }else
+        {
+            // The sentence already exists in the database; check whether it is repeated under this same domain
+            $temp=lmve_get_dataid_at_tab("lmve_domainlinksents",array("domainid"=>$url_domain['id'],"sentid"=>$stemp['id']));
+            if($temp['isnew'])
+            {
+                // Not a repeat under this domain
+            }else
+            {
+                // A repeat under this domain
+                $repeat_int+=1;
+            }
+
+        }
+
+        // Link the sentence to the url
+
+        lmve_get_dataid_at_tab("lmve_sentslinkurls",array("sentid"=>$stemp['id'],"urlid"=>$url_id['id']));
+
+        $sents_int+=1;
+    }
+
+
+    // Compute the repeat rate and the url score; a page without sentences has a repeat rate of 0 (avoids division by zero)
+    $reppp=$sents_int>0?ceil(($repeat_int/$sents_int)*100):0;
+    $url_score=intval($_POST['score'])+100-$reppp+intval($url_domain['score']);
+
+
+
+    if($url_id['isnew'])
+    {
+        $db1->updata_on_tab("lmve_urls",array("id"=>$url_id['id']),array("domainid"=>$url_domain['id'],"title"=>$_POST['title'],"mate"=>$_POST['mate'],"score"=>$url_score,"laststatus"=>$_POST['status'],"contenttype"=>$_POST['contenttype'],"lastupdata"=>$nowtime));
+        echo "记录新url\n";
+
+    }else
+    {
+        $db1->updata_on_tab("lmve_urls",array("id"=>$url_id['id']),array("title"=>$_POST['title'],"mate"=>$_POST['mate'],"laststatus"=>$_POST['status'],"contenttype"=>$_POST['contenttype'],"lastupdata"=>$nowtime));
+        echo "不是新url,将不会更新评分\n";
+    }
+
+
+    echo "内容数量:".$newsents_int."\n";
+    echo "重复率:".$reppp."%\n";
+    echo "url得分:".$url_score."\n";
+    echo "单词io:".$wordio_int."\n";
+
+}else
+{
+    $db1->updata_on_tab("lmve_urls",array("id"=>$url_id['id']),array("domainid"=>$url_domain['id'],"title"=>$_POST['title'],"score"=>"0","laststatus"=>$_POST['status'],"contenttype"=>$_POST['contenttype'],"lastupdata"=>$nowtime));
+}
+
+$db1->updata_on_tab("lmve_newurls",array("url"=>$_POST['url']),array("flag"=>"3","lastupdata"=>$nowtime));
+
+echo "url码:".$_POST['status']."\n";
+
+
+$oktime=time();
+$runtime=$oktime-$starttime;
+echo "响应时:".$runtime."\n";
+?>
\ No newline at end of file
diff --git a/php/search.php b/php/search.php
new file mode 100644
index 0000000..5f378b9
--- /dev/null
+++ b/php/search.php
@@ -0,0 +1,31 @@
+conetdb(MYSQL_USERNAME,MYSQL_PASSWORD,MYSQL_DBNAME);
+
+$nowtime=date('Y-m-d H:i:s');
+//ini_set('memory_limit', '128M');
+require_once "./jieba/vendor/multi-array/MultiArray.php";
+require_once "./jieba/vendor/multi-array/Factory/MultiArrayFactory.php";
+require_once "./jieba/class/Jieba.php";
+require_once "./jieba/class/Finalseg.php";
+use Fukuball\Jieba\Jieba;
+Jieba::init();
+
+
+
+// Segment the "s" query parameter and dump the tokens; a missing parameter is treated as an empty query
+$seg_list = Jieba::cutForSearch(isset($_GET['s'])?$_GET['s']:''); # search-engine mode
+var_dump($seg_list);
+
+
+
+
+
+$oktime=time();
+$runtime=$oktime-$starttime;
+echo "响应时:".$runtime."\n";
+?>
\ No newline at end of file
diff --git a/reptile.py b/reptile.py
index 0716b88..93b3fea 100644
--- a/reptile.py
+++ b/reptile.py
@@ -12,7 +12,7 @@ import re
 import jieba
 import html
 import json
-
+from myfunsion import myfunsion
 #from bs4 import BeautifulSoup as bs
 #from lxml import etree
 
@@ -323,8 +323,52 @@ def getoneurl():
 
 
 
+test=1
+# test selects the run mode: 0 = endless crawl loop, 1 = crawl one fixed URL, 2 = crawl one queued URL, 3 = raw HTTP probe, 4 = myfunsion check
+if test==0:
+    while 1:
+        theurl=getoneurl()
+        if theurl!='-1':
+            getandpost(theurl)
+        else :
+            print("获取url失败,自动重试")
+
+if test==1:
+    theurl="https://lmve.net"
+    getandpost(theurl)
+
+if test==2:
+    theurl=getoneurl()
+    print(theurl)
+    getandpost(theurl)
+
+if test==3:
+    headers = {
+        'user-agent': 'my-app/0.0.1',
+        'Content-Type': 'text/html; charset=utf-8'
+    }
+
+    #print(begin_url)
+    # fetch the whole page
+    url1="https://git.lmve.net/kevin/um-all-index-web/-/avatar"
+    url2="https://yyyyyyounger.com/"
+    try :
+        r = requests.get(url2,headers=headers,verify=False,timeout=10)  # NOTE(review): verify=False skips TLS certificate checks
+        htmlx_byt=r.content
+        print(r.headers['Content-Type'])
+    except (requests.RequestException, KeyError) as err:  # network failure or missing Content-Type header
+        print("request failed:", err)
+if test==4:
+    #js=get_page_words("https://lmve.net")
+    #print(js)
+    myfunsion.a()
+
+
+
+
+
+
+
+
 
 
-theurl="https://lmve.net"
-redata=get_page_words(theurl)
-print(redata['words'])