纯php采集周松松最新文章
纯PHP采集周松松最新文章与邮件订阅发送,输出JSON,代码如下:
<?php
/**
 * Scrapes the home page of https://zhousongsong.com/ and prints one JSON document with:
 * site name and description, the latest-article list (each article page is fetched for
 * its body), details of the newest article, site statistics, the latest comments, and
 * the record number / webmaster QQ found in the page.
 *
 * All selectors (#con_one_1, dd.con, #divStatistics, #divComments, span.commentViewNums)
 * mirror the site's current templates and must be updated when those templates change.
 *
 * On failure a JSON object {"code": 500, "msg": "..."} is printed instead of the result.
 */

/**
 * Performs an HTTP request with cURL and returns the response body.
 *
 * @param string            $url       URL to request.
 * @param array             $headers   Either a list of "Name: value" strings or a
 *                                     name => value map; maps are converted to header lines.
 * @param array|string|null $postData  Request body, only sent when $method is POST.
 * @param string            $method    "POST" (case-insensitive) sends a POST; any other
 *                                     value sends a GET.
 * @param bool              $verifyTls Verify the peer certificate and host name. Pass
 *                                     false only for hosts with a known-broken chain.
 *
 * @return string Response body (already decompressed by cURL).
 *
 * @throws Exception When cURL cannot be initialised or the transfer fails.
 */
function fetchurl(string $url, array $headers, $postData = null, string $method = 'GET', bool $verifyTls = true): string
{
    // CURLOPT_HTTPHEADER wants "Name: value" strings; with a name => value map
    // only the values would be sent, so build the header lines explicitly.
    $headerLines = [];
    foreach ($headers as $name => $value) {
        $headerLines[] = is_string($name) ? $name . ': ' . $value : (string) $value;
    }

    $ch = curl_init();
    if ($ch === false) {
        throw new Exception('cURL Error: unable to initialise a cURL handle');
    }

    curl_setopt_array($ch, [
        CURLOPT_URL            => $url,
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_FOLLOWLOCATION => true,
        CURLOPT_MAXREDIRS      => 5,
        CURLOPT_CONNECTTIMEOUT => 10,
        CURLOPT_TIMEOUT        => 30,
        // Empty string: advertise every encoding cURL supports and decode the body.
        CURLOPT_ENCODING       => '',
        CURLOPT_SSL_VERIFYPEER => $verifyTls,
        CURLOPT_SSL_VERIFYHOST => $verifyTls ? 2 : 0,
        CURLOPT_HTTPHEADER     => $headerLines,
    ]);

    if (strtoupper($method) === 'POST') {
        curl_setopt($ch, CURLOPT_POST, true);
        curl_setopt($ch, CURLOPT_POSTFIELDS, $postData === null ? '' : $postData);
    } else {
        curl_setopt($ch, CURLOPT_HTTPGET, true);
    }

    $body = curl_exec($ch);
    if ($body === false || curl_errno($ch) !== 0) {
        $errorMessage = curl_error($ch);
        curl_close($ch);
        throw new Exception('cURL Error: ' . $errorMessage);
    }
    curl_close($ch);

    return (string) $body;
}

/**
 * Parses an HTML string into a DOMDocument, treating the input as UTF-8.
 *
 * Parser warnings caused by real-world markup are collected and discarded through
 * libxml's internal error buffer rather than silenced with the @ operator.
 *
 * @param string $html HTML source; blank input yields a document with an empty body.
 *
 * @return DOMDocument
 */
function loadHtmlDocument(string $html): DOMDocument
{
    if (trim($html) === '') {
        // loadHTML() rejects empty input, so parse a minimal page instead.
        $html = '<html><body></body></html>';
    }

    $doc = new DOMDocument();
    $previous = libxml_use_internal_errors(true);
    // The leading XML declaration tells libxml that the bytes are UTF-8.
    $doc->loadHTML('<?xml encoding="UTF-8">' . $html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return $doc;
}

/**
 * Extracts the article body from an article page.
 *
 * Collects the <p> elements inside the first <dd class="con">, drops their inline
 * style attribute, optionally strips tags, cuts everything from the "来源:" marker
 * onwards and replaces links by their inner content.
 *
 * @param DOMDocument $articleDoc  Parsed article page (its <p> nodes lose their style attribute).
 * @param string|null $allowedTags Tag whitelist for strip_tags(); null keeps all tags.
 *
 * @return string Body HTML, or '' when the container is not found.
 */
function extractArticleBody(DOMDocument $articleDoc, ?string $allowedTags = null): string
{
    $containers = (new DOMXPath($articleDoc))->query('//dd[@class="con"]');
    if ($containers === false || $containers->length === 0) {
        return '';
    }

    $html = '';
    foreach ($containers->item(0)->getElementsByTagName('p') as $paragraph) {
        $paragraph->removeAttribute('style');
        $html .= $articleDoc->saveHTML($paragraph);
    }

    if ($allowedTags !== null) {
        $html = strip_tags($html, $allowedTags);
    }

    $sourceIndex = strpos($html, '来源:');
    if ($sourceIndex !== false) {
        $html = substr($html, 0, $sourceIndex);
    }

    // Unwrap links but keep their inner content.
    $unwrapped = preg_replace('/<a\b[^>]*>(.*?)<\/a>/s', '$1', $html);

    return $unwrapped === null ? $html : $unwrapped;
}

try {
    if (!extension_loaded('curl')) {
        throw new RuntimeException('cURL扩展未加载,请检查PHP配置文件。');
    }
    if (!extension_loaded('dom')) {
        throw new RuntimeException('DOMDocument扩展未加载,请检查PHP配置文件。');
    }

    $url = 'https://zhousongsong.com/';

    // Random IP, sent in the forwarding headers below.
    $ip = rand(0, 255) . '.' . rand(0, 255) . '.' . rand(0, 255) . '.' . rand(0, 255);

    $uaagent = [
        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15) AppleWebKit/605.1.15 (KHTML, like Gecko)",
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64; +http://url-classification.io/wiki/index.php?title=URL_server_crawler) KStandBot/1.0",
        "Mozilla/5.0 (iPhone; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPod; U; CPU iPhone OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (iPad; U; CPU OS 4_3_3 like Mac OS X; en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
        "Mozilla/5.0 (Linux; U; Android 2.3.7; en-us; Nexus One Build/FRF91) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "MQQBrowser/26 Mozilla/5.0 (Linux; U; Android 2.3.7; zh-cn; MB200 Build/GRJ22; CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1",
        "Opera/9.80 (Android 2.3.4; Linux; Opera Mobi/build-1107180945; U; en-GB) Presto/2.8.149 Version/11.10",
        "Mozilla/5.0 (Linux; U; Android 3.0; en-us; Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
        "Mozilla/5.0 (BlackBerry; U; BlackBerry 9800; en) AppleWebKit/534.1+ (KHTML, like Gecko) Version/6.0.0.337 Mobile Safari/534.1+",
        "Mozilla/5.0 (hp-tablet; Linux; hpwOS/3.0.0; U; en-US) AppleWebKit/534.6 (KHTML, like Gecko) wOSBrowser/233.70 Safari/534.6 TouchPad/1.0",
        "NOKIA5700/ UCWEB7.0.2.37/28/999",
        "Openwave/ UCWEB7.0.2.37/28/999",
        "Mozilla/4.0 (compatible; MSIE 6.0; ) Opera/UCWEB7.0.2.37/28/999",
        "Mozilla/5.0 (Linux; Android 6.0; 1503-M02 Build/MRA58K) AppleWebKit/537.36 (KHTML, like Gecko) Version/4.0 Chrome/37.0.0.0 Mobile MQQBrowser/6.2 TBS/036558 Safari/537.36 MicroMessenger/6.3.25.861 NetType/WIFI Language/zh_CN",
    ];
    $randomKey = array_rand($uaagent);
    $randomUserAgent = $uaagent[$randomKey];

    // Compression is negotiated by fetchurl() via CURLOPT_ENCODING, so no manual
    // Accept-Encoding header is sent here.
    $headers = [
        'X-FORWARDED-FOR' => $ip,
        'CLIENT-IP'       => $ip,
        'Referer'         => 'https://www.qq.com/',
        'User-Agent'      => $randomUserAgent,
    ];

    $result = fetchurl($url, $headers);

    // Site name: the part of <title> before the first "-", or the whole title if there is none.
    $webname = '';
    if (preg_match('/<title>(.*?)<\/title>/is', $result, $m)) {
        $pageTitle = trim($m[1]);
        $dashPos = strpos($pageTitle, '-');
        $webname = $dashPos === false ? $pageTitle : trim(substr($pageTitle, 0, $dashPos));
    }

    $description = '';
    if (preg_match('/name="description" content="(.*?)"/', $result, $m)) {
        $description = $m[1];
    }

    $allinfo = [];        // article list
    $webinfo = [];        // site statistics
    $comment = [];        // latest comments
    $newarticleinfo = []; // newest article details

    $homeDoc = loadHtmlDocument($result);

    // Latest-article list; the id follows the home page template.
    $listHtml = '';
    $listContainer = $homeDoc->getElementById('con_one_1');
    if ($listContainer !== null) {
        foreach ($listContainer->childNodes as $child) {
            $listHtml .= $homeDoc->saveHTML($child);
        }
    }
    // Removing the wrapper tags leaves <h2> (title), <h6> (meta line) and <p> (summary)
    // as siblings, which the loop below relies on.
    $listHtml = strip_tags($listHtml, '<h1><h2><h3><h5><h6><br><p><a>');
    $listDoc = loadHtmlDocument($listHtml);
    $h2Nodes = (new DOMXPath($listDoc))->query('//h2');

    $articlePages = []; // link => raw article HTML, so no article is downloaded twice

    if ($h2Nodes !== false) {
        foreach ($h2Nodes as $h2) {
            $anchor = $h2->getElementsByTagName('a')->item(0);
            $link = $anchor === null ? '' : $anchor->getAttribute('href');
            if ($link === '') {
                continue; // a heading without a link is not an article entry
            }

            // Walk the siblings up to the next <h2>: the first <h6> is the meta line,
            // the first non-empty <p> after it is the summary.
            $metaNode = null;
            $summary = '';
            for ($node = $h2->nextSibling; $node !== null && $node->nodeName !== 'h2'; $node = $node->nextSibling) {
                if ($metaNode === null) {
                    if ($node->nodeName === 'h6') {
                        $metaNode = $node;
                    }
                    continue;
                }
                if ($node->nodeName === 'p') {
                    $paragraphText = trim($node->textContent);
                    if ($paragraphText !== '') {
                        $summary = $paragraphText;
                        break;
                    }
                }
            }

            $date = '';
            $views = 0;
            $commentCount = 0;
            if ($metaNode !== null) {
                $metaText = $metaNode->textContent;
                if (preg_match('/\d{4}年\d{2}月\d{2}日/', $metaText, $dateMatch)) {
                    $date = $dateMatch[0];
                }
                if (preg_match("/浏览:(\d+)/", $metaText, $viewMatch)) {
                    $views = (int) $viewMatch[1];
                }
                if (preg_match("/评论:(\d+)/", $metaText, $commentMatch)) {
                    $commentCount = (int) $commentMatch[1];
                }
            }

            if (!isset($articlePages[$link])) {
                $articlePages[$link] = fetchurl($link, $headers);
            }
            // The dd.con container follows the article template.
            $articleBody = extractArticleBody(
                loadHtmlDocument($articlePages[$link]),
                '<h1><h2><h3><h5><h6><br><p><a><img>'
            );

            $allinfo[] = [
                'title'     => trim($h2->textContent),
                'link'      => $link,
                'date'      => $date,
                'smalltext' => $summary,
                'onclick'   => $views,
                'newstext'  => $articleBody, // json_encode() does the escaping
                'plnum'     => $commentCount,
            ];
        }
    }

    // Newest article: the first entry of the list.
    if ($allinfo !== []) {
        $firstH2Content = $allinfo[0]['title'];
        $linkHref = $allinfo[0]['link'];

        // Mail subject and HTML body. NOTE(review): nothing in this script reads $title
        // or $text; presumably the mail-sending code that consumes them lives elsewhere.
        $safeTitle = htmlspecialchars($firstH2Content, ENT_QUOTES, 'UTF-8');
        $safeLink = htmlspecialchars($linkHref, ENT_QUOTES, 'UTF-8');
        $title = "有来自" . $webname . "的最新文章";
        $text = "" . $webname . "最新文章标题为《" . $safeTitle . "》,地址:<a href='" . $safeLink . "' target='_blank'>" . $safeLink . "</a>";

        $resultt = isset($articlePages[$linkHref]) ? $articlePages[$linkHref] : fetchurl($linkHref, $headers);
        $articleDoc = loadHtmlDocument($resultt);

        $viewNum = '';
        if (preg_match('/<span class="commentViewNums">(.*?)<\/span>/', $resultt, $viewNumMatch)) {
            $viewNum = $viewNumMatch[1];
        }

        $datetime = '';
        $h6Tag = $articleDoc->getElementsByTagName('h6')->item(0);
        if ($h6Tag !== null && preg_match('/\d{4}年\d{2}月\d{2}日 \d{2}:\d{2}/', trim($h6Tag->textContent), $matches)) {
            $datetime = $matches[0];
        }

        $newarticleinfo[] = [
            'title'    => $firstH2Content,
            'link'     => $linkHref,
            'date'     => $datetime,
            'onclick'  => str_replace('浏览量: ', '', $viewNum),
            'newstext' => extractArticleBody($articleDoc),
        ];
    }

    // Site statistics; the id follows the home page template.
    $totals = ['totalarticle' => '', 'totalplnum' => '', 'totalonclick' => ''];
    $statPatterns = [
        'totalarticle' => '/文章总数:\s*(\d+)/',
        'totalplnum'   => '/评论总数:\s*(\d+)/',
        'totalonclick' => '/浏览总数:\s*(\d+)/',
    ];
    $divStatistics = $homeDoc->getElementById('divStatistics');
    $statList = $divStatistics === null ? null : $divStatistics->getElementsByTagName('ul')->item(0);
    if ($statList !== null) {
        foreach ($statList->getElementsByTagName('li') as $li) {
            $itemText = trim($li->textContent);
            foreach ($statPatterns as $key => $pattern) {
                if (preg_match($pattern, $itemText, $statMatch)) {
                    $totals[$key] = $statMatch[1];
                }
            }
        }
    }

    // Latest comments; the id follows the home page template.
    $divComments = $homeDoc->getElementById('divComments');
    $commentList = $divComments === null ? null : $divComments->getElementsByTagName('ul')->item(0);
    if ($commentList !== null) {
        foreach ($commentList->getElementsByTagName('li') as $li) {
            $comment[] = trim($li->textContent);
        }
    }

    // Record number and webmaster QQ: text from the label up to the closing </a>.
    $beianhao = '';
    if (preg_match('/BA号:.*?<\/a>/i', $result, $match)) {
        $beianhao = str_replace('BA号:', '', trim(strip_tags($match[0])));
    }

    $QQhao = '';
    if (preg_match('/站长QQ:.*?<\/a>/i', $result, $matchh)) {
        // Strip the markup first; casting a string that starts with "<a ...>" yields 0.
        $QQhao = trim(str_replace('站长QQ:', '', trim(strip_tags($matchh[0]))));
    }

    $webinfo[] = [
        'totalarticle' => $totals['totalarticle'],
        'totalplnum'   => $totals['totalplnum'],
        'beianhao'     => $beianhao,
        'qq'           => (int) $QQhao,
        'totalonclick' => $totals['totalonclick'],
    ];

    $content = [
        'webname'        => $webname,
        'newarticleinfo' => $newarticleinfo,
        'allinfo'        => $allinfo,
        'webinfo'        => $webinfo,
        'comment'        => $comment,
        // $emailmsg is not assigned anywhere in this script; emit null unless other
        // code has already defined it.
        'emailmsg'       => isset($emailmsg) ? $emailmsg : null,
        'description'    => $description,
        'code'           => 200,
        'msg'            => '获取成功',
    ];

    // JSON_UNESCAPED_SLASHES keeps URLs readable. Stripping backslashes from the encoded
    // string would also remove the \" and \n escapes and produce invalid JSON.
    $Json = json_encode($content, JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_UNESCAPED_SLASHES);
    if ($Json === false) {
        throw new RuntimeException('JSON encode error: ' . json_last_error_msg());
    }

    if (PHP_SAPI !== 'cli' && !headers_sent()) {
        header('Content-Type: application/json; charset=utf-8');
    }
    echo $Json;
} catch (Exception $e) {
    if (PHP_SAPI !== 'cli' && !headers_sent()) {
        http_response_code(500);
        header('Content-Type: application/json; charset=utf-8');
    }
    echo json_encode(
        ['code' => 500, 'msg' => $e->getMessage()],
        JSON_PRETTY_PRINT | JSON_UNESCAPED_UNICODE | JSON_PARTIAL_OUTPUT_ON_ERROR
    );
}