采集网站 相关文件--YiichinaSpider.php [ 1.0 版本 ]
YII2网站采集
(1)QueueController.php
(2)ArticleController.php
(3)ArticleJob.php
(4)ArticleSpider.php
(5)YiichinaSpider.php
<?php
namespace console\models;
use Goutte\Client;
class YiichinaSpider extends ArticleSpider{
    /**
     * Pagination URLs collected by the most recent getPages() call.
     * @var string[]
     */
    private $_url = [];

    /**
     * Constructor: sets the name, base URL and category list pages of the site to crawl.
     */
    public function __construct(){
        $this->name = 'Yiichina';
        $this->baseUrl = 'http://www.yiichina.com';
        $this->category = [
            '教程'=>'http://www.yiichina.com/tutorial',
            '扩展'=>'http://www.yiichina.com/extension',
            '源码'=>'http://www.yiichina.com/code',
        ];
    }

    /**
     * Crawl entry point: for every category, collect its pagination URLs with
     * getPages() and pass each one to urls(), which enqueues the article URLs
     * found on that page.
     */
    public function process(){
        foreach($this->category as $category=>$url){
            // getPages() always returns an array, so an empty result simply skips the inner loop.
            foreach($this->getPages($url,$category) as $p){
                $this->urls($category,$p);
            }
        }
    }

    /**
     * Collect the pagination URLs linked from a category list page.
     *
     * @param string $pageUrl  URL of the category list page
     * @param string $category category label, used only for logging
     * @return array unique pagination URLs (keys of the first occurrences are preserved)
     */
    private function getPages($pageUrl,$category){
        // Reset on every call: without this the list was null when no pagination link
        // matched (array_unique(null) fails), and URLs collected for a previous category
        // leaked into the next one and were crawled again under the wrong category.
        $this->_url = [];

        $client = new Client();
        $crawler = $client->request('GET', $pageUrl);
        // Pagination links
        $crawler->filter('.media-list .pagination li a')->each(function ($node) use($pageUrl,$category) {
            try{
                $href = trim((string)$node->attr('href'));
                // Skip anchors without an href; they would only yield the bare base URL.
                if($href === ''){
                    return;
                }
                $this->_url[] = $this->baseUrl.$href;
            }catch(\Exception $e){
                $this->addLog($pageUrl,$category,false,$e->getMessage());
            }
        });
        return array_unique($this->_url);
    }

    /**
     * Enqueue the URL of every article listed on one list page, unless
     * isGathered() reports the URL as already gathered.
     *
     * @param string $category category label passed on to the queue and the log
     * @param string $url      URL of the list page
     */
    private function urls($category,$url){
        $client = new Client();
        $crawler = $client->request('GET', $url);
        $crawler->filter('.media-list .media')->each(function ($node) use($category,$url) {
            try{
                // attr() throws when the heading link is missing; that case is logged below.
                $href = trim((string)$node->filter('.media-body .media-heading a')->attr('href'));
                if($href === ''){
                    return;
                }
                $u = $this->baseUrl.$href;
                if(!$this->isGathered($u)){
                    $this->enqueue($category,$u,'yiichina');
                }
            }catch(\Exception $e){
                $this->addLog($url,$category,false, $e->getMessage());
            }
        });
    }

    /**
     * Fetch an article page and extract its title, content HTML and publish time.
     *
     * @param string $url      article URL
     * @param string $category category label, used only for logging
     * @return string JSON object with the keys "title", "content" and "time",
     *                or an empty string when extraction fails (the failure is logged)
     */
    public function getContent($url,$category){
        $client = new Client();
        $crawler = $client->request('GET', $url);
        $node = $crawler->filter('.col-lg-9')->eq(0);
        try{
            // text()/html() throw when the selection is empty, so a page without the
            // expected markup ends up in the catch branch and returns ''.
            $title = trim($node->filter('.page-header h1')->text());
            $content = $node->html();
            $time = $node->filter('.action .time')->text();
            return json_encode(['title'=>$title,'content'=>$content,'time'=>$time]);
        }catch(\Exception $e){
            $this->addLog($url,$category,false,$e->getMessage());
        }
        return '';
    }
}
specialnot
注册时间:2015-08-06
最后登录:2019-08-16
在线时长:27小时54分
最后登录:2019-08-16
在线时长:27小时54分
- 粉丝43
- 金钱1175
- 威望200
- 积分3445
共 1 条评论
Goutte\Client
这个不应该介绍下吗?
那个插件基本上就是把js选择器的功能用php实现了,只要熟悉js,这个就挺好上手的。具体在github中有详细介绍。 https://github.com/FriendsOfPHP/Goutte