8wDlpd.png
8wDFp9.png
8wDEOx.png
8wDMfH.png
8wDKte.png

PHP解析HTML,DomCrawler使用案例

admin 10月前 138

<?php
define('LARAVEL_START', microtime(true));
require __DIR__.'/../vendor/autoload.php';

use GuzzleHttp\Client;
use Symfony\Component\DomCrawler\Crawler;

print_r(json_encode(Spider(), JSON_UNESCAPED_UNICODE));

function Spider()
{
    //需要爬取的页面
    $url = 'https://movie.douban.com/subject/25812712/?from=showing';
    echo "ok";
    //下载网页内容
    $client   = new Client([
        'timeout' => 60,
        'verify' => false,
        'headers' => ['User-Agent' => 'Mozilla/5.0 (compatible; Baiduspider-render/2.0; +http://www.baidu.com/search/spider.html)',
        ],
    ]);
    echo "ok";
    $response = $client->request('GET', $url)->getBody()->getContents();
    echo "ok";
    //进行XPath页面数据抽取
    $data    = []; //结构化数据存本数组
    $crawler = new Crawler();
    $crawler->addHtmlContent($response);

    try {
        //电影名称
        //网页结构中用css选择器用id的比较容易写xpath表达式
        $data['name'] = $crawler->filterXPath('//*[@id="content"]/h1/span[1]')->text();
        //电影海报
        $data['cover'] = $crawler->filterXPath('//*[@id="mainpic"]/a/img/@src')->text();
        //导演
        $data['director'] = $crawler->filterXPath('//*[@id="info"]/span[1]/span[2]')->text();
        //多个导演处理成数组
        $data['director'] = explode('/', $data['director']);
        //过滤前后空格
        $data['director'] = array_map('trim', $data['director']);

        //编剧
        $data['cover'] = $crawler->filterXPath('//*[@id="info"]/span[2]/span[2]/a')->text();
        //主演
        $data['mactor'] = $crawler->filterXPath('//*[@id="info"]/span[contains(@class,"actor")]/span[contains(@class,"attrs")]')->text();
        //多个主演处理成数组
        $data['mactor'] = explode('/', $data['mactor']);
        //过滤前后空格
        $data['mactor'] = array_map('trim', $data['mactor']);

        //上映日期
        $data['rdate'] = $crawler->filterXPath('//*[@id="info"]')->text();
        //使用正则进行抽取
        preg_match_all("/(\d{4})-(\d{2})-(\d{2})\(.*?\)/", $data['rdate'], $rdate); //2017-07-07(中国大陆) / 2017-06-14(安锡动画电影节) / 2017-06-30(美国)
        $data['rdate'] = $rdate[0];
        //简介
        //演示使用class选择器的方式
        $data['introduction'] = trim($crawler->filterXPath('//div[contains(@class,"indent")]/span')->text());

        //演员
        //本xpath表达式会得到多个对象结果,用each方法进行遍历
        //each是传入的参数是一个闭包,在闭包中使用外部的变量使用use方法,并使用变量指针
        $crawler->filterXPath('//ul[contains(@class,"celebrities-list from-subject")]/li')->each(function (Crawler $node, $i) use (&$data) {
            $actor['name']   = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"name")]/a')->text(); //名字
            $actor['role']   = $node->filterXPath('//div[contains(@class,"info")]/span[contains(@class,"role")]')->text(); //角色
            $actor['avatar'] = $node->filterXPath('//a/div[contains(@class,"avatar")]/@style')->text(); //头像
            //background-image: url(https://img3.doubanio.com/img/celebrity/medium/5253.jpg) 正则抽取头像图片
            preg_match_all("/((https|http|ftp|rtsp|mms)?:\/\/)[^\s]+\.(jpg|jpeg|gif|png)/", $actor['avatar'], $avatar);
            $actor['avatar'] = $avatar[0][0];
            //print_r($actor);
            $data['actor'][] = $actor;
        });

    } catch (\Exception $e) {

    }

    return $data;

}

?>

 

最新回复 (0)
    • 肚兜网
      2
        登录 注册 QQ登录暂未开通
返回
发新帖