在上一篇我們只處理了文章列表,並沒有處理分頁問題,所以這次我們把分頁的程式碼加上去
首先,我們用下面這段程式碼解析出『最舊、上頁、下頁、最新』的連結,並據此調整 fetch(調整前記得先執行一次測試,確認目前的測試是通過的)
<?php
namespace Recca0120\Ithome30\Crawlers;
use GuzzleHttp\Psr7\Request;
use Psr\Http\Client\ClientInterface;
/**
 * Crawls the article list of a PTT board.
 *
 * Starting at the board URL, every page is downloaded and parsed with regular
 * expressions, then the "上頁" (previous page) link is followed until a page
 * no longer offers one.
 */
class Board
{
    /** Prefix prepended to the site-relative links found in the HTML. */
    private const BASE_URL = 'https://www.ptt.cc';

    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetches the board page by page and returns all rows as one flat list.
     *
     * @param array $board Board descriptor; the keys `url`, `name` and `class` are read.
     * @return array List of rows as built by parseCols(), in the order the pages were fetched.
     */
    public function fetch(array $board)
    {
        $url = $board['url'];
        $pages = [];
        do {
            $html = $this->sendRequest($url);
            $pages[] = array_map(
                fn (string $row) => $this->parseCols($row, $board),
                $this->parseRows($html)
            );
            // A page without paging buttons (e.g. an error page) yields no 'prev'
            // entry; treat that as "no older page" instead of reading a missing key.
            $url = $this->parsePagination($html)['prev'] ?? null;
        } while ($url !== null);

        // Merge once at the end rather than re-copying the growing list on every page.
        return array_merge(...$pages);
    }

    /**
     * Sends a GET request with browser-like headers and returns the response body.
     *
     * The `over18=1` cookie is always sent, and the Referer header is fixed to the
     * Gossiping index regardless of $url.
     *
     * @param mixed $url Address handed to the PSR-7 Request as-is.
     */
    private function sendRequest($url): string
    {
        $request = new Request('GET', $url, [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Accept-Language' => 'zh-TW,zh;q=0.8',
            'Cache-Control' => 'max-age=0',
            'Cookie' => 'over18=1',
            'Referer' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            'Sec-Ch-Ua' => '"Brave";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile' => '?0',
            'Sec-Ch-Ua-Platform' => '"macOS"',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'same-origin',
            'Sec-Fetch-User' => '?1',
            'Sec-Gpc' => '1',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        ]);

        return (string) $this->httpClient->sendRequest($request)->getBody();
    }

    /**
     * Extracts the paging buttons (最舊 / 上頁 / 下頁 / 最新) from a board page.
     *
     * @return array Map with the keys `oldest`, `prev`, `next`, `latest` for every button
     *               found; the value is an absolute URL, or null when the button has no href.
     */
    private function parsePagination(string $html): array
    {
        // [^"]+ and [^<]* keep each match inside a single <a> element, so the result
        // does not depend on every button sitting on its own line.
        preg_match_all(
            '/<a class="btn wide( disabled)?"( href="(?<href>[^"]+)")?>[^<]*(?<name>(最舊|上頁|下頁|最新))[^<]*?<\/a>/',
            $html,
            $matches
        );
        $lookup = ['最舊' => 'oldest', '上頁' => 'prev', '下頁' => 'next', '最新' => 'latest'];
        $pagination = [];
        foreach ($matches['name'] as $index => $name) {
            // preg_match_all reports an optional group that did not take part as ''.
            $href = $matches['href'][$index];
            $pagination[$lookup[$name]] = $href !== '' ? self::BASE_URL . $href : null;
        }

        return $pagination;
    }

    /**
     * Turns the HTML of one list row into an associative array.
     *
     * @param string $row   HTML fragment of a single row, as returned by parseRows().
     * @param array  $board Board descriptor; `name` and `class` are copied into the row.
     * @return array Keys: board_name, board_class, url, type, title, plus whichever of
     *               nrec/author/date the row contains. `url` is null when the title cell
     *               has no link; `type` is '' and `title` is the whole cell text when the
     *               title has no "[category]" prefix.
     */
    private function parseCols(string $row, array $board): array
    {
        preg_match_all('/<div class="(?<name>(nrec|title|author|date))"[^>]*>(?<value>.*?)<\/div>/s', $row, $matches);
        $cols = [
            'board_name' => $board['name'],
            'board_class' => $board['class'],
        ];
        foreach ($matches['name'] as $index => $name) {
            $cols[$name] = trim($matches['value'][$index]);
        }
        $cols['nrec'] = strip_tags($cols['nrec'] ?? '');

        $titleHtml = $cols['title'] ?? '';
        // Title cells without an <a> (e.g. deleted articles) simply have no URL.
        $cols['url'] = preg_match('/href="([^"]*)"/', $titleHtml, $matched) === 1
            ? self::BASE_URL . $matched[1]
            : null;

        $titleText = trim(strip_tags($titleHtml));
        // [^\]]+ stops at the first closing bracket, so brackets later in the
        // title stay part of the title instead of being swallowed by the category.
        if (preg_match('/\[([^\]]+)\](.+)/', $titleText, $matched) === 1) {
            $cols['type'] = trim($matched[1]);
            $cols['title'] = trim($matched[2]);
        } else {
            $cols['type'] = '';
            $cols['title'] = $titleText;
        }

        return $cols;
    }

    /**
     * Splits a board page into the raw HTML fragments of its rows.
     *
     * Each fragment starts at `class="r-ent">`; the U modifier makes the quantifiers
     * lazy so a match ends at the first closing tag that satisfies the pattern.
     *
     * @return array List of HTML fragments, one per row.
     */
    private function parseRows(string $html): array
    {
        preg_match_all('/class="r-ent">.+<div class="mark">(.+)<\/div>/sU', $html, $matches);

        return $matches[0];
    }
}
程式碼調整完後再執行測試,會等待非常久的時間,因為這次真的會一頁一頁去抓取所有分頁。這時候該怎麼辦?我們可以調整一下 fetch 的程式碼,增加一個 $take 變數,來限制抓取的頁數
/**
 * Fetches the board page by page and returns all rows as one flat list.
 *
 * @param array    $board Board descriptor; `url` is read here and the whole array is passed to parseCols().
 * @param int|null $take  Maximum number of pages to fetch; null follows the "prev" links
 *                        until none is left. The first page is always fetched, even for values below 1.
 * @return array Rows of all fetched pages, merged in fetch order.
 */
public function fetch(array $board, ?int $take = null)
{
    $url = $board['url'];
    $results = [];
    // Number of pages fetched so far.
    $page = 0;
    do {
        $page++;
        $html = $this->sendRequest($url);
        $rows = array_map(
            fn (string $row) => $this->parseCols($row, $board),
            $this->parseRows($html)
        );
        $results = array_merge($results, $rows);
        // Stop as soon as the requested number of pages has been collected.
        if ($take !== null && $page >= $take) {
            break;
        }
        // A page without paging buttons yields no 'prev' entry; treat that as
        // "no older page" instead of reading a missing array key.
        $url = $this->parsePagination($html)['prev'] ?? null;
    } while ($url !== null);

    return $results;
}
這時我們再調整一下測試案例,指定 take 為 2,然後我們再驗證一下抓取的總筆數即可
<?php
namespace Recca0120\Ithome30\Tests\Crawlers;
use Mockery;
use GuzzleHttp\Client;
use PHPUnit\Framework\TestCase;
use Recca0120\Ithome30\Crawlers\Board;
/**
 * Exercises Board::fetch() against the HTTP traffic stored in the
 * `ptt_gossiping.yaml` VCR cassette.
 */
class BoardTest extends TestCase
{
    /**
     * Fetching with a page limit of 2 returns the rows of both pages as one flat list.
     */
    public function test_fetch_board_articles_list()
    {
        \VCR\VCR::turnOn();
        \VCR\VCR::insertCassette('ptt_gossiping.yaml');

        try {
            /** @var Mockery\Mock|\Psr\Http\Client\ClientInterface $httpClient */
            $httpClient = Mockery::spy(new Client());
            $crawler = new Board($httpClient);

            $records = $crawler->fetch([
                'name' => 'Gossiping',
                "nuser" => '8803',
                'class' => '綜合',
                'title' => '[八卦] 亞運李智凱、許皓鋐奪金!',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/index.html'
            ], 2);

            // The two fetched pages hold 43 rows in total.
            self::assertCount(43, $records);
            // assertEquals compares associative arrays without regard to key order.
            self::assertEquals([
                'board_name' => 'Gossiping',
                'board_class' => '綜合',
                'nrec' => '4',
                'type' => '問卦',
                'title' => '司機夫人真的有去卡地亞血拚$1.1M嗎?',
                'author' => 'uwmtsa',
                'date' => '10/06',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/M.1696537444.A.1A5.html',
            ], $records[0]);
        } finally {
            // Release the cassette even when an assertion fails, so the recorder
            // is not left switched on for whatever test runs next.
            \VCR\VCR::eject();
            \VCR\VCR::turnOff();
        }
    }
}
這樣我們就完成了抓取分頁的功能了,但這邊我們可以再思考一下,fetch 回傳資料的格式是不是要再做調整?
// Example of the page-grouped shape: the outer list holds one entry per page,
// and each page is a list of row arrays.
$records = [
    [
        // page 1
        [
            'board_name' => 'Gossiping',
            'board_class' => '綜合',
            'nrec' => '4',
            'type' => '問卦',
            'title' => '司機夫人真的有去卡地亞血拚$1.1M嗎?',
            'author' => 'uwmtsa',
            'date' => '10/06',
            'url' => 'https://www.ptt.cc/bbs/Gossiping/M.1696537444.A.1A5.html',
        ],
        // ....
    ],
    [
        // page 2
        [
            'board_name' => 'Gossiping',
            'board_class' => '綜合',
            'nrec' => '4',
            'type' => '問卦',
            'title' => '司機夫人真的有去卡地亞血拚$1.1M嗎?',
            'author' => 'uwmtsa',
            'date' => '10/06',
            'url' => 'https://www.ptt.cc/bbs/Gossiping/M.1696537444.A.1A5.html',
        ],
        // ....
    ]
];
改成二維陣列,以一頁一頁的方式回傳會比較清楚呢,所以我們再調整一下程式碼
<?php
namespace Recca0120\Ithome30\Crawlers;
use GuzzleHttp\Psr7\Request;
use Psr\Http\Client\ClientInterface;
/**
 * Crawls the article list of a PTT board, one result list per page.
 *
 * Starting at the board URL, every page is downloaded and parsed with regular
 * expressions, then the "上頁" (previous page) link is followed until a page
 * no longer offers one or the requested number of pages has been reached.
 */
class Board
{
    /** Prefix prepended to the site-relative links found in the HTML. */
    private const BASE_URL = 'https://www.ptt.cc';

    public function __construct(private ClientInterface $httpClient)
    {
    }

    /**
     * Fetches the board page by page and returns the rows grouped by page.
     *
     * @param array    $board Board descriptor; the keys `url`, `name` and `class` are read.
     * @param int|null $take  Maximum number of pages to fetch; null follows the "prev" links
     *                        until none is left. The first page is always fetched, even for values below 1.
     * @return array One entry per fetched page, each a list of rows as built by parseCols().
     */
    public function fetch(array $board, ?int $take = null)
    {
        $url = $board['url'];
        $results = [];
        // Number of pages fetched so far.
        $page = 0;
        do {
            $page++;
            $html = $this->sendRequest($url);
            $results[] = array_map(
                fn (string $row) => $this->parseCols($row, $board),
                $this->parseRows($html)
            );
            // Stop as soon as the requested number of pages has been collected.
            if ($take !== null && $page >= $take) {
                break;
            }
            // A page without paging buttons (e.g. an error page) yields no 'prev'
            // entry; treat that as "no older page" instead of reading a missing key.
            $url = $this->parsePagination($html)['prev'] ?? null;
        } while ($url !== null);

        return $results;
    }

    /**
     * Sends a GET request with browser-like headers and returns the response body.
     *
     * The `over18=1` cookie is always sent, and the Referer header is fixed to the
     * Gossiping index regardless of $url.
     *
     * @param mixed $url Address handed to the PSR-7 Request as-is.
     */
    private function sendRequest($url): string
    {
        $request = new Request('GET', $url, [
            'Accept' => 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8',
            'Accept-Encoding' => 'gzip, deflate, br',
            'Accept-Language' => 'zh-TW,zh;q=0.8',
            'Cache-Control' => 'max-age=0',
            'Cookie' => 'over18=1',
            'Referer' => 'https://www.ptt.cc/bbs/Gossiping/index.html',
            'Sec-Ch-Ua' => '"Brave";v="117", "Not;A=Brand";v="8", "Chromium";v="117"',
            'Sec-Ch-Ua-Mobile' => '?0',
            'Sec-Ch-Ua-Platform' => '"macOS"',
            'Sec-Fetch-Dest' => 'document',
            'Sec-Fetch-Mode' => 'navigate',
            'Sec-Fetch-Site' => 'same-origin',
            'Sec-Fetch-User' => '?1',
            'Sec-Gpc' => '1',
            'Upgrade-Insecure-Requests' => '1',
            'User-Agent' => 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.0.0 Safari/537.36',
        ]);

        return (string) $this->httpClient->sendRequest($request)->getBody();
    }

    /**
     * Extracts the paging buttons (最舊 / 上頁 / 下頁 / 最新) from a board page.
     *
     * @return array Map with the keys `oldest`, `prev`, `next`, `latest` for every button
     *               found; the value is an absolute URL, or null when the button has no href.
     */
    private function parsePagination(string $html): array
    {
        // [^"]+ and [^<]* keep each match inside a single <a> element, so the result
        // does not depend on every button sitting on its own line.
        preg_match_all(
            '/<a class="btn wide( disabled)?"( href="(?<href>[^"]+)")?>[^<]*(?<name>(最舊|上頁|下頁|最新))[^<]*?<\/a>/',
            $html,
            $matches
        );
        $lookup = ['最舊' => 'oldest', '上頁' => 'prev', '下頁' => 'next', '最新' => 'latest'];
        $pagination = [];
        foreach ($matches['name'] as $index => $name) {
            // preg_match_all reports an optional group that did not take part as ''.
            $href = $matches['href'][$index];
            $pagination[$lookup[$name]] = $href !== '' ? self::BASE_URL . $href : null;
        }

        return $pagination;
    }

    /**
     * Turns the HTML of one list row into an associative array.
     *
     * @param string $row   HTML fragment of a single row, as returned by parseRows().
     * @param array  $board Board descriptor; `name` and `class` are copied into the row.
     * @return array Keys: board_name, board_class, url, type, title, plus whichever of
     *               nrec/author/date the row contains. `url` is null when the title cell
     *               has no link; `type` is '' and `title` is the whole cell text when the
     *               title has no "[category]" prefix.
     */
    private function parseCols(string $row, array $board): array
    {
        preg_match_all('/<div class="(?<name>(nrec|title|author|date))"[^>]*>(?<value>.*?)<\/div>/s', $row, $matches);
        $cols = [
            'board_name' => $board['name'],
            'board_class' => $board['class'],
        ];
        foreach ($matches['name'] as $index => $name) {
            $cols[$name] = trim($matches['value'][$index]);
        }
        $cols['nrec'] = strip_tags($cols['nrec'] ?? '');

        $titleHtml = $cols['title'] ?? '';
        // Title cells without an <a> (e.g. deleted articles) simply have no URL.
        $cols['url'] = preg_match('/href="([^"]*)"/', $titleHtml, $matched) === 1
            ? self::BASE_URL . $matched[1]
            : null;

        $titleText = trim(strip_tags($titleHtml));
        // [^\]]+ stops at the first closing bracket, so brackets later in the
        // title stay part of the title instead of being swallowed by the category.
        if (preg_match('/\[([^\]]+)\](.+)/', $titleText, $matched) === 1) {
            $cols['type'] = trim($matched[1]);
            $cols['title'] = trim($matched[2]);
        } else {
            $cols['type'] = '';
            $cols['title'] = $titleText;
        }

        return $cols;
    }

    /**
     * Splits a board page into the raw HTML fragments of its rows.
     *
     * Each fragment starts at `class="r-ent">`; the U modifier makes the quantifiers
     * lazy so a match ends at the first closing tag that satisfies the pattern.
     *
     * @return array List of HTML fragments, one per row.
     */
    private function parseRows(string $html): array
    {
        preg_match_all('/class="r-ent">.+<div class="mark">(.+)<\/div>/sU', $html, $matches);

        return $matches[0];
    }
}
<?php
namespace Recca0120\Ithome30\Tests\Crawlers;
use Mockery;
use GuzzleHttp\Client;
use PHPUnit\Framework\TestCase;
use Recca0120\Ithome30\Crawlers\Board;
/**
 * Exercises Board::fetch() against the HTTP traffic stored in the
 * `ptt_gossiping.yaml` VCR cassette.
 */
class BoardTest extends TestCase
{
    /**
     * Fetching with a page limit of 2 returns two pages, each holding its own list of rows.
     */
    public function test_fetch_board_articles_list()
    {
        \VCR\VCR::turnOn();
        \VCR\VCR::insertCassette('ptt_gossiping.yaml');

        try {
            /** @var Mockery\Mock|\Psr\Http\Client\ClientInterface $httpClient */
            $httpClient = Mockery::spy(new Client());
            $crawler = new Board($httpClient);

            $records = $crawler->fetch([
                'name' => 'Gossiping',
                "nuser" => '8803',
                'class' => '綜合',
                'title' => '[八卦] 亞運李智凱、許皓鋐奪金!',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/index.html'
            ], 2);

            // Exactly two pages were fetched.
            self::assertCount(2, $records);
            // The first page holds 23 rows.
            self::assertCount(23, $records[0]);
            // assertEquals compares associative arrays without regard to key order.
            self::assertEquals([
                'board_name' => 'Gossiping',
                'board_class' => '綜合',
                'nrec' => '4',
                'type' => '問卦',
                'title' => '司機夫人真的有去卡地亞血拚$1.1M嗎?',
                'author' => 'uwmtsa',
                'date' => '10/06',
                'url' => 'https://www.ptt.cc/bbs/Gossiping/M.1696537444.A.1A5.html',
            ], $records[0][0]);
        } finally {
            // Release the cassette even when an assertion fails, so the recorder
            // is not left switched on for whatever test runs next.
            \VCR\VCR::eject();
            \VCR\VCR::turnOff();
        }
    }
}
這樣我們就完成了分頁功能