Skip to content

Commit

Permalink
mb_convert_encodingを用いたHTMLエンティティ化が非推奨になったので、別実装で代替
Browse files Browse the repository at this point in the history
  • Loading branch information
shibafu528 committed Nov 23, 2024
1 parent dfc4c9f commit 5c8e008
Show file tree
Hide file tree
Showing 7 changed files with 28 additions and 7 deletions.
3 changes: 2 additions & 1 deletion app/MetadataResolver/ActivityPubResolver.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\MetadataResolver;

use App\Facades\Formatter;
use GuzzleHttp\Exception\TransferException;
use Illuminate\Support\Facades\Log;
use Psr\Http\Message\ResponseInterface;
Expand Down Expand Up @@ -81,7 +82,7 @@ private function html2text(string $html): string
return '';
}

$html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');
$html = Formatter::htmlEntities($html, 'UTF-8');

Check warning on line 85 in app/MetadataResolver/ActivityPubResolver.php

View check run for this annotation

Codecov / codecov/patch

app/MetadataResolver/ActivityPubResolver.php#L85

Added line #L85 was not covered by tests
$html = preg_replace('~<br\s*/?\s*>|</p>\s*<p[^>]*>~i', "\n", $html);
$dom = new \DOMDocument();
$dom->loadHTML($html, LIBXML_HTML_NOIMPLIED | LIBXML_HTML_NODEFDTD);
Expand Down
5 changes: 3 additions & 2 deletions app/MetadataResolver/DLsiteResolver.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\MetadataResolver;

use App\Facades\Formatter;
use GuzzleHttp\Client;

class DLsiteResolver implements Resolver
Expand Down Expand Up @@ -29,7 +30,7 @@ public function __construct(Client $client, OGPResolver $ogpResolver)
public function extractTags(string $html): array
{
$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8'));
@$dom->loadHTML(Formatter::htmlEntities($html, 'UTF-8'));
$xpath = new \DOMXPath($dom);

$genreNode = $xpath->query("//div[@class='main_genre'][1]");
Expand Down Expand Up @@ -79,7 +80,7 @@ public function resolve(string $url): Metadata
$metadata = $this->ogpResolver->parse($res->getBody());

$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($res->getBody(), 'HTML-ENTITIES', 'UTF-8'));
@$dom->loadHTML(Formatter::htmlEntities($res->getBody(), 'UTF-8'));
$xpath = new \DOMXPath($dom);

// OGPタイトルから[]に囲まれているmakerを取得する
Expand Down
3 changes: 2 additions & 1 deletion app/MetadataResolver/MelonbooksResolver.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\MetadataResolver;

use App\Facades\Formatter;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;

Expand Down Expand Up @@ -38,7 +39,7 @@ public function resolve(string $url): Metadata
$metadata = $this->ogpResolver->parse($res->getBody());

$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($res->getBody(), 'HTML-ENTITIES', 'UTF-8'));
@$dom->loadHTML(Formatter::htmlEntities($res->getBody(), 'UTF-8'));
$xpath = new \DOMXPath($dom);
$descriptionNodelist = $xpath->query('//div[contains(@class, "item-detail")]/*[contains(@class, "page-headline") and contains(text(), "作品詳細")]/following-sibling::div[1]');
$specialDescriptionNodelist = $xpath->query('//div[contains(@class, "item-detail")]/*[contains(@class, "page-headline") and contains(text(), "スタッフのオススメポイント")]/following-sibling::div[1]');
Expand Down
3 changes: 2 additions & 1 deletion app/MetadataResolver/NarouResolver.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\MetadataResolver;

use App\Facades\Formatter;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;

Expand Down Expand Up @@ -35,7 +36,7 @@ public function resolve(string $url): Metadata

// 一見旧式のDOMDocumentを使っているように見えるがこれは罠で、なろうのHTMLはDOMCrawlerだとパースに失敗する
$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'ASCII,JIS,UTF-8,eucJP-win,SJIS-win'));
@$dom->loadHTML(Formatter::htmlEntities($html, 'ASCII,JIS,UTF-8,eucJP-win,SJIS-win'));
$xpath = new \DOMXPath($dom);

$metadata = $this->ogpResolver->parse($html);
Expand Down
3 changes: 2 additions & 1 deletion app/MetadataResolver/OGPResolver.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\MetadataResolver;

use App\Facades\Formatter;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;
use GuzzleHttp\RequestOptions;
Expand Down Expand Up @@ -30,7 +31,7 @@ public function parse(string $html, ?OGPParsePriority $priority = null): Metadat
}

$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($html, 'HTML-ENTITIES', 'ASCII,JIS,UTF-8,eucJP-win,SJIS-win'));
@$dom->loadHTML(Formatter::htmlEntities($html, 'ASCII,JIS,UTF-8,eucJP-win,SJIS-win'));
$xpath = new \DOMXPath($dom);

$metadata = new Metadata();
Expand Down
3 changes: 2 additions & 1 deletion app/MetadataResolver/ToranoanaResolver.php
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

namespace App\MetadataResolver;

use App\Facades\Formatter;
use GuzzleHttp\Client;
use GuzzleHttp\Cookie\CookieJar;

Expand All @@ -28,7 +29,7 @@ public function resolve(string $url): Metadata
$metadata = $this->ogpResolver->parse($res->getBody());

$dom = new \DOMDocument();
@$dom->loadHTML(mb_convert_encoding($res->getBody(), 'HTML-ENTITIES', 'UTF-8'));
@$dom->loadHTML(Formatter::htmlEntities($res->getBody(), 'UTF-8'));
$xpath = new \DOMXPath($dom);
$imgNode = $xpath->query('//*[@id="preview"]//img')->item(0);
if ($imgNode !== null) {
Expand Down
15 changes: 15 additions & 0 deletions app/Utilities/Formatter.php
Original file line number Diff line number Diff line change
Expand Up @@ -150,4 +150,19 @@ public function sanitizeLike(string $value): string
{
return preg_replace('/[%_]/', '\\\\$0', $value);
}

/**
 * Replacement for `mb_convert_encoding($input, 'HTML-ENTITIES', $from)`, whose
 * 'HTML-ENTITIES' target encoding is deprecated in newer PHP versions.
 *
 * Every code point from U+0080 upwards is rewritten as a decimal numeric
 * character reference (e.g. `&#12354;`). ASCII bytes, including markup
 * characters such as `<`, `>` and `&`, are passed through unchanged.
 *
 * @param string $input The string to convert.
 * @param string|null $fromEncoding Encoding of `$input`. A comma-separated candidate list is
 *                                  passed straight to `mb_convert_encoding`. When null, mbstring's
 *                                  internal encoding is used (normally UTF-8).
 * @return string The input with all non-ASCII characters replaced by numeric entities.
 */
public function htmlEntities(string $input, ?string $fromEncoding = null): string
{
    // Non-Unicode input cannot be entity-encoded reliably, so normalize it to UTF-8 first.
    $utf8 = mb_convert_encoding($input, 'UTF-8', $fromEncoding);

    // Convmap is [first code point, last code point, offset, mask]: it covers U+0080..U+10FFFF
    // with no offset and a 21-bit mask, which is wide enough for every Unicode code point.
    // The encoding is passed explicitly because $utf8 is always UTF-8 at this point, regardless
    // of what mb_internal_encoding() is currently set to.
    // Reference: https://github.com/php/php-src/pull/7177#issuecomment-1317296767
    return mb_encode_numericentity($utf8, [0x80, 0x10FFFF, 0, 0x1FFFFF], 'UTF-8');
}
}

0 comments on commit 5c8e008

Please sign in to comment.