-
Notifications
You must be signed in to change notification settings - Fork 11
Home
姜 天戩 Mike Tian-Jian Jiang edited this page Jan 13, 2022
·
10 revisions
Welcome to the metadata wiki!
Some old PHP code for HTML text extraction
HtmlContentExtractor
/**
 * Extracts human-readable text (and image sources) from a DOMDocument.
 *
 * Text is returned either as a map keyed by XPath-like node paths or as a
 * plain list; see getNodePathAndTextMap() for the key formats.
 */
class HtmlContentExtractor
{
    /** @var \DOMDocument Document the text is extracted from */
    private $dom;

    /** @var \DOMXPath XPath evaluator bound to the document */
    protected $xpath;

    /**
     * Decimal code points that the first normalization pass treats as white
     * space. Each one is matched in its "&#N;" form after encode().
     *
     * @var int[]
     */
    private static $whiteSpaceCodePoints = array(
        9, 10, 11, 12, 13, 32, 133, 160,
        5760, 6158,
        8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202,
        8204, 8205, 8206, 8207,
        8232, 8233, 8239, 8287, 12288,
    );

    /**
     * Constructor
     *
     * Forces the document to UTF-8, disables pretty-printing and binds an
     * XPath evaluator to it. The error handler named below is installed only
     * for the duration of this set-up and is always restored afterwards.
     * libxml internal error collection is switched on and left on.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);
        try {
            // The type hint already guarantees a DOMDocument instance, so no
            // separate validity check is needed.
            $this->dom = $dom;
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * Entries are added in this order, with these key formats:
     *  - text blocks: "<node path>" for a bare text node, or
     *    "<node path>|<opening tag>|<closing tag>" when the text is reported
     *    through an enclosing block element (value: the element's inner HTML);
     *  - meta description/keywords: path of the meta element (value: content);
     *  - input buttons/text fields: path of the input element (value: value);
     *  - input placeholders: path of the input element + "/@placeholder";
     *  - alt attributes: path of the attribute node itself.
     *
     * Values made only of white space and digits are skipped for text nodes,
     * meta contents, input values and placeholders.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $textArray = array();

        foreach (array_reverse($this->collectTextBlocks()) as $path => $html) {
            // Serialized HTML can carry carriage returns as the literal
            // reference "&#13;"; drop those while keeping real line feeds.
            $textArray[$path] = str_replace('&#13;', '', $html);
        }

        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            $content = $meta->getAttribute("content");
            if (self::isTranslatableValue($content)) {
                $textArray[$meta->getNodePath()] = $content;
            }
        }

        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            $value = $input->getAttribute("value");
            if (self::isTranslatableValue($value)) {
                $textArray[$input->getNodePath()] = $value;
            }
        }

        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        /** @var \DOMElement $input */
        foreach ($inputsWithPlaceholder as $input) {
            $placeholder = $input->getAttribute("placeholder");
            if (self::isTranslatableValue($placeholder)) {
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $placeholder;
            }
        }

        // Only the alt attribute is collected here; the value is taken as-is.
        $attributes = $this->xpath->query("//*[string(@alt)]/@alt");
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Get text array
     *
     * @return array Values of getNodePathAndTextMap(), re-indexed from 0
     * @throws \Exception
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get every extracted text paired with the line number of its node.
     *
     * @return array List of [line number, text] pairs
     * @throws \Exception When a node path from the map cannot be resolved
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        foreach ($this->getNodePathAndTextMap() as $nodePathWithHeadTail => $text) {
            // Block keys look like "path|head|tail"; only the path part is a
            // valid XPath expression.
            $parts = explode('|', $nodePathWithHeadTail, 2);
            $nodePath = $parts[0];
            $nodeList = $this->xpath->query($nodePath);
            if (false === $nodeList || 0 === $nodeList->length) {
                throw new \Exception(
                    "$nodePath does not resolve to a node."
                );
            }
            $textWithLineNumberList[] = [$nodeList->item(0)->getLineNo(), $text];
        }
        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array Map of img node path => value of its src attribute
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = array();
        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMNode $i */
        foreach ($images as $i) {
            $imageArray[$i->getNodePath()]
                = $i->attributes->getNamedItem('src')->nodeValue;
        }
        return $imageArray;
    }

    /**
     * Get image array
     *
     * @return array Values of getNodePathAndImageMap(), re-indexed from 0
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * Removes white space between tags, leading/trailing white space of every
     * line and all line breaks. With $jsonFriendly, double quotes are prefixed
     * with a backslash (nothing else is escaped).
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);
        if ($jsonFriendly) {
            $html = str_replace('"', '\"', $html);
        }
        return $html;
    }

    /**
     * Encode every character of a UTF-8 string as a decimal numeric character
     * reference ("A" becomes "&#65;").
     *
     * @param String $str Text String
     *
     * @return string
     */
    public static function encode($str)
    {
        // Big-endian is spelled out so the byte order always matches the "N"
        // (unsigned 32-bit big-endian) unpack format below.
        $codePoints = unpack("N*", mb_convert_encoding($str, 'UTF-32BE', 'UTF-8'));
        $references = array_map(
            function ($n) {
                return "&#$n;";
            },
            $codePoints
        );
        return implode("", $references);
    }

    /**
     * Normalize white space inside the text
     *
     * Runs of white-space characters, and of literal white-space-like entity
     * references found in the text, collapse into a single ordinary space;
     * the result is trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception
     */
    public static function whiteSpaceNormalization($text)
    {
        // Pass 1: work on decimal references so uncommon white space can be
        // matched by code point.
        $text = self::replaceOrFail(
            '/(?:&#(?:' . implode('|', self::$whiteSpaceCodePoints) . ');)+/u',
            " ",
            self::encode($text)
        );

        // Back to a normal string. References that were literally present in
        // the input (e.g. the six characters "&nbsp;") come out of this step
        // unchanged, which is why pass 2 matches them as text.
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Pass 2: ordinary white space, literal named/hex references and BOM.
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        // NOTE(review): the last line-break alternative of the original
        // pattern was lost to entity decoding; "&#10;" and "&#13;" are a
        // reconstruction - confirm against the original repository.
        $text = self::replaceOrFail(
            '/(?:\s'
            .'|&(?:nbsp|uml|shy|macr|acute|cedil'
            .'|ensp|emsp|thinsp|zwnj|zwj|lrm|rlm'
            .'|#xA0|#xA8|#xAD|#xAF|#xB4|#xB8'
            .'|#x2002|#x2003|#x2009|#x200C|#x200D|#x200E|#x200F'
            .'|#xA|#10|#13);'
            .'|\x{FEFF})+/u',
            " ",
            $text
        );

        return trim($text);
    }

    /**
     * Collect the text blocks of the document, before reversal and clean-up.
     *
     * @return array Map of block key => HTML
     * @throws \Exception
     */
    private function collectTextBlocks()
    {
        $blocks = array();
        $textNodes = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        // Helper defined elsewhere in the project; presumably orders the
        // nodes by depth - TODO confirm.
        $sortedTextNodes = GetDepthSortedDomNodeArrayFromDomNodeList($textNodes);

        /** @var \DOMNode $textNode */
        foreach ($sortedTextNodes as $textNode) {
            if (!$this->hasTranslatableText($textNode)) {
                continue;
            }

            $nodePath = $textNode->getNodePath();
            $ancestorQuery = self::buildBlockAncestorQuery($nodePath);
            $ancestors = $this->xpath->query($ancestorQuery);
            if (!$ancestors) {
                throw new \Exception(
                    "$ancestorQuery is an incorrect XPath query."
                );
            }
            if (0 === $ancestors->length) {
                // No enclosing block element: report the text node itself.
                $blocks[$nodePath] = $this->dom->saveHTML($textNode);
                continue;
            }

            // Walk from the innermost qualifying ancestor outwards. If any of
            // them was already extracted, this text is covered by that block.
            $innermost = null;
            $isExtractedBlock = false;
            for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                $block = $this->describeBlock($ancestors->item($i));
                if (null === $innermost) {
                    $innermost = $block;
                }
                if (array_key_exists($block['key'], $blocks)) {
                    $isExtractedBlock = true;
                    break;
                }
            }
            if ($isExtractedBlock) {
                continue;
            }

            // Inner HTML = everything after the opening tag up to the last
            // closing tag; keep the whole remainder if no closing tag exists.
            $end = strripos($innermost['rest'], $innermost['tail']);
            $blocks[$innermost['key']] = (false === $end)
                ? $innermost['rest']
                : substr($innermost['rest'], 0, $end);
        }

        return $blocks;
    }

    /**
     * Tell whether a text node still has content after white space, tags and
     * digits are removed from its serialized form.
     *
     * @param \DOMNode $textNode Text node
     *
     * @return bool
     * @throws \Exception
     */
    private function hasTranslatableText(\DOMNode $textNode)
    {
        $probe = self::whiteSpaceNormalization($this->dom->saveHTML($textNode));
        $probe = preg_replace('/<[^>]+>/u', '', $probe);
        $probe = preg_replace('/[\s\d]+/u', '', $probe);
        return $probe !== '';
    }

    /**
     * Serialize a block element and split it into the pieces used for keys.
     *
     * @param \DOMNode $blockNode Block element
     *
     * @return array 'key' => "path|opening tag|closing tag",
     *               'rest' => serialized HTML after the opening tag,
     *               'tail' => closing tag string
     */
    private function describeBlock(\DOMNode $blockNode)
    {
        $name = $blockNode->nodeName;
        $parts = preg_split(
            '/^(<' . preg_quote($name, '/') . '[^>]*>)/iu',
            $this->dom->saveHTML($blockNode),
            2,
            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
        );
        $head = $parts[0];
        $tail = "</$name>";

        return array(
            'key' => $blockNode->getNodePath() . "|$head|$tail",
            'rest' => isset($parts[1]) ? $parts[1] : '',
            'tail' => $tail,
        );
    }

    /**
     * Build the XPath union selecting the ancestors of a node that qualify as
     * a text block.
     *
     * @param string $nodePath Path of the text node
     *
     * @return string
     */
    private static function buildBlockAncestorQuery($nodePath)
    {
        $noTable = 'not(descendant::table)';
        $noDiv = 'not(descendant::div)';
        $noCode = 'not(descendant::code)';
        $ownText = 'normalize-space(text())';
        $inline = "$noTable and $noCode";

        $predicates = array(
            'p' => $inline,
            'a' => $inline,
            'div' => "$noTable and $noDiv and $noCode and $ownText",
            'font' => $inline,
            'span' => $inline,
            'li' => "$noTable and not(descendant::li) and $noCode",
            'dt' => $noTable,
            'dd' => $noTable,
            'td' => "$noTable and $noDiv and $noCode and $ownText",
            'th' => $noTable,
            'b' => $inline,
            'i' => $inline,
            'u' => $inline,
            'tt' => $noTable,
            'blockquote' => $inline,
            'strike' => $inline,
            'em' => $inline,
            'strong' => $inline,
            'iframe' => "$noTable and $noDiv and $ownText",
        );

        $steps = array();
        foreach ($predicates as $tag => $predicate) {
            $steps[] = "$nodePath/ancestor::{$tag}[$predicate]";
        }
        return implode(' | ', $steps);
    }

    /**
     * Tell whether an attribute value is worth extracting, i.e. it is not
     * made exclusively of white space and digits.
     *
     * @param string $value Attribute value
     *
     * @return bool
     */
    private static function isTranslatableValue($value)
    {
        return 1 !== preg_match('/^[\s\d]+$/u', $value);
    }

    /**
     * preg_replace() that raises instead of returning null on failure.
     *
     * @param string $pattern     Regular expression
     * @param string $replacement Replacement
     * @param string $subject     Subject
     *
     * @return string
     * @throws \Exception
     */
    private static function replaceOrFail($pattern, $replacement, $subject)
    {
        $result = preg_replace($pattern, $replacement, $subject);
        if (is_null($result)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }
        return $result;
    }
}
HtmlContentExtractorTest
/**
 * Tests for HtmlContentExtractor, run against fixture pages in the _data
 * folder and, where supported, against both DOM factories (PhpDom, Html5Dom).
 */
class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
{
    protected $dataFolderPath;
    protected $oInnPage;
    protected $solarePage;
    /** @var \DOMDocument $phpDom */
    protected $phpDom;
    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;
    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    /**
     * Resolve the fixture paths used by the tests.
     *
     * @return void
     */
    public function setUp()
    {
        parent::setUp();
        $this->dataFolderPath
            = __DIR__ . DIRECTORY_SEPARATOR .'_data'. DIRECTORY_SEPARATOR;
        $this->oInnPage = $this->dataFolderPath .'www.o-inn.co.jp_index.html';
        $this->solarePage = $this->dataFolderPath .'www.solarehotels.com.html';
    }

    /**
     * Load one fixture with both DOM factories.
     *
     * @param string $file Path of the HTML fixture
     *
     * @return array [PhpDom document, Html5Dom document]
     */
    private function makeDoms($file)
    {
        return [PhpDom::make($file), Html5Dom::make($file)];
    }

    /**
     * Test line numbers of extracted text with the PhpDom factory
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumber()
    {
        $this->extractor = new HtmlContentExtractor(
            PhpDom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test line numbers of extracted text with the Html5Dom factory
     *
     * Kept separate so that skipping it does not hide the PhpDom result.
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumberOnHtml5Dom()
    {
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');
        // Not reached while the skip above is in place.
        $this->extractor = new HtmlContentExtractor(
            Html5Dom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test get node path and text map
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
    {
        $expectedMetaKeywordsNodePath = "/html/head/meta[5]";
        $expectedMetaKeywordsText
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
            ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        foreach ($this->makeDoms($this->solarePage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
        }

        $expectedMetaKeywordsNodePath = "/html/head/meta[3]";
        $expectedMetaKeywordsText
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
            ."ビジネスホテル,文京区,東京ドーム,出張,宿泊予約";
        $expectedInterpolatedCaseNodePath
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
        $expectedInterpolatedCaseText
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
            .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
            .' target="_blank">'
            . PHP_EOL //< Masterminds\HTML5 seems not using source's EOL.
            .' ホテルお茶の水イン'
            .'</a>'
            .'のクチコミはTripAdvisorより提供を受けています'
        ;
        foreach ($this->makeDoms($this->oInnPage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
            $this->assertArrayHasKey(
                $expectedInterpolatedCaseNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedInterpolatedCaseText,
                $nodePathAndTextMap[$expectedInterpolatedCaseNodePath]
            );
        }
    }

    /**
     * Test get text array
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
    {
        $expectedCommonCase
            = ' * 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &'
            .' RESORTS official website.<br>'."\n"
            .' * 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        foreach ($this->makeDoms($this->solarePage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
        }

        $expectedCommonCase
            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
            .'東京都文京区湯島1-3-7<br>TEL:03-3813-8211<br>'
            .'FAX:03-3813-9730<br>'
            .'<a href="/transportation/">お茶の水インまでの地図</a>'
        ;
        $expectedInterpolatedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
            .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
            .' target="_blank">'
            ."\n ホテルお茶の水イン"
            .'</a>'
            .'のクチコミはTripAdvisorより提供を受けています'
        ;
        foreach ($this->makeDoms($this->oInnPage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
            $this->assertContains($expectedInterpolatedCase, $textArray);
        }
    }

    /**
     * Test get text array on ill formed html
     *
     * First checks that the fixture really contains the stray closing tag,
     * then that the extracted text comes out without it.
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
    {
        $unpreparedHtml = file_get_contents($this->oInnPage);
        $illformedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n ホテルお茶の水イン</a>"
            .'のクチコミはTripAdvisorより提供を受けています'
            .'</a>'
        ;
        // Strict comparison: strpos() may legitimately return offset 0, which
        // a loose comparison would confuse with false.
        $this->assertNotSame(false, strpos($unpreparedHtml, $illformedCase));

        $expectedCase
            = '掲載されている'
            . '<a href="http://www.tripadvisor.jp/'
            . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            . "\n ホテルお茶の水イン</a>"
            . 'のクチコミはTripAdvisorより提供を受けています';
        foreach ($this->makeDoms($this->oInnPage) as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCase, $textArray);
        }
    }

    /**
     * Testing for placeholder attribute extraction from input tag
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        $expectedPath = '/html/body/input/@placeholder';
        $expectedText = 'お名前';
        $file = $this->dataFolderPath .'inputPlaceholderTest.html';
        foreach ($this->makeDoms($file) as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Testing for alt attribute extraction from image tag
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        $expectedPath = '/html/body/img/@alt';
        $expectedText = '画像です';
        $file = $this->dataFolderPath .'imageAltTest.html';
        foreach ($this->makeDoms($file) as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Test BOM removal
     *
     * @ticket ZEN-2579
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnBOM()
    {
        $unexpectedPath = '/html/body/div[1]/text()[3]';
        $file = $this->dataFolderPath
            .'www.yokohamabay-sheraton.co.jp__other_facilities.html';
        foreach ($this->makeDoms($file) as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($unexpectedPath, $nodePathAndTextMap);
        }
    }

    // Disabled test, kept for reference.
    // /**
    //  * Test sorting the node-path-to-text map by the line numbers of the HTML
    //  *
    //  * @return void
    //  */
    // public function testSortNodePathAndTextMapByLineNum()
    // {
    //     $file = $this->dataFolderPath .'replaceNodeXpath.html';
    //     $doms = [PhpDom::make($file), Html5Dom::make($file)];
    //     foreach ($doms as $dom) {
    //         $extractor = new HtmlContentExtractor($dom);
    //         $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
    //         $this->assertEquals(
    //             'HTML Test', array_values($nodePathAndTextMap)[0]
    //         );
    //         $this->assertEquals(
    //             '選べるリージョンとゾーン',
    //             array_values($nodePathAndTextMap)[4]
    //         );
    //     }
    // }
}
PartialHtmlWrapper
/**
 * Turns an HTML fragment into a complete document by surrounding it with a
 * fixed header and footer, and strips those parts off again.
 */
class PartialHtmlWrapper
{
    /** Document prologue placed in front of a fragment by wrap(). */
    const HEADER = <<<'HTML_HEADER'
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type"></head><body>
HTML_HEADER;

    /**
     * Alternative prologue that unwrap() also removes; the name suggests the
     * canonicalized (C14N) form of HEADER.
     * NOTE(review): the apostrophe in front of </meta> looks unintended -
     * confirm before relying on this constant matching real output.
     */
    const HEADER_C14N = <<<'HTML_HEADER_C14N'
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.0 Transitional//EN" "http://www.w3.org/TR/REC-html40/loose.dtd">
<html><head><meta content="text/html; charset=UTF-8" http-equiv="Content-Type">'</meta></head><body>
HTML_HEADER_C14N;

    /** Document epilogue appended to a fragment by wrap(). */
    const FOOTER = <<<'HTML_FOOTER'
</body></html>
HTML_FOOTER;

    /**
     * Wrap the partial HTML
     *
     * @param string $partialHtml Partial HTML
     *
     * @return string HEADER, the fragment and FOOTER joined together
     */
    public static function wrap($partialHtml)
    {
        $document = self::HEADER;
        $document .= $partialHtml;
        $document .= self::FOOTER;
        return $document;
    }

    /**
     * Unwrap wrapped partial HTML
     *
     * Every occurrence of HEADER_C14N, then HEADER, then FOOTER is removed,
     * wherever it appears in the input.
     *
     * @param string $wrappedPartialHtml Wrapped partial HTML
     *
     * @return string
     */
    public static function unwrap($wrappedPartialHtml)
    {
        $stripped = $wrappedPartialHtml;
        foreach ([self::HEADER_C14N, self::HEADER, self::FOOTER] as $marker) {
            $stripped = str_replace($marker, '', $stripped);
        }
        return $stripped;
    }
}