-
Notifications
You must be signed in to change notification settings - Fork 11
Home
姜 天戩 Mike Tian-Jian Jiang edited this page Jan 13, 2022
·
10 revisions
Welcome to the metadata wiki!
Some old PHP code for HTML text extraction
HtmlContentExtractor
/**
 * Extracts translatable text (text blocks, selected attribute values) and
 * image sources from a DOMDocument, keyed by XPath node path.
 */
class HtmlContentExtractor
{
    /**
     * Decimal code points that whiteSpaceNormalization() collapses into an
     * ordinary space while the text is in "&#N;" form.
     *
     * @var int[]
     */
    private static $whiteSpaceCodePoints = [
        9, 10, 11, 12, 13, 32, 133, 160, 5760, 6158,
        8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202,
        8204, 8205, 8206, 8207, 8232, 8233, 8239, 8287, 12288,
    ];

    /** @var \DOMDocument */
    private $dom;

    /** @var \DOMXPath */
    protected $xpath;

    /**
     * Constructor
     *
     * Forces the document to UTF-8 without pretty-printing and prepares the
     * XPath evaluator used by every extraction method.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        // The handler is defined outside this class; presumably it turns PHP
        // warnings into exceptions -- TODO confirm.
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);
        try {
            // The type declaration already guarantees a DOMDocument instance,
            // so no additional validity check is needed here.
            $this->dom = $dom;
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            restore_error_handler();
            throw $e;
        }
        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * Keys are either a plain node path (bare text nodes, meta/input
     * elements, attribute nodes) or "nodePath|startTag|endTag" for text that
     * was extracted together with its enclosing block element.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $textArray = [];

        // Blocks come back in collection order; the result lists them reversed.
        // array_reverse() keeps the (string) keys intact.
        foreach (array_reverse($this->collectTextBlocks()) as $path => $html) {
            // NOTE(review): the original literal was a raw line break inside
            // quotes; it may have been a character reference such as "&#13;"
            // before the page was rendered -- confirm against the real source.
            $textArray[$path] = str_replace("\n", '', $html);
        }

        $metaNodes = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metaNodes as $meta) {
            $content = $meta->getAttribute("content");
            if (self::isMoreThanDigitsAndSpaces($content)) {
                $textArray[$meta->getNodePath()] = $content;
            }
        }

        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            $value = $input->getAttribute("value");
            if (self::isMoreThanDigitsAndSpaces($value)) {
                $textArray[$input->getNodePath()] = $value;
            }
        }

        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        /** @var \DOMElement $input */
        foreach ($inputsWithPlaceholder as $input) {
            $placeholder = $input->getAttribute("placeholder");
            if (self::isMoreThanDigitsAndSpaces($placeholder)) {
                $textArray[$input->getNodePath() .'/@placeholder'] = $placeholder;
            }
        }

        // Only "alt" is extracted; its values are taken as-is, without the
        // digits/white-space filter applied to the values above.
        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Get text array
     *
     * @return array
     * @throws \Exception
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts as a list of [line number, text] pairs.
     *
     * @return array
     * @throws \Exception When a node path from the map cannot be resolved
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        foreach ($this->getNodePathAndTextMap() as $nodePathWithHeadTail => $text) {
            // Block keys look like "path|<tag ...>|</tag>"; only the path is
            // a valid XPath expression.
            $keyParts = explode('|', $nodePathWithHeadTail, 2);
            $nodePath = $keyParts[0];
            $nodeList = $this->xpath->query($nodePath);
            $node = $nodeList ? $nodeList->item(0) : null;
            if (null === $node) {
                // Previously this ended in a fatal "call on null".
                throw new \Exception("Node path $nodePath cannot be resolved.");
            }
            $textWithLineNumberList[] = [$node->getLineNo(), $text];
        }
        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array Node path of each <img> with a non-empty src => src value
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = [];
        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMElement $image */
        foreach ($images as $image) {
            $imageArray[$image->getNodePath()] = $image->getAttribute('src');
        }
        return $imageArray;
    }

    /**
     * Get image array
     *
     * @return array
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        // Drop white space between adjacent tags, then every line break and
        // all leading/trailing white space of each line.
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = (string) preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);
        if ($jsonFriendly) {
            // Backslashes must be escaped together with the quotes, otherwise
            // a "\" in the markup yields an invalid or altered JSON string.
            // strtr() replaces in one pass, so nothing is escaped twice.
            $html = strtr($html, ['\\' => '\\\\', '"' => '\\"']);
        }
        return $html;
    }

    /**
     * Encode the string into HTML Encoding format
     *
     * Every character becomes a decimal numeric character reference
     * ("A" => "&#65;").
     *
     * @param String $str Text String (UTF-8)
     *
     * @return string
     */
    public static function encode($str)
    {
        $str = (string) $str;
        if ('' === $str) {
            return '';
        }
        // Explicit big-endian so the byte order matches unpack()'s "N" format.
        $utf32 = mb_convert_encoding($str, 'UTF-32BE', 'UTF-8');
        $codePoints = unpack('N*', $utf32);
        if (!$codePoints) {
            return '';
        }
        return '&#' . implode(';&#', $codePoints) . ';';
    }

    /**
     * Normalize white space inside the text
     *
     * Runs of white-space characters, and of character references that stand
     * for white space, are collapsed into one ordinary space; the result is
     * trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception
     */
    public static function whiteSpaceNormalization($text)
    {
        // Stage 1: work on the "&#N;" form so that uncommon white-space
        // characters can be matched by code point.
        $text = preg_replace(
            '/(?:&#(?:' . implode('|', self::$whiteSpaceCodePoints) . ');)+/u',
            " ",
            self::encode($text)
        );
        // null means preg_replace itself failed
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // Stage 2: entity references that were literal text in the input
        // (e.g. "&nbsp;" as emitted by saveHTML) are still present here.
        // The named entities below were reconstructed from their hex twins
        // in the same list, and "&#13;" was added as the carriage-return
        // counterpart of "&#10;".
        // NOTE(review): &uml; &macr; &acute; &cedil; are treated as white
        // space because the original list did so -- confirm this is wanted.
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        $text = preg_replace(
            '/(?:\s'
            .'|&(?:nbsp|uml|shy|macr|acute|cedil|ensp|emsp|thinsp'
            .'|zwnj|zwj|lrm|rlm'
            .'|#xA0|#xA8|#xAD|#xAF|#xB4|#xB8'
            .'|#x2002|#x2003|#x2009|#x200C|#x200D|#x200E|#x200F'
            .'|#xA|#10|#13);'
            .'|\x{FEFF})+/u',
            " ",
            $text
        );
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }
        return trim($text);
    }

    /**
     * Collect the HTML of every text block, keyed like the entries of
     * getNodePathAndTextMap(), in the order the text nodes are visited.
     *
     * @return array
     * @throws \Exception
     */
    private function collectTextBlocks()
    {
        $blocks = [];
        $predicates = self::blockAncestorPredicates();
        $textNodes = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        // Helper defined outside this class; presumably it orders the nodes
        // by their depth in the tree -- TODO confirm the direction.
        $sortedTextNodes = GetDepthSortedDomNodeArrayFromDomNodeList($textNodes);

        /** @var \DOMNode $textNode */
        foreach ($sortedTextNodes as $textNode) {
            if (!$this->hasTranslatableText($textNode)) {
                continue;
            }
            $nodePath = $textNode->getNodePath();
            $ancestorQuery = self::buildAncestorQuery($nodePath, $predicates);
            $ancestors = $this->xpath->query($ancestorQuery);
            if (!$ancestors) {
                throw new \Exception(
                    "$ancestorQuery is an incorrect XPath query."
                );
            }
            if (0 === $ancestors->length) {
                // No enclosing block element: keep the bare text node.
                $blocks[$nodePath] = $this->dom->saveHTML($textNode);
                continue;
            }
            if ($this->hasExtractedAncestor($ancestors, $blocks)) {
                // The text is already part of a block collected earlier.
                continue;
            }
            // The node list is in document order, so the last item is the
            // innermost qualifying ancestor.
            list($key, $rest, $tail)
                = $this->splitBlock($ancestors->item($ancestors->length - 1));
            $end = strripos($rest, $tail);
            // Keep everything when the end tag is not found instead of
            // silently storing an empty block.
            $blocks[$key] = (false === $end) ? $rest : substr($rest, 0, $end);
        }
        return $blocks;
    }

    /**
     * Tell whether a node still has content after tags, white space and
     * digits are removed from its serialized HTML.
     *
     * @param \DOMNode $node Node to inspect
     *
     * @return bool
     * @throws \Exception
     */
    private function hasTranslatableText(\DOMNode $node)
    {
        $text = self::whiteSpaceNormalization($this->dom->saveHTML($node));
        $text = preg_replace("/<[^>]+>/u", "", $text);
        $text = preg_replace("/[\s\d]+/u", "", $text);
        return $text !== '';
    }

    /**
     * Tell whether any of the given ancestors was already stored as a block.
     *
     * @param \DOMNodeList $ancestors Qualifying ancestors of a text node
     * @param array        $blocks    Blocks collected so far
     *
     * @return bool
     */
    private function hasExtractedAncestor(\DOMNodeList $ancestors, array $blocks)
    {
        for ($i = $ancestors->length - 1; $i >= 0; $i--) {
            $parts = $this->splitBlock($ancestors->item($i));
            // array_key_exists(): a stored block may be an empty value.
            if (array_key_exists($parts[0], $blocks)) {
                return true;
            }
        }
        return false;
    }

    /**
     * Split the serialized HTML of a block element.
     *
     * @param \DOMNode $blockNode Block element
     *
     * @return array [map key "path|startTag|endTag", HTML after the start
     *               tag, end tag]
     */
    private function splitBlock(\DOMNode $blockNode)
    {
        $outerHtml = $this->dom->saveHTML($blockNode);
        $name = $blockNode->nodeName;
        $head = '';
        $pattern = '/^<' . preg_quote($name, '/') . '[^>]*>/iu';
        if (1 === preg_match($pattern, $outerHtml, $match)) {
            $head = $match[0];
        }
        $rest = (string) substr($outerHtml, strlen($head));
        $tail = "</$name>";
        return [$blockNode->getNodePath() . "|$head|$tail", $rest, $tail];
    }

    /**
     * XPath predicate per element name that may serve as a text block.
     *
     * @return array element name => predicate
     */
    private static function blockAncestorPredicates()
    {
        $noTable = "not(descendant::table)";
        $noTableNoCode = "$noTable and not(descendant::code)";
        $noTableNoDiv = "$noTable and not(descendant::div)";
        $ownText = "normalize-space(text())";

        return [
            'p'          => $noTableNoCode,
            'a'          => $noTableNoCode,
            'div'        => "$noTableNoDiv and not(descendant::code) and $ownText",
            'font'       => $noTableNoCode,
            'span'       => $noTableNoCode,
            'li'         => "$noTable and not(descendant::li)"
                            ." and not(descendant::code)",
            'dt'         => $noTable,
            'dd'         => $noTable,
            'td'         => "$noTableNoDiv and not(descendant::code) and $ownText",
            'th'         => $noTable,
            'b'          => $noTableNoCode,
            'i'          => $noTableNoCode,
            'u'          => $noTableNoCode,
            'tt'         => $noTable,
            'blockquote' => $noTableNoCode,
            'strike'     => $noTableNoCode,
            'em'         => $noTableNoCode,
            'strong'     => $noTableNoCode,
            'iframe'     => "$noTableNoDiv and $ownText",
        ];
    }

    /**
     * Build the union query selecting every qualifying block ancestor of a
     * node.
     *
     * @param string $nodePath   Node path of the text node
     * @param array  $predicates element name => predicate
     *
     * @return string
     */
    private static function buildAncestorQuery($nodePath, array $predicates)
    {
        $branches = [];
        foreach ($predicates as $tag => $predicate) {
            $branches[] = $nodePath . '/ancestor::' . $tag . '[' . $predicate . ']';
        }
        return implode(' | ', $branches);
    }

    /**
     * Tell whether a value contains anything besides white space and digits.
     *
     * @param string $value Attribute value
     *
     * @return bool
     */
    private static function isMoreThanDigitsAndSpaces($value)
    {
        return 1 !== preg_match("/^[\s\d]+$/u", $value);
    }
}