Skip to content
姜 天戩 Mike Tian-Jian Jiang edited this page Jan 13, 2022 · 10 revisions

Welcome to the metadata wiki!

Some old PHP code for HTML text extraction

// Extracts texts and image sources from a DOMDocument, keyed by node path.
// NOTE(review): the braces delimiting this class body are absent from this
// listing; they appear to have been lost when the page was extracted.
class HtmlContentExtractor
    // The document handed to the constructor.
    private $dom;
    // DOMXPath built over $dom in the constructor; used for every query.
    protected $xpath;

    /**
     * Constructor
     *
     * @param \DOMDocument $dom The document to extract content from
     *
     * @throws \Exception
     */
    public function __construct(\DOMDocument $dom)
    {
        // Stores the given document, forces its encoding to UTF-8, turns off
        // formatted output, and builds the DOMXPath used by the queries of
        // this class. Throws \Exception when the document is falsy.
        //
        // The previous try/catch only re-threw the exception it caught, so
        // removing it does not change what callers observe.
        if (!$dom) {
            throw new \Exception("DOMDocument is invalid.");
        }

        $this->dom = $dom;
        $this->dom->encoding = 'UTF-8';
        $this->dom->formatOutput = false;
        $this->xpath = new \DOMXPath($this->dom);
    }


    /**
     * Get the map of node path and text
     *
     * @return array
     *
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
        // Builds $textArray: extracted text keyed by a node-path based key.
        // Four sources are visible below: text nodes (grouped into enclosing
        // blocks), <meta> content, <input> value/placeholder, and an attribute
        // query driven by $attributeName.
        // NOTE(review): this listing is incomplete - braces, closing
        // parentheses and several whole statements are missing (each gap is
        // marked below). Restore them from the original file before running.
        $textArray = array();

        $blocks = array();
        // Text nodes whose parent is not script/style/code and is not marked
        // translate='no'.
        $elements = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        // NOTE(review): GetDepthSortedDomNodeArrayFromDomNodeList is not
        // defined anywhere in this listing; presumably a global helper that
        // orders the nodes by depth - confirm.
        $elementArray = GetDepthSortedDomNodeArrayFromDomNodeList($elements);
        /** @var \DOMNode $e */
        foreach ($elementArray as $e) {
            //$temp = self::whiteSpaceNormalization($e->C14N());
            $temp = self::whiteSpaceNormalization($this->dom->saveHTML($e));
            // Strip tags, then whitespace and digits; a node with nothing left
            // is skipped.
            $temp = preg_replace("/<[^>]+>/u", "", $temp);
            $temp = preg_replace("/[\s\d]+/u", "", $temp);
            if ($temp !== '') {
                $nodePath = $e->getNodePath();
                // NOTE(review): the assignment target of the next expression
                // is missing; presumably $ancestorQuery, which is what the
                // query() call further down reads.
                    = "$nodePath/ancestor::p[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::a[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::div[not(descendant::table)"
                     ." and not(descendant::div)"
                     ." and not(descendant::code)"
                     ." and normalize-space(text())]"
                     ." | $nodePath/ancestor::font[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::span[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::li[not(descendant::table)"
                     ." and not(descendant::li)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::dt[not(descendant::table)]"
                     ." | $nodePath/ancestor::dd[not(descendant::table)]"
                     ." | $nodePath/ancestor::td[not(descendant::table)"
                     ." and not(descendant::div) "
                     ." and not(descendant::code)"
                     ." and normalize-space(text())]"
                     ." | $nodePath/ancestor::th[not(descendant::table)]"
                     ." | $nodePath/ancestor::b[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::i[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::u[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::tt[not(descendant::table)]"
                     ." | $nodePath/ancestor::blockquote[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::strike[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::em[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::strong[not(descendant::table)"
                     ." and not(descendant::code)]"
                     ." | $nodePath/ancestor::iframe[not(descendant::table)"
                     ." and not(descendant::div) and normalize-space(text())]"
                $ancestors = $this->xpath->query($ancestorQuery);
                if (!$ancestors) {
                    throw new \Exception(
                        "$ancestorQuery is an incorrect XPath query."
                } elseif (0 === $ancestors->length) {
                    // No qualifying ancestor: the text node is stored under
                    // its own node path.
                    //$blocks[$nodePath] = $e->C14N();
                    $blocks[$nodePath] = $this->dom->saveHTML($e);
                } else {
                    // Walk the matched ancestors from last to first and check
                    // whether any of them is already stored in $blocks.
                    $isExtractedBlock = false;
                    for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                        $blockNode = $ancestors->item($i);
                        //$blockOuterC14N = $blockNode->C14N();
                        $blockOuterC14N = $this->dom->saveHTML($blockNode);
                        $blockNodePath = $blockNode->getNodePath();
                        // NOTE(review): the preg_split() arguments are
                        // missing. Judging by the uses of index 0 and 1 it
                        // splits the outer HTML into the opening tag and the
                        // rest - confirm the pattern.
                        $headAndTheRest = preg_split(
                        $head = $headAndTheRest[0];
                        $tail = "</$blockNode->nodeName>";

                        $key = "$blockNodePath|$head|$tail";
                        if (array_key_exists($key, $blocks)) {
                            $isExtractedBlock = true;
                    // NOTE(review): the body of this if is missing;
                    // presumably it skips to the next text node - confirm.
                    if ($isExtractedBlock) {

                    // Store the last matched ancestor under the key
                    // "path|head|tail", with its content up to the last
                    // closing tag as the value.
                    $blockNode = $ancestors->item($ancestors->length - 1);
                    //$blockOuterC14N = $blockNode->C14N();
                    $blockOuterC14N = $this->dom->saveHTML($blockNode);
                    $blockNodePath = $blockNode->getNodePath();
                    // NOTE(review): preg_split() arguments missing here too.
                    $headAndTheRest = preg_split(
                    $head = $headAndTheRest[0];
                    $tail = "</$blockNode->nodeName>";
                    $end = strripos($headAndTheRest[1], $tail);
                    $blockInnerC14N = substr($headAndTheRest[1], 0, $end);
                    $blocks["$blockNodePath|$head|$tail"] = $blockInnerC14N;
        // Blocks were collected in $elementArray order; reverse them and drop
        // the '&#xD;' entity from each value.
        $blocks = array_reverse($blocks);
        foreach ($blocks as $path => $html) {
            $textArray[$path] = str_replace('&#xD;', '', $html);

        // NOTE(review): the first line(s) of this query string are missing;
        // the surviving part selects description/keywords names.
        $metae = $this->xpath->query(
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            // Skip content consisting only of whitespace and digits.
            if (1 != preg_match("/^[\s\d]+$/u", $meta->getAttribute("content"))) {
                // NOTE(review): assignment target missing; presumably a
                // $textArray entry keyed by the meta node path - confirm.
                    = $meta->getAttribute("content");

        // NOTE(review): the first line(s) of this query string are missing.
        $inputs = $this->xpath->query(
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("value"))) {
                // NOTE(review): assignment target missing - confirm the key.
                    = $input->getAttribute("value");
        // NOTE(review): assignment target missing; presumably
        // $inputsWithPlaceholder, which the following loop iterates.
            = $this->xpath->query("//input[string(@placeholder)]");
        foreach ($inputsWithPlaceholder as $input) {
            if (1 != preg_match("/^[\s\d]+$/u", $input->getAttribute("placeholder"))
            ) {
                // Placeholder texts are keyed by "<node path>/@placeholder".
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $input->getAttribute("placeholder");
        //        $attributes =
        //            $this->m_xpath->query(
        //                "//*["
        //                    ."string(@abbr) or string(@alt) or string(@label)"
        //                    ." or string(@title) or string(@standby)"
        //                    ." or string(@summary)"
        //                ."]");
        //        foreach ($attributes as $a)
        //            $textArray[$a->getNodePath()] = $a->C14N();
        $attributeName = 'alt';
        // NOTE(review): the query argument is missing; presumably it selects
        // attribute nodes named $attributeName - confirm.
        $attributes = $this->xpath->query(
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;

        return $textArray;

    /**
     * Get text array
     *
     * @return array
     */
    public function getTextArray()
    {
        // Returns only the extracted texts: the node-path keys of the map are
        // dropped and the values are re-indexed from zero.
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts paired with the line number of their node
     *
     * @return array
     *
     * @throws \Exception
     */
    public function getTextArrayWithLineNumber()
    {
        // Returns a list of [lineNumber, text] pairs, one per entry of the
        // node-path-to-text map and in the same order.
        $textWithLineNumberList = [];

        foreach ($this->getNodePathAndTextMap() as $nodePathWithHeadTail => $text) {
            // Block entries are keyed "path|head|tail"; only the part before
            // the first '|' is an XPath expression.
            $nodePath = explode('|', $nodePathWithHeadTail, 2)[0];

            /** @var \DOMNodeList|false $nodeList */
            $nodeList = $this->xpath->query($nodePath);
            if (!$nodeList || 0 === $nodeList->length) {
                // Without this guard item(0) is null and ->getLineNo() ends
                // in a fatal error instead of the documented \Exception.
                throw new \Exception("$nodePath does not resolve to a node.");
            }

            $textWithLineNumberList[] = [$nodeList->item(0)->getLineNo(), $text];
        }

        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array
     */
    public function getNodePathAndImageMap()
        // Collects the src value of every <img> that has a non-empty src.
        // NOTE(review): braces are missing from this listing, and so is the
        // assignment target inside the loop; presumably an $imageArray entry
        // keyed by the image's node path - confirm the exact key.
        $imageArray = array();

        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMNode $i */
        foreach ($images as $i) {
                = $i->attributes->getNamedItem('src')->nodeValue;

        return $imageArray;

    /**
     * Get image array
     *
     * @return array
     */
    public function getImageArray()
    {
        // Returns only the values of the node-path-to-image map, re-indexed
        // from zero.
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one-line string that can be used as a JSON variable
     *
     * @param string $html         The HTML
     * @param bool   $jsonFriendly Escape double quotes for use in JSON
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        // Collapses $html onto one line and, when $jsonFriendly is true,
        // escapes double quotes with a backslash.

        // Remove whitespace that sits between two tags.
        $html = preg_replace('~>\s+<~', '><', $html);
        // Remove line breaks plus leading/trailing whitespace of every line
        // (the "m" flag makes ^ and $ match per line).
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);

        if ($jsonFriendly) {
            // NOTE(review): only '"' is escaped; backslashes already present
            // in $html are left as they are.
            $html = str_replace('"', '\"', $html);
        }

        return $html;
    }

    /**
     * Encode the string as decimal HTML numeric character references
     *
     * @param String $str Text String
     *
     * @return string
     */
    public static function encode($str)
    {
        // Turns every character of the UTF-8 string $str into a decimal
        // numeric character reference, e.g. "A" becomes "&#65;".

        // Nothing to convert; also avoids handing an empty string to unpack().
        if ('' === (string) $str) {
            return '';
        }

        // Big-endian is requested explicitly so the byte order always matches
        // unpack()'s "N" format (unsigned 32-bit, big-endian).
        $utf32 = mb_convert_encoding($str, 'UTF-32BE', 'UTF-8');
        $codePoints = unpack('N*', $utf32);

        $references = array_map(
            function ($n) {
                return "&#$n;";
            },
            $codePoints
        );

        return implode('', $references);
    }

    /**
     * Normalize white space inside the text
     *
     * @param String $text raw text
     *
     * @return String $text
     *
     * @throws \Exception
     */
    public static function whiteSpaceNormalization($text)
        // Encodes $text to decimal character references, replaces white-space
        // references with a plain space, decodes the result and trims it.
        // NOTE(review): braces are missing from this listing, and both
        // replacement calls below have lost their pattern and subject
        // arguments; only the " " replacement survives. Restore them from the
        // original file before running.

        // encode the text in decimal format
        $text = self::encode($text);

        // replace uncommon white space with ordinary white space
        // NOTE(review): pattern and subject arguments missing.
        $text = preg_replace(
            " ",

        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // -
        // -
        // NOTE(review): the two reference links above were lost, and the
        // call wrapped by trim() is missing apart from its " " argument.
        $text = trim(
                " ",

        return $text;
// PHPUnit test case for HtmlContentExtractor.
// NOTE(review): the braces of this class and of setUp() are missing from this
// listing.
class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
    // Folder holding the HTML fixtures.
    // NOTE(review): no assignment to it is visible in this listing.
    protected $dataFolderPath;
    // Fixture paths, set in setUp().
    protected $oInnPage;
    protected $solarePage;

    /** @var \DOMDocument $phpDom */
    protected $phpDom;

    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;

    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    public function setUp()
        // NOTE(review): both paths append an empty string, so the fixture
        // file names appear to have been stripped - restore them.
        $this->oInnPage = $this->dataFolderPath .'';
        $this->solarePage = $this->dataFolderPath .'';

    public function testGetTextArrayWithLineNumber()
        // Expects the first [line, text] pair to be [108, 'For Smileage member'].
        // NOTE(review): braces are missing, and both constructor calls have
        // lost their argument and closing parenthesis. Given the skip message
        // below, presumably the first uses a PhpDom document and the second
        // an Html5Dom document - confirm.
        $this->extractor = new HtmlContentExtractor(
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);

        // Everything after this call is skipped at run time.
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');
        $this->extractor = new HtmlContentExtractor(
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);

    /**
     * Test get node path and text map
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
        // Builds the map for both fixtures with both DOM builders.
        // NOTE(review): braces are missing, the assignment targets of the
        // expected values are missing, some string continuations are cut
        // off, and no assertion survives in this listing.
        $expectedMetaKeywordsNodePath = "/html/head/meta[5]";
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
                ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();

        $expectedMetaKeywordsNodePath = "/html/head/meta[3]";
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
            = '掲載されている'
                .'<a href="'
                .' target="_blank">'
                . PHP_EOL //< Masterminds\HTML5 seems not using source's EOL.
                .'            ホテルお茶の水イン'
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();

    /**
     * Test get text array
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
        // Asserts that the extracted text list contains the expected strings
        // for both fixtures and both DOM builders.
        // NOTE(review): braces and assignment targets are missing;
        // presumably the targets are $expectedCommonCase and
        // $expectedInterpolatedCase, the names the assertions read - confirm.
        // Some string continuations are cut off as well.
            = '  * 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &amp;'
            .' RESORTS official website.<br>'."\n"
            .'  * 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);

            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
                .'<a href="/transportation/">お茶の水インまでの地図</a>'
            = '掲載されている'
                .'<a href="'
                .' target="_blank">'
                ."\n            ホテルお茶の水イン"
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
            $this->assertContains($expectedInterpolatedCase, $textArray);

    /**
     * Test get text array on ill-formed HTML
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
        // First checks that the raw fixture contains the ill-formed snippet,
        // then that the extractor returns the expected text for it.
        // NOTE(review): braces and assignment targets are missing;
        // presumably the targets are $illformedCase and $expectedCase, the
        // names read further down - confirm. Part of the href is cut off.
        $unpreparedHtml = file_get_contents($this->oInnPage);
            = '掲載されている'
            .'<a href="'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n            ホテルお茶の水イン</a>"
        // NOTE(review): assertNotEquals(false, ...) compares loosely, so a
        // match at offset 0 would count as "not found".
        $this->assertNotEquals(false, strpos($unpreparedHtml, $illformedCase));

            = '掲載されている'
                . '<a href="'
                . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
                . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
                . "\n            ホテルお茶の水イン</a>"
                . 'のクチコミはTripAdvisorより提供を受けています';
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCase, $textArray);

    /**
     * Test placeholder attribute extraction from the input tag
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        // The same fixture is parsed by both DOM builders; each resulting map
        // must hold the input's placeholder under "<node path>/@placeholder".
        $expectedPath = '/html/body/input/@placeholder';
        $expectedText = 'お名前';
        $file = $this->dataFolderPath .'inputPlaceholderTest.html';

        foreach ([PhpDom::make($file), Html5Dom::make($file)] as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Test alt attribute extraction from the image tag
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        // The same fixture is parsed by both DOM builders; each resulting map
        // must hold the image's alt text under the attribute's node path.
        $expectedPath = '/html/body/img/@alt';
        $expectedText = '画像です';
        $file = $this->dataFolderPath .'imageAltTest.html';

        foreach ([PhpDom::make($file), Html5Dom::make($file)] as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Test BOM removal
     *
     * @return void
     *
     * @ticket ZEN-2579
     */
    public function testGetNodePathAndTextMapOnBOM()
        // Asserts that the map has no entry for the third text node of the
        // first <div> in the body.
        // NOTE(review): braces are missing, and the $file assignment is cut
        // off after $this->dataFolderPath - the fixture file name is lost.
        $unexpectedPath = '/html/body/div[1]/text()[3]';
        $file = $this->dataFolderPath
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($unexpectedPath, $nodePathAndTextMap);

//    /**
//     * Test sorting the node-path-to-text map by the line numbers of the HTML
//     *
//     * @return void
//     */
//    public function testSortNodePathAndTextMapByLineNum()
//    {
//        $file = $this->dataFolderPath .'replaceNodeXpath.html';
//        $doms = [PhpDom::make($file), Html5Dom::make($file)];
//        foreach ($doms as $dom) {
//            $extractor = new HtmlContentExtractor($dom);
//            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
//            $this->assertEquals(
//                'HTML Test', array_values($nodePathAndTextMap)[0]
//            );
//            $this->assertEquals(
//                '選べるリージョンとゾーン',
//                array_values($nodePathAndTextMap)[4]
//            );
//        }
//    }

Clone this wiki locally