Skip to content
姜 天戩 Mike Tian-Jian Jiang edited this page Jan 13, 2022 · 10 revisions

Welcome to the metadata wiki!

Some old PHP code for HTML text extraction

HtmlContentExtractor
/**
 * Extracts translatable text fragments and image sources from a DOMDocument.
 *
 * Text fragments are keyed by the XPath node path of the node they come from.
 * For block-level fragments the key is "nodePath|openingTag|closingTag" and the
 * value is the inner HTML of that block.
 */
class HtmlContentExtractor
{
    /** @var \DOMDocument Document the fragments are extracted from */
    private $dom;

    /** @var \DOMXPath XPath evaluator bound to $dom */
    protected $xpath;

    /**
     * Constructor
     *
     * Forces the document encoding to UTF-8, disables output formatting and
     * prepares the XPath evaluator. Note that libxml internal error handling
     * is switched on globally and is not switched back.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception Re-thrown after the previous error handler is restored
     */
    public function __construct(\DOMDocument $dom)
    {
        // NOTE(review): the handler is defined outside this file; presumably it
        // turns PHP warnings into exceptions, which is why a try/catch follows.
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);

        try {
            // The type hint already guarantees a DOMDocument instance, so no
            // additional validity check is needed here.
            $this->dom = $dom;
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } catch (\Exception $e) {
            restore_error_handler();
            throw $e;
        }

        restore_error_handler();
    }

    /**
     * Get the map of node path and text
     *
     * The map contains, in this order:
     *  - text blocks (reverse of the order in which they were collected),
     *  - description/keywords <meta> contents,
     *  - values of button/reset/search/submit/text <input> elements,
     *  - <input> placeholders (key suffixed with "/@placeholder"),
     *  - every non-empty "alt" attribute.
     *
     * @return array Map of node path (string) to extracted text/HTML (string)
     * @throws \Exception When an ancestor XPath query is rejected or white
     *                    space normalization fails
     */
    public function getNodePathAndTextMap()
    {
        $blocks = [];

        $textNodes = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        // NOTE(review): the helper is defined outside this file; judging by its
        // name it returns the nodes as an array ordered by depth - confirm.
        $sortedTextNodes = GetDepthSortedDomNodeArrayFromDomNodeList($textNodes);

        /** @var \DOMNode $textNode */
        foreach ($sortedTextNodes as $textNode) {
            $textHtml = $this->dom->saveHTML($textNode);
            if (!self::hasTranslatableContent($textHtml)) {
                continue;
            }

            $nodePath = $textNode->getNodePath();
            $ancestorQuery = self::buildBlockAncestorQuery($nodePath);
            $ancestors = $this->xpath->query($ancestorQuery);
            if (!$ancestors) {
                throw new \Exception(
                    "$ancestorQuery is an incorrect XPath query."
                );
            }

            if (0 === $ancestors->length) {
                // No enclosing block: the text node is a fragment on its own.
                $blocks[$nodePath] = $textHtml;
                continue;
            }

            // Skip the text node when any enclosing block was already stored;
            // its text is part of that block's inner HTML.
            $isExtractedBlock = false;
            for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                list($candidateKey) = $this->describeBlock($ancestors->item($i));
                if (array_key_exists($candidateKey, $blocks)) {
                    $isExtractedBlock = true;
                    break;
                }
            }
            if ($isExtractedBlock) {
                continue;
            }

            // Store the last ancestor of the result list as the block.
            list($key, $rest, $tail) = $this->describeBlock(
                $ancestors->item($ancestors->length - 1)
            );
            $end = strripos($rest, $tail);
            $blocks[$key] = (false === $end) ? '' : substr($rest, 0, $end);
        }

        $textArray = [];
        foreach (array_reverse($blocks) as $path => $html) {
            $textArray[$path] = str_replace('&#xD;', '', $html);
        }

        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            $content = $meta->getAttribute("content");
            if (self::hasTextualValue($content)) {
                $textArray[$meta->getNodePath()] = $content;
            }
        }

        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            $value = $input->getAttribute("value");
            if (self::hasTextualValue($value)) {
                $textArray[$input->getNodePath()] = $value;
            }
        }

        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        /** @var \DOMElement $input */
        foreach ($inputsWithPlaceholder as $input) {
            $placeholder = $input->getAttribute("placeholder");
            if (self::hasTextualValue($placeholder)) {
                $textArray[$input->getNodePath() .'/@placeholder'] = $placeholder;
            }
        }

        // Only "alt" is extracted. A broader attribute list (abbr, label,
        // title, standby, summary) existed as disabled code and is not active.
        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Get text array
     *
     * @return array Values of getNodePathAndTextMap(), re-indexed from 0
     * @throws \Exception See getNodePathAndTextMap()
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts paired with the line number of their node
     *
     * @return array List of [lineNumber, text] pairs, in map order
     * @throws \Exception When a node path of the map cannot be resolved back
     *                    to a node, or see getNodePathAndTextMap()
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        foreach ($this->getNodePathAndTextMap() as $mapKey => $text) {
            // Block keys look like "nodePath|head|tail"; the head may itself
            // contain "|", so only split off the first segment.
            $keyParts = explode('|', $mapKey, 2);
            $nodePath = $keyParts[0];

            $nodeList = $this->xpath->query($nodePath);
            $node = $nodeList ? $nodeList->item(0) : null;
            if (null === $node) {
                throw new \Exception(
                    "Node path $nodePath does not resolve to a node."
                );
            }
            $textWithLineNumberList[] = [$node->getLineNo(), $text];
        }

        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array Map of <img> node path to its non-empty "src" value
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = [];

        /** @var \DOMElement $image */
        foreach ($this->xpath->query("//img[string(@src)]") as $image) {
            $imageArray[$image->getNodePath()] = $image->getAttribute('src');
        }

        return $imageArray;
    }

    /**
     * Get image array
     *
     * @return array Values of getNodePathAndImageMap(), re-indexed from 0
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * Removes white space between tags, leading/trailing white space of every
     * line and all line breaks; optionally escapes double quotes.
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);

        if ($jsonFriendly) {
            // NOTE(review): only double quotes are escaped; backslashes in the
            // input are passed through unchanged. Left as is because callers
            // may depend on the current output.
            $html = str_replace('"', '\"', $html);
        }

        return $html;
    }

    /**
     * Encode the string into HTML Encoding format
     *
     * Every character, ASCII included, becomes a decimal numeric character
     * reference, e.g. "A" becomes "&#65;".
     *
     * @param String $str Text String (UTF-8)
     *
     * @return string
     */
    public static function encode($str)
    {
        if (null === $str || '' === $str) {
            return '';
        }

        // Big-endian is requested explicitly because unpack('N*') reads
        // big-endian 32 bit integers.
        $utf32 = mb_convert_encoding($str, 'UTF-32BE', 'UTF-8');

        $entities = array_map(
            function ($codePoint) {
                return "&#$codePoint;";
            },
            unpack('N*', $utf32)
        );

        return implode('', $entities);
    }

    /**
     * Normalize white space inside the text
     *
     * Runs of white space (including the listed Unicode spaces, zero width
     * joiners/marks and a set of named/hex entities written literally in the
     * text) are collapsed into one ordinary space; the result is trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception When preg_replace reports an error
     */
    public static function whiteSpaceNormalization($text)
    {
        // encode the text in decimal format
        $text = self::encode($text);

        // replace uncommon white space with ordinary white space; every code
        // point is its own alternative so each one also matches on its own
        $whiteSpaceCodePoints = [
            9, 10, 11, 12, 13, 32, 133, 160,
            5760, 6158,
            8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202,
            8204, 8205, 8206, 8207,
            8232, 8233, 8239, 8287, 12288,
        ];
        $text = preg_replace(
            '/(?:&#(?:' . implode('|', $whiteSpaceCodePoints) . ');)+/u',
            " ",
            $text
        );

        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        $text = preg_replace(
            '/(\s|\&nbsp\;|\&\#xA0\;|\&uml\;|\&\#xA8\;|\&shy\;|\&\#xAD\;|'
            .'\&macr\;|\&\#xAF\;|\&acute\;|\&\#xB4\;|\&cedil\;|\&\#xB8\;|'
            .'\&ensp\;|\&\#x2002\;|\&emsp\;|\&\#x2003\;|\&thinsp\;|\&\#x2009\;|'
            .'\&zwnj\;|\&\#x200C\;|\&zwj\;|\&\#x200D\;|\&lrm\;|\&\#x200E\;|'
            .'\&rlm\;|\&\#x200F\;|\&\#xA\;|\&#xD\;|\x{FEFF})+/u',
            " ",
            $text
        );
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        return trim($text);
    }

    /**
     * Tell whether serialized text has anything besides tags, spaces and digits
     *
     * @param string $html Serialized HTML of a text node
     *
     * @return bool
     * @throws \Exception When white space normalization fails
     */
    private static function hasTranslatableContent($html)
    {
        $probe = self::whiteSpaceNormalization($html);
        $probe = preg_replace("/<[^>]+>/u", "", $probe);
        $probe = preg_replace("/[\s\d]+/u", "", $probe);

        return $probe !== '';
    }

    /**
     * Tell whether an attribute value is more than white space and digits
     *
     * @param string $value Attribute value
     *
     * @return bool
     */
    private static function hasTextualValue($value)
    {
        return 1 !== preg_match("/^[\s\d]+$/u", $value);
    }

    /**
     * Build the XPath union selecting the block ancestors of a text node
     *
     * @param string $nodePath Node path of the text node
     *
     * @return string XPath expression
     */
    private static function buildBlockAncestorQuery($nodePath)
    {
        $noTable = 'not(descendant::table)';
        $noCode = 'not(descendant::code)';
        $noDiv = 'not(descendant::div)';
        $noLi = 'not(descendant::li)';
        // XPath 1.0 converts text() to the string of its first node, so this
        // holds when the element's first text child is not blank.
        $hasOwnText = 'normalize-space(text())';

        $predicatesByTag = [
            'p' => [$noTable, $noCode],
            'a' => [$noTable, $noCode],
            'div' => [$noTable, $noDiv, $noCode, $hasOwnText],
            'font' => [$noTable, $noCode],
            'span' => [$noTable, $noCode],
            'li' => [$noTable, $noLi, $noCode],
            'dt' => [$noTable],
            'dd' => [$noTable],
            'td' => [$noTable, $noDiv, $noCode, $hasOwnText],
            'th' => [$noTable],
            'b' => [$noTable, $noCode],
            'i' => [$noTable, $noCode],
            'u' => [$noTable, $noCode],
            'tt' => [$noTable],
            'blockquote' => [$noTable, $noCode],
            'strike' => [$noTable, $noCode],
            'em' => [$noTable, $noCode],
            'strong' => [$noTable, $noCode],
            'iframe' => [$noTable, $noDiv, $hasOwnText],
        ];

        $steps = [];
        foreach ($predicatesByTag as $tag => $predicates) {
            $steps[] = $nodePath . '/ancestor::' . $tag
                . '[' . implode(' and ', $predicates) . ']';
        }

        return implode(' | ', $steps);
    }

    /**
     * Split a block element into its map key and the HTML after its opening tag
     *
     * @param \DOMNode $blockNode Block element
     *
     * @return array [key, rest, tail]: key is "nodePath|openingTag|closingTag",
     *               rest is the serialized HTML following the opening tag and
     *               tail is the closing tag
     */
    private function describeBlock(\DOMNode $blockNode)
    {
        $outerHtml = $this->dom->saveHTML($blockNode);
        $parts = preg_split(
            '/^(<' . preg_quote($blockNode->nodeName, '/') . '[^>]*>)/iu',
            $outerHtml,
            2,
            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
        );
        // Be explicit about a split that produced fewer than two parts.
        $head = isset($parts[0]) ? $parts[0] : '';
        $rest = isset($parts[1]) ? $parts[1] : '';
        $tail = "</$blockNode->nodeName>";

        return [$blockNode->getNodePath() . "|$head|$tail", $rest, $tail];
    }
}
HtmlContentExtractorTest
/**
 * Tests HtmlContentExtractor against saved web pages, parsed both with PhpDom
 * and with Html5Dom.
 */
class HtmlContentExtractorTest extends \PHPUnit_Framework_TestCase
{
    /** @var string Folder holding the HTML fixtures, with trailing separator */
    protected $dataFolderPath;

    /** @var string Path of the www.o-inn.co.jp fixture */
    protected $oInnPage;

    /** @var string Path of the www.solarehotels.com fixture */
    protected $solarePage;

    /** @var \DOMDocument $phpDom */
    protected $phpDom;

    /** @var \DOMDocument $html5Dom */
    protected $html5Dom;

    /** @var HtmlContentExtractor $extractor */
    protected $extractor;

    /**
     * Resolve the fixture paths
     *
     * @return void
     */
    public function setUp()
    {
        parent::setUp();
        $this->dataFolderPath
            = __DIR__ . DIRECTORY_SEPARATOR .'_data'. DIRECTORY_SEPARATOR;
        $this->oInnPage = $this->dataFolderPath .'www.o-inn.co.jp_index.html';
        $this->solarePage = $this->dataFolderPath .'www.solarehotels.com.html';
    }

    /**
     * Test get text array with line number on a PhpDom document
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumber()
    {
        $this->extractor = new HtmlContentExtractor(
            PhpDom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test get text array with line number on an Html5Dom document
     *
     * Kept apart from the PhpDom case so that skipping it does not report the
     * PhpDom assertion as skipped as well.
     *
     * @return void
     */
    public function testGetTextArrayWithLineNumberOnHtml5Dom()
    {
        $this->markTestSkipped('Masterminds\HTML5 has no support of it.');

        // Not executed while the test is skipped; documents the expectation.
        $this->extractor = new HtmlContentExtractor(
            Html5Dom::make($this->solarePage)
        );
        $textWithLineNoList = $this->extractor->getTextArrayWithLineNumber();
        $firstPair = array_values($textWithLineNoList)[0];
        $this->assertEquals([108, 'For Smileage member'], $firstPair);
    }

    /**
     * Test get node path and text map
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetNodePathAndTextMap()
    {
        $expectedMetaKeywordsNodePath = "/html/head/meta[5]";
        $expectedMetaKeywordsText
            = "Best Price Guarantee,Bottom Price,Lowest Price,Hotel,Stay,"
                ."Reservation,Booking,SOLARE HOTELS & RESORTS";
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
        }

        $expectedMetaKeywordsNodePath = "/html/head/meta[3]";
        $expectedMetaKeywordsText
            = "お茶の水イン,御茶ノ水,お茶の水,後楽園,"
                ."ビジネスホテル,文京区,東京ドーム,出張,宿泊予約";
        $expectedInterpolatedCaseNodePath
            = "/html/body/div[4]/div/div[2]/div[6]/p|<p>|</p>";
        // "\n" instead of PHP_EOL: the other tests of this class expect "\n"
        // for this same fragment, and PHP_EOL would make the expectation
        // depend on the platform running the tests.
        $expectedInterpolatedCaseText
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                    .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                    .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                ."\n"
                .'            ホテルお茶の水イン'
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $this->extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey(
                $expectedMetaKeywordsNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedMetaKeywordsText,
                $nodePathAndTextMap[$expectedMetaKeywordsNodePath]
            );
            $this->assertArrayHasKey(
                $expectedInterpolatedCaseNodePath,
                $nodePathAndTextMap
            );
            $this->assertEquals(
                $expectedInterpolatedCaseText,
                $nodePathAndTextMap[$expectedInterpolatedCaseNodePath]
            );
        }
    }

    /**
     * Test get text array
     *
     * @return void
     *
     * @ticket #108
     * @ticket #109
     * @ticket #136
     */
    public function testGetTextArray()
    {
        $expectedCommonCase
            = '  * 1...Only applicable to rates compared on the same date'
            .' as the date of reservation made via the SORALRE HOTELS &amp;'
            .' RESORTS official website.<br>'."\n"
            .'  * 2...Limited to claims submitted via email within 24 hours'
            .' of booking.';
        $doms = [
            PhpDom::make($this->solarePage), Html5Dom::make($this->solarePage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
        }

        $expectedCommonCase
            = '<strong>お茶の水イン</strong><br>〒113-0034<br>'
                .'東京都文京区湯島1-3-7<br>TEL:03-3813-8211<br>'
                .'FAX:03-3813-9730<br>'
                .'<a href="/transportation/">お茶の水インまでの地図</a>'
            ;
        $expectedInterpolatedCase
            = '掲載されている'
                .'<a href="http://www.tripadvisor.jp/'
                .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn'
                .'-Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html"'
                .' target="_blank">'
                ."\n            ホテルお茶の水イン"
                .'</a>'
                .'のクチコミはTripAdvisorより提供を受けています'
            ;
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCommonCase, $textArray);
            $this->assertContains($expectedInterpolatedCase, $textArray);
        }
    }

    /**
     * Test get text array on ill formed html
     *
     * The fixture contains a paragraph with a surplus closing </a>; the
     * extracted fragment is expected without it.
     *
     * @return void
     *
     * @ticket #109
     */
    public function testGetTextArrayOnIllFormedHtml()
    {
        $unpreparedHtml = file_get_contents($this->oInnPage);
        $illformedCase
            = '掲載されている'
            .'<a href="http://www.tripadvisor.jp/'
            .'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
            .'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
            ."\n            ホテルお茶の水イン</a>"
            .'のクチコミはTripAdvisorより提供を受けています'
            .'</a>'
        ;
        // Strict comparison: strpos() returns the int offset on a hit and only
        // the boolean false on a miss.
        $this->assertNotSame(false, strpos($unpreparedHtml, $illformedCase));

        $expectedCase
            = '掲載されている'
                . '<a href="http://www.tripadvisor.jp/'
                . 'Hotel_Review-g1066442-d1082434-Reviews-Ochanomizu_Inn-'
                . 'Bunkyo_Tokyo_Tokyo_Prefecture_Kanto.html" target="_blank">'
                . "\n            ホテルお茶の水イン</a>"
                . 'のクチコミはTripAdvisorより提供を受けています';
        $doms = [
            PhpDom::make($this->oInnPage), Html5Dom::make($this->oInnPage)
        ];
        foreach ($doms as $dom) {
            $this->extractor = new HtmlContentExtractor($dom);
            $textArray = $this->extractor->getTextArray();
            $this->assertContains($expectedCase, $textArray);
        }
    }

    /**
     * Testing for placeholder attribute extraction from input tag
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnPlaceholderAttributeOfInputTag()
    {
        $expectedPath = '/html/body/input/@placeholder';
        $expectedText = 'お名前';
        $file = $this->dataFolderPath .'inputPlaceholderTest.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Testing for alt attribute extraction from image tag
     *
     * @return void
     *
     * @ticket LOC-2162
     */
    public function testGetNodePathAndTextMapOnAltAttributeOfImageTag()
    {
        $expectedPath = '/html/body/img/@alt';
        $expectedText = '画像です';
        $file = $this->dataFolderPath .'imageAltTest.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayHasKey($expectedPath, $nodePathAndTextMap);
            $this->assertEquals($expectedText, $nodePathAndTextMap[$expectedPath]);
        }
    }

    /**
     * Test BOM removal
     *
     * @ticket ZEN-2579
     *
     * @return void
     */
    public function testGetNodePathAndTextMapOnBOM()
    {
        $unexpectedPath = '/html/body/div[1]/text()[3]';
        $file = $this->dataFolderPath
            .'www.yokohamabay-sheraton.co.jp__other_facilities.html';
        $doms = [PhpDom::make($file), Html5Dom::make($file)];
        foreach ($doms as $dom) {
            $extractor = new HtmlContentExtractor($dom);
            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
            $this->assertArrayNotHasKey($unexpectedPath, $nodePathAndTextMap);
        }
    }

//    /**
//     * Test sorting the node-path-to-text map by the line numbers of the HTML
//     *
//     * @return void
//     */
//    public function testSortNodePathAndTextMapByLineNum()
//    {
//        $file = $this->dataFolderPath .'replaceNodeXpath.html';
//        $doms = [PhpDom::make($file), Html5Dom::make($file)];
//        foreach ($doms as $dom) {
//            $extractor = new HtmlContentExtractor($dom);
//            $nodePathAndTextMap = $extractor->getNodePathAndTextMap();
//            $this->assertEquals(
//                'HTML Test', array_values($nodePathAndTextMap)[0]
//            );
//            $this->assertEquals(
//                '選べるリージョンとゾーン',
//                array_values($nodePathAndTextMap)[4]
//            );
//        }
//    }

}
Clone this wiki locally