Home

Welcome to the metadata wiki!

Some old PHP code for HTML text extraction

HtmlContentExtractor

/**
 * Extracts translatable text fragments and image sources from a DOMDocument.
 *
 * Text is collected from text nodes (grouped by their nearest "block"
 * ancestor when one qualifies), from selected <meta> and <input> attributes
 * and from alt attributes. Results are keyed by XPath node paths.
 */
class HtmlContentExtractor
{
    /**
     * XPath predicates, keyed by tag name, that an ancestor element must
     * satisfy to be used as the extraction block of a text node.
     *
     * @var array
     */
    private static $blockAncestorPredicates = array(
        'p'          => 'not(descendant::table) and not(descendant::code)',
        'a'          => 'not(descendant::table) and not(descendant::code)',
        'div'        => 'not(descendant::table) and not(descendant::div)'
                        . ' and not(descendant::code)'
                        . ' and normalize-space(text())',
        'font'       => 'not(descendant::table) and not(descendant::code)',
        'span'       => 'not(descendant::table) and not(descendant::code)',
        'li'         => 'not(descendant::table) and not(descendant::li)'
                        . ' and not(descendant::code)',
        'dt'         => 'not(descendant::table)',
        'dd'         => 'not(descendant::table)',
        'td'         => 'not(descendant::table) and not(descendant::div)'
                        . ' and not(descendant::code)'
                        . ' and normalize-space(text())',
        'th'         => 'not(descendant::table)',
        'b'          => 'not(descendant::table) and not(descendant::code)',
        'i'          => 'not(descendant::table) and not(descendant::code)',
        'u'          => 'not(descendant::table) and not(descendant::code)',
        'tt'         => 'not(descendant::table)',
        'blockquote' => 'not(descendant::table) and not(descendant::code)',
        'strike'     => 'not(descendant::table) and not(descendant::code)',
        'em'         => 'not(descendant::table) and not(descendant::code)',
        'strong'     => 'not(descendant::table) and not(descendant::code)',
        'iframe'     => 'not(descendant::table) and not(descendant::div)'
                        . ' and normalize-space(text())',
    );

    /**
     * Decimal code points that the first pass of whiteSpaceNormalization()
     * collapses into an ordinary space.
     *
     * @var array
     */
    private static $whiteSpaceCodePoints = array(
        9, 10, 11, 12, 13, 32, 133, 160,
        5760, 6158,
        8192, 8193, 8194, 8195, 8196, 8197, 8198, 8199, 8200, 8201, 8202,
        8204, 8205, 8206, 8207,
        8232, 8233, 8239, 8287, 12288,
    );

    private $dom;
    protected $xpath;

    /**
     * Constructor
     *
     * Forces the document encoding to UTF-8, disables formatted output and
     * prepares the XPath evaluator used by every extraction method.
     *
     * Note: libxml_use_internal_errors(true) is a process-wide setting and is
     * intentionally left enabled, as in the original implementation.
     *
     * @param \DOMDocument $dom DOMDocument
     *
     * @throws \Exception presumably raised by the temporary error handler
     *                    (defined outside this file) while it is installed
     */
    public function __construct(\DOMDocument $dom)
    {
        set_error_handler('Yaraku\Html\ErrorHandlerFunction');
        libxml_use_internal_errors(true);

        // "finally" guarantees the previous error handler is restored on
        // every exit path, including \Error throwables that a
        // catch (\Exception) block would let through.
        try {
            $this->dom = $dom;
            $this->dom->encoding = 'UTF-8';
            $this->dom->formatOutput = false;
            $this->xpath = new \DOMXPath($this->dom);
        } finally {
            restore_error_handler();
        }
    }

    /**
     * Get the map of node path and text
     *
     * Keys are either a plain node path (text nodes without a qualifying
     * block ancestor, meta/input elements, attribute paths) or
     * "blockNodePath|openingTag|closingTag" for text extracted as a block.
     *
     * @return array
     * @throws \Exception
     */
    public function getNodePathAndTextMap()
    {
        $blocks = array();
        $elements = $this->xpath->query(
            "//*[name() != 'script' and name() != 'style'"
            ." and name() != 'code'"
            ." and not(@translate='no')]/text()"
        );
        // Helper defined outside this file; presumably orders the nodes by
        // depth in the tree.
        $elementArray = GetDepthSortedDomNodeArrayFromDomNodeList($elements);
        /** @var \DOMNode $e */
        foreach ($elementArray as $e) {
            if (!$this->hasTranslatableContent($e)) {
                continue;
            }

            $nodePath = $e->getNodePath();
            $ancestorQuery = self::buildBlockAncestorQuery($nodePath);
            $ancestors = $this->xpath->query($ancestorQuery);
            if (!$ancestors) {
                throw new \Exception(
                    "$ancestorQuery is an incorrect XPath query."
                );
            }
            if (0 === $ancestors->length) {
                // No block ancestor qualifies: keep the text node by itself.
                $blocks[$nodePath] = $this->dom->saveHTML($e);
                continue;
            }

            // Query results are in document order, so the last item is the
            // nearest qualifying ancestor. Walk from nearest to farthest and
            // skip this text node if any of those blocks is already stored.
            $nearestBlock = null;
            $alreadyExtracted = false;
            for ($i = $ancestors->length - 1; $i >= 0; $i--) {
                $parts = $this->splitBlockNode($ancestors->item($i));
                if (null === $nearestBlock) {
                    $nearestBlock = $parts;
                }
                if (array_key_exists($parts['key'], $blocks)) {
                    $alreadyExtracted = true;
                    break;
                }
            }
            if ($alreadyExtracted) {
                continue;
            }

            $blocks[$nearestBlock['key']] = $nearestBlock['inner'];
        }

        $textArray = array();
        foreach (array_reverse($blocks) as $path => $html) {
            $textArray[$path] = str_replace('&#xD;', '', $html);
        }

        $metae = $this->xpath->query(
            "/html/head/meta"
            ."[string(@content)"
            ." and ("
            ."@name='Description' or @name='description'"
            ." or @name='Keywords' or @name='keywords'"
            .")]"
        );
        /** @var \DOMElement $meta */
        foreach ($metae as $meta) {
            $content = $meta->getAttribute("content");
            if (self::isTranslatableText($content)) {
                $textArray[$meta->getNodePath()] = $content;
            }
        }

        $inputs = $this->xpath->query(
            "//input"
            ."[string(@value)"
            ." and ("
            ."@type='button' or @type='Button'"
            ." or @type='reset' or @type='Reset'"
            ." or @type='search' or @type='Search'"
            ." or @type='submit' or @type='Submit'"
            ." or @type='text' or @type='Text'"
            .")]"
        );
        /** @var \DOMElement $input */
        foreach ($inputs as $input) {
            $value = $input->getAttribute("value");
            if (self::isTranslatableText($value)) {
                $textArray[$input->getNodePath()] = $value;
            }
        }

        $inputsWithPlaceholder
            = $this->xpath->query("//input[string(@placeholder)]");
        /** @var \DOMElement $input */
        foreach ($inputsWithPlaceholder as $input) {
            $placeholder = $input->getAttribute("placeholder");
            if (self::isTranslatableText($placeholder)) {
                $textArray[$input->getNodePath() .'/@placeholder']
                    = $placeholder;
            }
        }

        $attributeName = 'alt';
        $attributes = $this->xpath->query(
            "//*[string(@$attributeName)]/@$attributeName"
        );
        /** @var \DOMNode $a */
        foreach ($attributes as $a) {
            $textArray[$a->getNodePath()] = $a->nodeValue;
        }

        return $textArray;
    }

    /**
     * Get text array
     *
     * @return array
     * @throws \Exception
     */
    public function getTextArray()
    {
        return array_values($this->getNodePathAndTextMap());
    }

    /**
     * Get the extracted texts paired with the line number of their node
     *
     * @return array list of [lineNumber, text] pairs
     * @throws \Exception when a node path cannot be resolved to a node
     */
    public function getTextArrayWithLineNumber()
    {
        $textWithLineNumberList = [];
        $nodePathAndTextMap = $this->getNodePathAndTextMap();
        foreach ($nodePathAndTextMap as $nodePathWithHeadTail => $text) {
            // Block keys look like "path|head|tail"; only the path is needed.
            $nodePath = explode('|', $nodePathWithHeadTail, 2)[0];
            $nodeList = $this->xpath->query($nodePath);
            if (!$nodeList || 0 === $nodeList->length) {
                throw new \Exception(
                    "No node found for the node path $nodePath."
                );
            }
            $lineNumber = $nodeList->item(0)->getLineNo();
            $textWithLineNumberList[] = [$lineNumber, $text];
        }

        return $textWithLineNumberList;
    }

    /**
     * Get the map of node path and image
     *
     * @return array src attribute values of <img> elements keyed by node path
     */
    public function getNodePathAndImageMap()
    {
        $imageArray = array();

        $images = $this->xpath->query("//img[string(@src)]");
        /** @var \DOMElement $image */
        foreach ($images as $image) {
            $imageArray[$image->getNodePath()] = $image->getAttribute('src');
        }

        return $imageArray;
    }

    /**
     * Get image array
     *
     * @return array
     */
    public function getImageArray()
    {
        return array_values($this->getNodePathAndImageMap());
    }

    /**
     * Convert HTML to a one line string that can be used as Json variable
     *
     * Whitespace between tags, leading/trailing whitespace of every line and
     * all line breaks are removed. With $jsonFriendly, backslashes and double
     * quotes are backslash-escaped; other control characters (for example
     * tabs inside a line) are not escaped.
     *
     * @param string $html         The html
     * @param bool   $jsonFriendly Prepare to use as Json variable
     *
     * @return string
     */
    public static function htmlToOneLineString($html, $jsonFriendly=true)
    {
        $html = preg_replace('~>\s+<~', '><', $html);
        $html = preg_replace('/^\s+|\n|\r|\s+$/um', '', $html);

        if ($jsonFriendly) {
            // Backslashes must be escaped first: otherwise an input such as
            // \" would become \\" and terminate the JSON string early.
            $html = str_replace(
                array('\\', '"'),
                array('\\\\', '\\"'),
                $html
            );
        }

        return $html;
    }

    /**
     * Encode the string into HTML Encoding format
     *
     * Every character of the UTF-8 input is turned into a decimal numeric
     * character reference, e.g. "A" becomes "&#65;".
     *
     * @param String $str Text String
     *
     * @return string
     */
    public static function encode($str)
    {
        $str = mb_convert_encoding($str, 'UTF-32', 'UTF-8');

        // One unsigned 32-bit big-endian integer per code point.
        $codePoints = unpack("N*", $str);

        $entities = array_map(
            function ($codePoint) {
                return "&#$codePoint;";
            },
            $codePoints
        );

        return implode("", $entities);
    }

    /**
     * Normalize white space inside the text
     *
     * Runs of white space (including the uncommon code points listed in
     * self::$whiteSpaceCodePoints and the entity spellings handled by the
     * second pass) are collapsed into one ordinary space and the result is
     * trimmed.
     *
     * @param String $text raw text
     *
     * @return String $text
     * @throws \Exception when preg_replace fails
     */
    public static function whiteSpaceNormalization($text)
    {
        // encode the text in decimal format
        $text = self::encode($text);

        // replace uncommon white space with ordinary white space; the
        // trailing ";" keeps e.g. "&#9;" from matching inside "&#97;"
        $text = preg_replace(
            '/(?:&#(?:' . implode('|', self::$whiteSpaceCodePoints) . ');)+/u',
            " ",
            $text
        );

        // if $text is null, there is something wrong with the preg_replace function
        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        // decode the text again into the normal string
        $text = html_entity_decode($text, ENT_QUOTES, 'UTF-8');

        // replace all common white space in named HTML entities with ordinary white
        // space
        // references:
        // - http://www.w3schools.com/tags/ref_symbols.asp
        // - http://www.w3schools.com/tags/ref_entities.asp
        $text = preg_replace(
            '/(\s|\&nbsp\;|\&\#xA0\;|\&uml\;|\&\#xA8\;|\&shy\;|\&\#xAD\;|'
            .'\&macr\;|\&\#xAF\;|\&acute\;|\&\#xB4\;|\&cedil\;|\&\#xB8\;|'
            .'\&ensp\;|\&\#x2002\;|\&emsp\;|\&\#x2003\;|\&thinsp\;|\&\#x2009\;|'
            .'\&zwnj\;|\&\#x200C\;|\&zwj\;|\&\#x200D\;|\&lrm\;|\&\#x200E\;|'
            .'\&rlm\;|\&\#x200F\;|\&\#xA\;|\&#xD\;|\x{FEFF})+/u',
            " ",
            $text
        );

        if (is_null($text)) {
            $errorCode = preg_last_error();
            throw new \Exception("preg_replace error code $errorCode.");
        }

        return trim($text);
    }

    /**
     * Build the XPath union query selecting every qualifying block ancestor
     * of the node at $nodePath.
     *
     * @param string $nodePath node path of a text node
     *
     * @return string
     */
    private static function buildBlockAncestorQuery($nodePath)
    {
        $parts = array();
        foreach (self::$blockAncestorPredicates as $tag => $predicate) {
            $parts[] = $nodePath . '/ancestor::' . $tag . '[' . $predicate . ']';
        }

        return implode(' | ', $parts);
    }

    /**
     * Tell whether a node still has content after white space, tags and
     * digits are removed from its serialized HTML.
     *
     * @param \DOMNode $node node to inspect
     *
     * @return bool
     * @throws \Exception
     */
    private function hasTranslatableContent(\DOMNode $node)
    {
        $stripped = self::whiteSpaceNormalization($this->dom->saveHTML($node));
        $stripped = preg_replace("/<[^>]+>/u", "", $stripped);
        $stripped = preg_replace("/[\s\d]+/u", "", $stripped);

        return $stripped !== '';
    }

    /**
     * Tell whether an attribute value contains something other than white
     * space and digits.
     *
     * @param string $value attribute value
     *
     * @return bool
     */
    private static function isTranslatableText($value)
    {
        return 1 !== preg_match("/^[\s\d]+$/u", $value);
    }

    /**
     * Split the serialized HTML of a block element into its map key
     * ("nodePath|openingTag|closingTag") and its inner HTML.
     *
     * @param \DOMNode $blockNode block element
     *
     * @return array with the entries 'key' and 'inner'
     */
    private function splitBlockNode(\DOMNode $blockNode)
    {
        $outerHtml = $this->dom->saveHTML($blockNode);
        $tagName = $blockNode->nodeName;
        $headAndTheRest = preg_split(
            '/^(<' . preg_quote($tagName, '/') . '[^>]*>)/iu',
            $outerHtml,
            2,
            PREG_SPLIT_NO_EMPTY|PREG_SPLIT_DELIM_CAPTURE
        );
        $head = $headAndTheRest[0];
        $rest = isset($headAndTheRest[1]) ? $headAndTheRest[1] : '';
        $tail = "</$tagName>";

        // Cut at the last closing tag; if none is present keep everything
        // after the opening tag instead of silently producing "".
        $end = strripos($rest, $tail);
        $inner = (false === $end) ? $rest : substr($rest, 0, $end);

        return array(
            'key'   => $blockNode->getNodePath() . "|$head|$tail",
            'inner' => $inner,
        );
    }
}

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Home

Clone this wiki locally