Skip to content

Commit

Permalink
Merge branch 'master' into fix-lists-in-tables
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar authored Apr 11, 2024
2 parents f5b6045 + 54ad86c commit 8a65919
Show file tree
Hide file tree
Showing 19 changed files with 789 additions and 760 deletions.
3 changes: 2 additions & 1 deletion tests/comparison.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,8 +33,9 @@
from trafilatura import extract

try:
from trafilatura.core import baseline
from trafilatura import baseline
except ImportError:
print("Cannot import baseline, using simple version")
baseline = None
from evaldata import EVAL_PAGES

Expand Down
6 changes: 3 additions & 3 deletions tests/comparison_small.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,9 @@

from trafilatura import extract
try:
from trafilatura.core import baseline, html2txt
from trafilatura import baseline, html2txt
except ImportError:
print("Cannot import baseline, using simple version")
baseline = None
html2txt = None
#from trafilatura.htmlprocessing import prune_html
Expand Down Expand Up @@ -155,8 +156,7 @@ def run_baseline(htmlstring):
if baseline is not None:
_, result, _ = baseline(htmlstring)
return result
result = run_baseline_2(htmlstring)
return result
return run_baseline_2(htmlstring)


def run_trafilatura(htmlstring):
Expand Down
35 changes: 30 additions & 5 deletions tests/unit_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@
process_record, utils, xml)
from trafilatura.core import (Extractor, handle_formatting, handle_image,
handle_lists, handle_paragraphs, handle_quotes,
handle_table, handle_textelem, sanitize_tree,
trim)
from trafilatura.external import try_justext
handle_table, handle_textelem)
from trafilatura.external import sanitize_tree, try_justext
from trafilatura.filters import textfilter
from trafilatura.meta import reset_caches
from trafilatura.metadata import Document
from trafilatura.settings import DEFAULT_CONFIG, TAG_CATALOG, use_config
from trafilatura.utils import trim

logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)

Expand Down Expand Up @@ -116,6 +116,12 @@ def test_trim():

def test_input():
'''test if loaded strings/trees are handled properly'''
teststring = "高山云雾出好茶".encode("utf-8")
assert utils.detect_encoding(teststring) == ["utf-8"]
teststring = "高山云雾出好茶".encode("gb18030")
assert "gb18030" in utils.detect_encoding(teststring)
assert "gb18030" in utils.detect_encoding(teststring*1000)

assert utils.is_dubious_html("This is a string.") is True

htmlstring = "<!DOCTYPE html PUBLIC />\n<html></html>"
Expand Down Expand Up @@ -147,7 +153,8 @@ def test_input():
# old: with pytest.raises(TypeError) as err:
assert extract(None, 'url', '0000', target_language=None) is None
# legacy
assert process_record(None, 'url', '0000', target_language=None) is None
with pytest.raises(SystemExit):
assert process_record(None, 'url', '0000', target_language=None) is None
# GZip
with open(os.path.join(RESOURCES_DIR, 'webpage.html.gz'), 'rb') as gzfile:
myinput = gzfile.read()
Expand Down Expand Up @@ -293,21 +300,29 @@ def test_formatting():
my_document = html.fromstring('<html><body><article><h3>Title</h3><p><b>This here is in bold font.</b></p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == '### Title\n**This here is in bold font.**'

# space between paragraphs
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Paragraph 1</p><p>Paragraph 2</p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result.endswith('Paragraph 1\n\nParagraph 2')

# code sections
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura</code></p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == """### Title
Here is a code sample:
`import trafilatura`"""
my_document = html.fromstring('<html><body><article><h3>Title</h3><p>Here is a code sample:</p><code>import trafilatura\ntrafilatura.extract("")</code></p></article></body></html>')
my_result = extract(my_document, output_format='txt', include_formatting=True, config=ZERO_CONFIG)
assert my_result == """### Title
Here is a code sample:
```
import trafilatura
trafilatura.extract("")
```"""

# nested
my_document = html.fromstring('<html><body><p><b>This here is in bold and <i>italic</i> font.</b></p></body></html>')
my_result = extract(my_document, output_format='xml', include_formatting=True, config=ZERO_CONFIG)
Expand Down Expand Up @@ -1255,7 +1270,17 @@ def test_lang_detection():
assert detected == sample['expected'], f"Lang detection failed for {sample['expected']}"


def test_config_loading():
    """Check that the settings file is located and parsed correctly."""
    # a nonexistent path must raise instead of silently falling back to defaults;
    # no assignment here: use_config() raises before it could return anything
    with pytest.raises(FileNotFoundError):
        use_config(filename="/bogus-dir/bogus-file.txt")

    # a valid custom settings file must yield a usable config object
    config = use_config(filename=os.path.join(RESOURCES_DIR, "newsettings.cfg"))
    assert config is not None


if __name__ == '__main__':
test_config_loading()
test_trim()
test_input()
test_formatting()
Expand Down
3 changes: 2 additions & 1 deletion trafilatura/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@

import logging

from .core import bare_extraction, baseline, extract, html2txt, process_record
from .baseline import baseline, html2txt
from .core import bare_extraction, extract, process_record
from .downloads import fetch_response, fetch_url
from .metadata import extract_metadata
from .utils import load_html
Expand Down
101 changes: 101 additions & 0 deletions trafilatura/baseline.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
# pylint:disable-msg=E0611
import re

from lxml.etree import Element, SubElement

from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim


JSON_SEARCH = re.compile(r'"articlebody": *"(.+?)(?<!\\)"', re.I)



def basic_cleaning(tree):
    """Prune undesirable sections (matched by BASIC_CLEAN_XPATH) from the document.

    The tree is modified in place and returned for convenience.
    """
    unwanted_nodes = BASIC_CLEAN_XPATH(tree)
    for node in unwanted_nodes:
        node.getparent().remove(node)
    return tree


def baseline(filecontent):
    """Use baseline extraction function targeting text paragraphs and/or JSON metadata.
    Args:
        filecontent: HTML code as binary string or string.
    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.
    """
    postbody = Element('body')
    tree = load_html(filecontent)
    if tree is None:
        return postbody, '', 0

    # 1. try to recover the article body from embedded JSON-LD metadata
    for script in tree.iterfind('.//script[@type="application/ld+json"]'):
        if script.text and '"article' in script.text:
            candidate = JSON_SEARCH.search(script.text)
            if candidate:
                paragraph = SubElement(postbody, 'p')
                # unescape the JSON string before storing it
                paragraph.text = trim(candidate[1].replace('\\"', '"'))
                return postbody, paragraph.text, len(paragraph.text)

    tree = basic_cleaning(tree)

    # 2. scrape from an <article> tag if it carries enough text
    article_elem = tree.find('.//article')
    if article_elem is not None:
        article_text = trim(article_elem.text_content())
        if len(article_text) > 100:
            paragraph = SubElement(postbody, 'p')
            paragraph.text = article_text
            return postbody, article_text, len(article_text)

    # 3. collect text paragraphs, deduplicated in order of first appearance
    seen = set()
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        candidate_text = element.text_content()
        if candidate_text not in seen:
            SubElement(postbody, 'p').text = candidate_text
            seen.add(candidate_text)
    collected = trim('\n'.join(postbody.itertext()))
    if len(collected) > 100:
        return postbody, collected, len(collected)

    # 4. default strategy: take everything inside the cleaned <body>
    postbody = Element('body')
    body_elem = tree.find('.//body')
    if body_elem is not None:
        # elem.text = trim(body_elem.text_content())
        body_text = '\n'.join([trim(e) for e in body_elem.itertext()])
        if len(body_text) > 100:
            SubElement(postbody, 'p').text = body_text
            return postbody, body_text, len(body_text)

    # 5. new fallback: basic text conversion of the whole tree
    fallback_text = html2txt(tree)
    SubElement(postbody, 'p').text = fallback_text
    return postbody, fallback_text, len(fallback_text)
    # old: return postbody, '', 0


def html2txt(content):
    """Run basic html2txt on a document.
    Args:
        content: HTML document as string or LXML element.
    Returns:
        The extracted text in the form of a string or an empty string.
    """
    tree = load_html(content)
    if tree is not None:
        body = tree.find(".//body")
        if body is not None:
            # pruning happens in place, so `body` reflects the cleaned tree
            basic_cleaning(tree)
            # collapse all whitespace runs into single spaces
            return " ".join(body.text_content().split()).strip()
    return ""
7 changes: 4 additions & 3 deletions trafilatura/cli_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,8 @@

from trafilatura import spider

from .core import extract, html2txt
from .baseline import html2txt
from .core import extract
from .downloads import (add_to_compressed_dict, buffered_downloads,
load_download_buffer)
from .feeds import find_feed_urls
Expand All @@ -26,7 +27,7 @@
from .meta import reset_caches
from .settings import FILENAME_LEN, MAX_FILES_PER_DIRECTORY, use_config
from .sitemaps import sitemap_search
from .utils import URL_BLACKLIST_REGEX, make_chunks, uniquify_list
from .utils import URL_BLACKLIST_REGEX, make_chunks

LOGGER = logging.getLogger(__name__)

Expand Down Expand Up @@ -67,7 +68,7 @@ def load_input_urls(args):
LOGGER.warning('No input provided')

# uniq URLs while preserving order (important)
return uniquify_list(input_urls)
return list(dict.fromkeys(input_urls))


def load_blacklist(filename):
Expand Down
Loading

0 comments on commit 8a65919

Please sign in to comment.