Skip to content

Commit

Permalink
code linted for readability
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Apr 23, 2020
1 parent 402842b commit 67d65ae
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 61 deletions.
45 changes: 25 additions & 20 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from .settings import (HTML_CLEANER, MIN_EXTRACTED_SIZE, MIN_EXTRACTED_COMM_SIZE,
MIN_OUTPUT_SIZE, MIN_OUTPUT_COMM_SIZE, TAG_CATALOG)
from .utils import load_html, sanitize, trim, txttocsv
from .xml import build_outputtree, validate_tei, xmltotxt
from .xml import add_xml_meta, build_tei_output, build_xml_output, validate_tei, xmltotxt
from .xpaths import BODY_XPATH, COMMENTS_XPATH


Expand Down Expand Up @@ -438,7 +438,7 @@ def extract_comments(tree):
return comments_body, temp_comments, len_comments, tree


def compare_extraction(tree, url, body, text, len_text, sure_thing):
def compare_extraction(tree, url, body, text, len_text):
'''Decide whether to choose own or external extraction
based on a series of heuristics'''
if tree is None:
Expand Down Expand Up @@ -568,14 +568,14 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False,
if include_comments is True:
commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree)
else:
commentsbody, temp_comments, len_comments = etree.Element('body'), '', 0
commentsbody, temp_comments, len_comments = None, '', 0

# extract content
postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables)

# compare if necessary
if no_fallback is False: # and sure_thing is False:
postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text, sure_thing)
postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text)
# try with justext
if len_text < MIN_EXTRACTED_SIZE:
LOGGER.error('not enough text %s %s', record_id, url)
Expand Down Expand Up @@ -603,26 +603,22 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False,
if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
return None

# cache elements
put_in_cache(postbody)
put_in_cache(commentsbody)
# del tree_cache[cleaned_tree]

# XML (TEI) steps
output = build_outputtree(record_id, postbody, commentsbody, docmeta, include_comments, tei_output, tei_validation)

# check duplicates at body level
if duplicate_test(postbody) is True:
return None

if xml_output is False and tei_output is False:
if csv_output is False:
returnstring = xmltotxt(output)
else:
posttext = xmltotxt(postbody)
commentstext = xmltotxt(commentsbody)
returnstring = txttocsv(posttext, commentstext, docmeta)
else:
# cache elements
put_in_cache(postbody)
if commentsbody is not None:
put_in_cache(commentsbody)

# XML (TEI) steps
if xml_output is True or tei_output is True:
if xml_output is True:
output = build_xml_output(postbody, commentsbody)
output = add_xml_meta(output, docmeta)
elif tei_output is True:
output = build_tei_output(postbody, commentsbody, docmeta)
# can be improved
control_string = etree.tostring(output, encoding='unicode')
control_string = sanitize(control_string)
Expand All @@ -634,6 +630,15 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False,
result = validate_tei(output_tree)
LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
# CSV + TXT output
else:
if csv_output is True:
posttext = xmltotxt(postbody)
commentstext = xmltotxt(commentsbody)
returnstring = txttocsv(posttext, commentstext, docmeta)
else:
output = build_xml_output(postbody, commentsbody)
returnstring = xmltotxt(output)

return returnstring

Expand Down
2 changes: 1 addition & 1 deletion trafilatura/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -309,7 +309,7 @@ def extract_metadata(filecontent, default_url=None):
if mymeta.sitename:
if mymeta.sitename.startswith('@'):
# scrap Twitter ID
newsitename = re.sub(r'^@', '', mymeta.sitename)
newsitename = re.sub(r'^@', '', mymeta.sitename)
mymeta = mymeta._replace(sitename=newsitename)
# capitalize
if not '.' in mymeta.sitename and not mymeta.sitename[0].isupper():
Expand Down
83 changes: 43 additions & 40 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,46 +29,49 @@
# https://chase-seibert.github.io/blog/2011/05/20/stripping-control-characters-in-python.html


def build_outputtree(record_id, postbody, commentsbody, docmeta, include_comments, tei_output, tei_validation):
'''Build XML output tree based on options and extracted information'''
# clear comments if necessary
if include_comments is False:
commentsbody = None
# TEI-XML
if tei_output is True:
# build TEI tree
output = write_teitree(postbody, commentsbody, docmeta)
# filter output (strip unwanted elements), just in case
# check and repair
output = check_tei(output, docmeta.url)
# XML
else:
output = etree.Element('doc')
postbody.tag = 'main'
output.append(postbody)
if commentsbody is not None:
commentsbody.tag = 'comments'
output.append(commentsbody)
# metadata
if docmeta:
if docmeta.sitename is not None:
output.set('sitename', docmeta.sitename)
if docmeta.title is not None:
output.set('title', docmeta.title)
if docmeta.author is not None:
output.set('author', docmeta.author)
if docmeta.date is not None:
output.set('date', docmeta.date)
if docmeta.url is not None:
output.set('source', docmeta.url)
if docmeta.description is not None:
output.set('excerpt', docmeta.description)
if docmeta.categories is not None and len(docmeta.categories) > 0:
cats = ';'.join(docmeta.categories)
output.set('categories', cats)
if docmeta.tags is not None and len(docmeta.tags) > 0:
tags = ';'.join(docmeta.tags)
output.set('tags', tags)
def build_xml_output(postbody, commentsbody):
    '''Build XML output tree based on extracted information.

    A new <doc> root is created; the main text element is renamed
    to <main> and attached to it, optionally followed by the comments
    element renamed to <comments>.

    Note that both input elements are modified in place (their tag
    is overwritten) before being appended to the new root.

    Args:
        postbody: element holding the extracted main text.
        commentsbody: element holding the extracted comments,
            or None if there are no comments to include.

    Returns:
        The newly created <doc> element.
    '''
    output = etree.Element('doc')
    # main text always comes first
    postbody.tag = 'main'
    output.append(postbody)
    # comments are optional
    if commentsbody is not None:
        commentsbody.tag = 'comments'
        output.append(commentsbody)
    return output


def add_xml_meta(output, docmeta):
    '''Add extracted metadata to the XML output tree.

    Each available metadata field is written as an attribute of the
    given element. Fields set to None are skipped, and so are empty
    category or tag collections.

    Args:
        output: element receiving the attributes (modified in place).
        docmeta: metadata object exposing the attributes sitename,
            title, author, date, url, description, categories and
            tags; if it is None (or otherwise falsy) nothing is added.

    Returns:
        The same element that was passed in.
    '''
    # nothing to add if no metadata is available
    if not docmeta:
        return output
    # single-valued fields: (metadata field, XML attribute name)
    single_valued = (
        ('sitename', 'sitename'),
        ('title', 'title'),
        ('author', 'author'),
        ('date', 'date'),
        ('url', 'source'),
        ('description', 'excerpt'),
    )
    for field, attribute in single_valued:
        value = getattr(docmeta, field)
        if value is not None:
            output.set(attribute, value)
    # multi-valued fields are flattened into one ';'-separated attribute
    for field in ('categories', 'tags'):
        values = getattr(docmeta, field)
        if values:
            output.set(field, ';'.join(values))
    return output


def build_tei_output(postbody, commentsbody, docmeta):
    '''Build TEI-XML output tree based on extracted information.

    Args:
        postbody: element holding the extracted main text.
        commentsbody: element holding the extracted comments
            (presumably None when comments are excluded; confirm
            against write_teitree).
        docmeta: extracted metadata; its url attribute is read
            unconditionally, so it must not be None here.

    Returns:
        The tree produced by write_teitree after it has been passed
        through check_tei.
    '''
    # build TEI tree
    output = write_teitree(postbody, commentsbody, docmeta)
    # filter output (strip unwanted elements), just in case
    # check and repair
    output = check_tei(output, docmeta.url)
    return output


Expand Down

0 comments on commit 67d65ae

Please sign in to comment.