code linted for readability

adbar · Apr 23, 2020 · 67d65ae
1 parent 402842b
commit 67d65ae
Show file tree

Hide file tree

Showing 3 changed files with 69 additions and 61 deletions.
diff --git a/trafilatura/core.py b/trafilatura/core.py
@@ -25,7 +25,7 @@
 from .settings import (HTML_CLEANER, MIN_EXTRACTED_SIZE, MIN_EXTRACTED_COMM_SIZE,
                        MIN_OUTPUT_SIZE, MIN_OUTPUT_COMM_SIZE, TAG_CATALOG)
 from .utils import load_html, sanitize, trim, txttocsv
-from .xml import build_outputtree, validate_tei, xmltotxt
+from .xml import add_xml_meta, build_tei_output, build_xml_output, validate_tei, xmltotxt
 from .xpaths import BODY_XPATH, COMMENTS_XPATH
 
 
@@ -438,7 +438,7 @@ def extract_comments(tree):
     return comments_body, temp_comments, len_comments, tree
 
 
-def compare_extraction(tree, url, body, text, len_text, sure_thing):
+def compare_extraction(tree, url, body, text, len_text):
     '''Decide whether to choose own or external extraction
        based on a series of heuristics'''
     if tree is None:
@@ -568,14 +568,14 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False,
     if include_comments is True:
         commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree)
     else:
-        commentsbody, temp_comments, len_comments = etree.Element('body'), '', 0
+        commentsbody, temp_comments, len_comments = None, '', 0
 
     # extract content
     postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables)
 
     # compare if necessary
     if no_fallback is False: # and sure_thing is False:
-        postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text, sure_thing)
+        postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text)
         # try with justext
         if len_text < MIN_EXTRACTED_SIZE:
             LOGGER.error('not enough text %s %s', record_id, url)
@@ -603,26 +603,22 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False,
     if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
         return None
 
-    # cache elements
-    put_in_cache(postbody)
-    put_in_cache(commentsbody)
-    # del tree_cache[cleaned_tree]
-
-    # XML (TEI) steps
-    output = build_outputtree(record_id, postbody, commentsbody, docmeta, include_comments, tei_output, tei_validation)
-
     # check duplicates at body level
     if duplicate_test(postbody) is True:
         return None
 
-    if xml_output is False and tei_output is False:
-        if csv_output is False:
-            returnstring = xmltotxt(output)
-        else:
-            posttext = xmltotxt(postbody)
-            commentstext = xmltotxt(commentsbody)
-            returnstring = txttocsv(posttext, commentstext, docmeta)
-    else:
+    # cache elements
+    put_in_cache(postbody)
+    if commentsbody is not None:
+        put_in_cache(commentsbody)
+
+    # XML (TEI) steps
+    if xml_output is True or tei_output is True:
+        if xml_output is True:
+            output = build_xml_output(postbody, commentsbody)
+            output = add_xml_meta(output, docmeta)
+        elif tei_output is True:
+            output = build_tei_output(postbody, commentsbody, docmeta)
         # can be improved
         control_string = etree.tostring(output, encoding='unicode')
         control_string = sanitize(control_string)
@@ -634,6 +630,15 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False,
             result = validate_tei(output_tree)
             LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
         returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
+    # CSV + TXT output
+    else:
+        if csv_output is True:
+            posttext = xmltotxt(postbody)
+            commentstext = xmltotxt(commentsbody)
+            returnstring = txttocsv(posttext, commentstext, docmeta)
+        else:
+            output = build_xml_output(postbody, commentsbody)
+            returnstring = xmltotxt(output)
 
     return returnstring
 

diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py
@@ -309,7 +309,7 @@ def extract_metadata(filecontent, default_url=None):
     if mymeta.sitename:
         if mymeta.sitename.startswith('@'):
             # scrap Twitter ID
-            newsitename = re.sub(r'^@', '',  mymeta.sitename)
+            newsitename = re.sub(r'^@', '', mymeta.sitename)
             mymeta = mymeta._replace(sitename=newsitename)
         # capitalize
         if not '.' in mymeta.sitename and not mymeta.sitename[0].isupper():

diff --git a/trafilatura/xml.py b/trafilatura/xml.py
@@ -29,46 +29,49 @@
 # https://chase-seibert.github.io/blog/2011/05/20/stripping-control-characters-in-python.html
 
 
-def build_outputtree(record_id, postbody, commentsbody, docmeta, include_comments, tei_output, tei_validation):
-    '''Build XML output tree based on options and extracted information'''
-    # clear comments if necessary
-    if include_comments is False:
-        commentsbody = None
-    # TEI-XML
-    if tei_output is True:
-        # build TEI tree
-        output = write_teitree(postbody, commentsbody, docmeta)
-        # filter output (strip unwanted elements), just in case
-        # check and repair
-        output = check_tei(output, docmeta.url)
-    # XML
-    else:
-        output = etree.Element('doc')
-        postbody.tag = 'main'
-        output.append(postbody)
-        if commentsbody is not None:
-            commentsbody.tag = 'comments'
-            output.append(commentsbody)
-        # metadata
-        if docmeta:
-            if docmeta.sitename is not None:
-                output.set('sitename', docmeta.sitename)
-            if docmeta.title is not None:
-                output.set('title', docmeta.title)
-            if docmeta.author is not None:
-                output.set('author', docmeta.author)
-            if docmeta.date is not None:
-                output.set('date', docmeta.date)
-            if docmeta.url is not None:
-                output.set('source', docmeta.url)
-            if docmeta.description is not None:
-                output.set('excerpt', docmeta.description)
-            if docmeta.categories is not None and len(docmeta.categories) > 0:
-                cats = ';'.join(docmeta.categories)
-                output.set('categories', cats)
-            if docmeta.tags is not None and len(docmeta.tags) > 0:
-                tags = ';'.join(docmeta.tags)
-                output.set('tags', tags)
+def build_xml_output(postbody, commentsbody):
+    '''Build XML output tree based on extracted information'''
+    output = etree.Element('doc')
+    postbody.tag = 'main'
+    output.append(postbody)
+    if commentsbody is not None:
+        commentsbody.tag = 'comments'
+        output.append(commentsbody)
+    return output
+
+
+def add_xml_meta(output, docmeta):
+    '''Add extracted metadata to the XML output tree'''
+    # metadata
+    if docmeta:
+        if docmeta.sitename is not None:
+            output.set('sitename', docmeta.sitename)
+        if docmeta.title is not None:
+            output.set('title', docmeta.title)
+        if docmeta.author is not None:
+            output.set('author', docmeta.author)
+        if docmeta.date is not None:
+            output.set('date', docmeta.date)
+        if docmeta.url is not None:
+            output.set('source', docmeta.url)
+        if docmeta.description is not None:
+            output.set('excerpt', docmeta.description)
+        if docmeta.categories is not None and len(docmeta.categories) > 0:
+            cats = ';'.join(docmeta.categories)
+            output.set('categories', cats)
+        if docmeta.tags is not None and len(docmeta.tags) > 0:
+            tags = ';'.join(docmeta.tags)
+            output.set('tags', tags)
+    return output
+
+
+def build_tei_output(postbody, commentsbody, docmeta):
+    '''Build TEI-XML output tree based on extracted information'''
+    # build TEI tree
+    output = write_teitree(postbody, commentsbody, docmeta)
+    # filter output (strip unwanted elements), just in case
+    # check and repair
+    output = check_tei(output, docmeta.url)
     return output