From 67d65aef5401d1dfaf990195ec31a35247fb546e Mon Sep 17 00:00:00 2001 From: Adrien Barbaresi Date: Thu, 23 Apr 2020 13:18:36 +0200 Subject: [PATCH] code linted for readability --- trafilatura/core.py | 45 ++++++++++++---------- trafilatura/metadata.py | 2 +- trafilatura/xml.py | 83 +++++++++++++++++++++-------------------- 3 files changed, 69 insertions(+), 61 deletions(-) diff --git a/trafilatura/core.py b/trafilatura/core.py index a75264b8..c214e7e7 100644 --- a/trafilatura/core.py +++ b/trafilatura/core.py @@ -25,7 +25,7 @@ from .settings import (HTML_CLEANER, MIN_EXTRACTED_SIZE, MIN_EXTRACTED_COMM_SIZE, MIN_OUTPUT_SIZE, MIN_OUTPUT_COMM_SIZE, TAG_CATALOG) from .utils import load_html, sanitize, trim, txttocsv -from .xml import build_outputtree, validate_tei, xmltotxt +from .xml import add_xml_meta, build_tei_output, build_xml_output, validate_tei, xmltotxt from .xpaths import BODY_XPATH, COMMENTS_XPATH @@ -438,7 +438,7 @@ def extract_comments(tree): return comments_body, temp_comments, len_comments, tree -def compare_extraction(tree, url, body, text, len_text, sure_thing): +def compare_extraction(tree, url, body, text, len_text): '''Decide whether to choose own or external extraction based on a series of heuristics''' if tree is None: @@ -568,14 +568,14 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False, if include_comments is True: commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree) else: - commentsbody, temp_comments, len_comments = etree.Element('body'), '', 0 + commentsbody, temp_comments, len_comments = None, '', 0 # extract content postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables) # compare if necessary if no_fallback is False: # and sure_thing is False: - postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text, sure_thing) + postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text) # try with justext if len_text < MIN_EXTRACTED_SIZE: LOGGER.error('not enough text %s %s', record_id, url) @@ -603,26 +603,22 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False, if language_filter(temp_text, temp_comments, target_language, record_id, url) is True: return None - # cache elements - put_in_cache(postbody) - put_in_cache(commentsbody) - # del tree_cache[cleaned_tree] - - # XML (TEI) steps - output = build_outputtree(record_id, postbody, commentsbody, docmeta, include_comments, tei_output, tei_validation) - # check duplicates at body level if duplicate_test(postbody) is True: return None - if xml_output is False and tei_output is False: - if csv_output is False: - returnstring = xmltotxt(output) - else: - posttext = xmltotxt(postbody) - commentstext = xmltotxt(commentsbody) - returnstring = txttocsv(posttext, commentstext, docmeta) - else: + # cache elements + put_in_cache(postbody) + if commentsbody is not None: + put_in_cache(commentsbody) + + # XML (TEI) steps + if xml_output is True or tei_output is True: + if xml_output is True: + output = build_xml_output(postbody, commentsbody) + output = add_xml_meta(output, docmeta) + elif tei_output is True: + output = build_tei_output(postbody, commentsbody, docmeta) # can be improved control_string = etree.tostring(output, encoding='unicode') control_string = sanitize(control_string) @@ -634,6 +630,15 @@ def extract(filecontent, url=None, record_id='0001', no_fallback=False, result = validate_tei(output_tree) LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url) returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip() + # CSV + TXT output + else: + if csv_output is True: + posttext = xmltotxt(postbody) + commentstext = xmltotxt(commentsbody) + returnstring = txttocsv(posttext, commentstext, docmeta) + else: + output = build_xml_output(postbody, commentsbody) + returnstring = xmltotxt(output) return returnstring diff --git a/trafilatura/metadata.py b/trafilatura/metadata.py index aab4f771..520427a5 100644 --- a/trafilatura/metadata.py +++ b/trafilatura/metadata.py @@ -309,7 +309,7 @@ def extract_metadata(filecontent, default_url=None): if mymeta.sitename: if mymeta.sitename.startswith('@'): # scrap Twitter ID - newsitename = re.sub(r'^@', '', mymeta.sitename) + newsitename = re.sub(r'^@', '', mymeta.sitename) mymeta = mymeta._replace(sitename=newsitename) # capitalize if not '.' in mymeta.sitename and not mymeta.sitename[0].isupper(): diff --git a/trafilatura/xml.py b/trafilatura/xml.py index 3c4ceafd..1597a2db 100644 --- a/trafilatura/xml.py +++ b/trafilatura/xml.py @@ -29,46 +29,49 @@ # https://chase-seibert.github.io/blog/2011/05/20/stripping-control-characters-in-python.html -def build_outputtree(record_id, postbody, commentsbody, docmeta, include_comments, tei_output, tei_validation): - '''Build XML output tree based on options and extracted information''' - # clear comments if necessary - if include_comments is False: - commentsbody = None - # TEI-XML - if tei_output is True: - # build TEI tree - output = write_teitree(postbody, commentsbody, docmeta) - # filter output (strip unwanted elements), just in case - # check and repair - output = check_tei(output, docmeta.url) - # XML - else: - output = etree.Element('doc') - postbody.tag = 'main' - output.append(postbody) - if commentsbody is not None: - commentsbody.tag = 'comments' - output.append(commentsbody) - # metadata - if docmeta: - if docmeta.sitename is not None: - output.set('sitename', docmeta.sitename) - if docmeta.title is not None: - output.set('title', docmeta.title) - if docmeta.author is not None: - output.set('author', docmeta.author) - if docmeta.date is not None: - output.set('date', docmeta.date) - if docmeta.url is not None: - output.set('source', docmeta.url) - if docmeta.description is not None: - output.set('excerpt', docmeta.description) - if docmeta.categories is not None and len(docmeta.categories) > 0: - cats = ';'.join(docmeta.categories) - output.set('categories', cats) - if docmeta.tags is not None and len(docmeta.tags) > 0: - tags = ';'.join(docmeta.tags) - output.set('tags', tags) +def build_xml_output(postbody, commentsbody): + '''Build XML output tree based on extracted information''' + output = etree.Element('doc') + postbody.tag = 'main' + output.append(postbody) + if commentsbody is not None: + commentsbody.tag = 'comments' + output.append(commentsbody) + return output + + +def add_xml_meta(output, docmeta): + '''Add extracted metadata to the XML output tree''' + # metadata + if docmeta: + if docmeta.sitename is not None: + output.set('sitename', docmeta.sitename) + if docmeta.title is not None: + output.set('title', docmeta.title) + if docmeta.author is not None: + output.set('author', docmeta.author) + if docmeta.date is not None: + output.set('date', docmeta.date) + if docmeta.url is not None: + output.set('source', docmeta.url) + if docmeta.description is not None: + output.set('excerpt', docmeta.description) + if docmeta.categories is not None and len(docmeta.categories) > 0: + cats = ';'.join(docmeta.categories) + output.set('categories', cats) + if docmeta.tags is not None and len(docmeta.tags) > 0: + tags = ';'.join(docmeta.tags) + output.set('tags', tags) + return output + + +def build_tei_output(postbody, commentsbody, docmeta): + '''Build TEI-XML output tree based on extracted information''' + # build TEI tree + output = write_teitree(postbody, commentsbody, docmeta) + # filter output (strip unwanted elements), just in case + # check and repair + output = check_tei(output, docmeta.url) return output