Skip to content

Commit

Permalink
review code
Browse files Browse the repository at this point in the history
  • Loading branch information
adbar committed Mar 28, 2024
1 parent 0afe678 commit 55036fb
Show file tree
Hide file tree
Showing 3 changed files with 4 additions and 7 deletions.
3 changes: 1 addition & 2 deletions trafilatura/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -346,8 +346,7 @@ def handle_paragraphs(element, potential_tags, options):
return processed_element
if processed_element.text:
return processed_element
-if LOGGER.isEnabledFor(logging.DEBUG):
-    LOGGER.debug('discarding p-child: %s', tostring(processed_element))
+LOGGER.debug('discarding p-child: %s', tostring(processed_element))
return None


Expand Down
3 changes: 0 additions & 3 deletions trafilatura/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,9 +313,6 @@ def sanitize_tree(tree):
@lru_cache(maxsize=1024)
def trim(string):
'''Remove unnecessary spaces within a text string'''
-if string is None:
-    return None
-
try:
# remove newlines that are not related to punctuation or markup + proper trimming
# return LINES_TRIMMING.sub(r' ', string).strip(' \t\n\r\v')
Expand Down
5 changes: 3 additions & 2 deletions trafilatura/xml.py
Original file line number Diff line number Diff line change
Expand Up @@ -280,8 +280,7 @@ def process_element(element, returnlist, include_formatting):
returnlist.append(textelement)

for child in element:
-if child is not None:
-    process_element(child, returnlist, include_formatting)
+process_element(child, returnlist, include_formatting)

if element.text is None and element.tail is None:
if element.tag == 'graphic':
Expand Down Expand Up @@ -309,6 +308,7 @@ def process_element(element, returnlist, include_formatting):
if element.tail is not None:
returnlist.append(element.tail)


def xmltotxt(xmloutput, include_formatting):
'''Convert to plain text format and optionally preserve formatting as markdown.'''
returnlist = []
Expand All @@ -317,6 +317,7 @@ def xmltotxt(xmloutput, include_formatting):

return unescape(sanitize(''.join(returnlist)))


def xmltocsv(document, include_formatting, *, delim="\t", null="null"):
"Convert the internal XML document representation to a CSV string."
# preprocessing
Expand Down

0 comments on commit 55036fb

Please sign in to comment.