Skip to content

Commit

Permalink
feat: improve parsing performance from O(n^2) to O(n)
Browse files Browse the repository at this point in the history
The function process_tag previously concatenated strings
inside a loop. Each + operation creates a new string,
resulting in repeated copying of the already-accumulated data. By
replacing this with a list that we append to, and then joining the
list once at the end, we can take this from quadratic time
to linear time.
  • Loading branch information
AlextheYounga committed Dec 17, 2024
1 parent 6258f5c commit 7dd9a1a
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 7 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -10,3 +10,4 @@
build/
.vscode/settings.json
.tox/
.python-version
26 changes: 19 additions & 7 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,7 +126,7 @@ def convert_soup(self, soup):
return self.process_tag(soup, convert_as_inline=False, children_only=True)

def process_tag(self, node, convert_as_inline, children_only=False):
text = ''
text_parts = []

# markdown headings or cells can't include
# block elements (elements w/newlines)
Expand Down Expand Up @@ -160,22 +160,34 @@ def process_tag(self, node, convert_as_inline, children_only=False):
if isinstance(el, Comment) or isinstance(el, Doctype):
continue
elif isinstance(el, NavigableString):
text += self.process_text(el)
text_parts.append(self.process_text(el))
else:
text_strip = text.rstrip('\n')
newlines_left = len(text) - len(text_strip)
# Handle the case when text_parts is not empty
if text_parts:
text_strip = text_parts[-1].rstrip('\n')
newlines_left = len(text_parts[-1]) - len(text_strip)
else:
text_strip = ''
newlines_left = 0

next_text = self.process_tag(el, convert_children_as_inline)
next_text_strip = next_text.lstrip('\n')
newlines_right = len(next_text) - len(next_text_strip)
newlines = '\n' * max(newlines_left, newlines_right)
text = text_strip + newlines + next_text_strip
# Modify how we add the new text
if text_parts:
text_parts[-1] = text_strip
text_parts.append(newlines + next_text_strip)

if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
if convert_fn and self.should_convert_tag(node.name):
text = convert_fn(node, text, convert_as_inline)
# Join the text parts before passing to convert_fn
text_parts_str = ''.join(text_parts)
text_parts = [convert_fn(node, text_parts_str, convert_as_inline)]

return text
# Return the joined text parts
return ''.join(text_parts)

def process_text(self, el):
text = six.text_type(el) or ''
Expand Down

0 comments on commit 7dd9a1a

Please sign in to comment.