feat: improve parsing performance from O(n^2) to O(n)

The function process_tag previously concatenated strings inside a loop. Each + operation creates a new string, so the already-accumulated data is copied again on every iteration. By appending the pieces to a list and joining that list once at the end, we take this from quadratic time to linear time.
matthewwithanm · Dec 17, 2024 · 7dd9a1a · 7dd9a1a
1 parent 6258f5c
commit 7dd9a1a
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 7 deletions.
diff --git a/.gitignore b/.gitignore
@@ -10,3 +10,4 @@
 build/
 .vscode/settings.json
 .tox/
+.python-version
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -126,7 +126,7 @@ def convert_soup(self, soup):
         return self.process_tag(soup, convert_as_inline=False, children_only=True)
 
     def process_tag(self, node, convert_as_inline, children_only=False):
-        text = ''
+        text_parts = []
 
         # markdown headings or cells can't include
         # block elements (elements w/newlines)
@@ -160,22 +160,34 @@ def process_tag(self, node, convert_as_inline, children_only=False):
             if isinstance(el, Comment) or isinstance(el, Doctype):
                 continue
             elif isinstance(el, NavigableString):
-                text += self.process_text(el)
+                text_parts.append(self.process_text(el))
             else:
-                text_strip = text.rstrip('\n')
-                newlines_left = len(text) - len(text_strip)
+                # Handle the case when text_parts is not empty
+                if text_parts:
+                    text_strip = text_parts[-1].rstrip('\n')
+                    newlines_left = len(text_parts[-1]) - len(text_strip)
+                else:
+                    text_strip = ''
+                    newlines_left = 0
+
                 next_text = self.process_tag(el, convert_children_as_inline)
                 next_text_strip = next_text.lstrip('\n')
                 newlines_right = len(next_text) - len(next_text_strip)
                 newlines = '\n' * max(newlines_left, newlines_right)
-                text = text_strip + newlines + next_text_strip
+                # Modify how we add the new text
+                if text_parts:
+                    text_parts[-1] = text_strip
+                text_parts.append(newlines + next_text_strip)
 
         if not children_only:
             convert_fn = getattr(self, 'convert_%s' % node.name, None)
             if convert_fn and self.should_convert_tag(node.name):
-                text = convert_fn(node, text, convert_as_inline)
+                # Join the text parts before passing to convert_fn
+                text_parts_str = ''.join(text_parts)
+                text_parts = [convert_fn(node, text_parts_str, convert_as_inline)]
 
-        return text
+        # Return the joined text parts
+        return ''.join(text_parts)
 
     def process_text(self, el):
         text = six.text_type(el) or ''
-Original file line number
+Diff line change
@@ Expand Up / @@ -10,3 +10,4 @@ @@
     build/
     .vscode/settings.json
     .tox/
+    .python-version