moving

slava-vishnyakov · Aug 1, 2024 · 268d51f · 268d51f
1 parent 83ccc30
commit 268d51f
Show file tree

Hide file tree

Showing 5 changed files with 55 additions and 22 deletions.
diff --git a/Python/README.md b/Python/README.md
@@ -2,6 +2,8 @@
 
 [![Python Tests](https://github.com/slava-vishnyakov/domscribe/actions/workflows/python-tests.yml/badge.svg)](https://github.com/slava-vishnyakov/domscribe/actions/workflows/python-tests.yml)
 
+> **Warning**: This is an alpha version of Domscribe. Some tests are still failing, and the API may change in future releases. Use with caution in production environments.
+
 This Python library is a semi-automated port of [dom-to-semantic-markdown](https://github.com/romansky/dom-to-semantic-markdown). It converts HTML to semantic Markdown, preserving the structure and meaning of the original content.
 
 ## Why Domscribe?

diff --git a/Python/domscribe/html_to_markdown_ast.py b/Python/domscribe/html_to_markdown_ast.py
@@ -117,13 +117,31 @@ def debug_log(message: str):
                         'content': html_to_markdown_ast(child, options, indent_level)
                     })
                 else:
-                    unhandled_element_processing = options.get('process_unhandled_element') if options else None
-                    if unhandled_element_processing:
-                        debug_log(f"Processing Unhandled Element: '{child.name}'")
-                        result.extend(unhandled_element_processing(child, options, indent_level))
+                    keep_html = options.get('keep_html', []) if options else []
+                    if child.name in keep_html:
+                        debug_log(f"Preserving HTML Element: '{child.name}'")
+                        attrs = []
+                        for k, v in child.attrs.items():
+                            if k == 'class':
+                                class_value = ' '.join(v) if isinstance(v, list) else v
+                                attrs.append(f'class="{class_value}"')
+                            elif v:
+                                attrs.append(f'{k}="{v}"')
+                        attrs = ' '.join(attrs)
+                        result.append({
+                            'type': 'preservedHtml',
+                            'tag': child.name,
+                            'attrs': attrs,
+                            'content': html_to_markdown_ast(child, options, indent_level + 1)
+                        })
                     else:
-                        debug_log(f"Generic HTMLElement: '{child.name}'")
-                        result.extend(html_to_markdown_ast(child, options, indent_level + 1))
+                        unhandled_element_processing = options.get('process_unhandled_element') if options else None
+                        if unhandled_element_processing:
+                            debug_log(f"Processing Unhandled Element: '{child.name}'")
+                            result.extend(unhandled_element_processing(child, options, indent_level))
+                        else:
+                            debug_log(f"Generic HTMLElement: '{child.name}'")
+                            result.extend(html_to_markdown_ast(child, options, indent_level + 1))
         elif child.string and child.string.strip():
             result.append({'type': 'text', 'content': child.string.strip()})
 

diff --git a/Python/domscribe/markdown_ast_to_string.py b/Python/domscribe/markdown_ast_to_string.py
@@ -21,7 +21,7 @@ def debug_log(message: str):
                 markdown_string += override_result
                 continue
 
-        if node['type'] in ['text', 'bold', 'italic', 'strikethrough', 'link']:
+        if node['type'] in ['text', 'bold', 'italic', 'strikethrough', 'link', 'reflink']:
             debug_log(f"Processing inline element: {node['type']}")
             is_last_whitespace = markdown_string[-1].isspace() if markdown_string else False
             is_starts_with_whitespace = node['content'][0].isspace() if node['content'] and isinstance(node['content'], str) else False
@@ -39,10 +39,13 @@ def debug_log(message: str):
                 markdown_string += f"~~{node['content']}~~"
             elif node['type'] == 'link':
                 if len(node['content']) == 1 and node['content'][0]['type'] == 'text':
-                    markdown_string += f" [{node['content'][0]['content']}]({node['href']})"
+                    markdown_string += f"[{node['content'][0]['content']}]({node['href']})"
                 else:
                     link_content = markdown_ast_to_string(node['content'], options, indent_level + 1)
-                    markdown_string += f" <a href=\"{node['href']}\">{link_content}</a>"
+                    markdown_string += f"<a href=\"{node['href']}\">{link_content}</a>"
+            elif node['type'] == 'reflink':
+                link_content = markdown_ast_to_string(node['content'], options, indent_level + 1).strip()
+                markdown_string += f"[{link_content}]{node['href']}\n"
 
         elif node['type'] == 'heading':
             debug_log(f"Processing heading level {node['level']}")
@@ -128,5 +131,17 @@ def debug_log(message: str):
             custom_node_rendering = options.get('render_custom_node')
             if custom_node_rendering:
                 markdown_string += custom_node_rendering(node, options, indent_level)
+
+        elif node['type'] == 'preservedHtml':
+            debug_log(f"Processing preserved HTML: {node['tag']}")
+            content = markdown_ast_to_string(node['content'], options, indent_level)
+            # Ensure proper spacing before the opening tag
+            if markdown_string and not markdown_string.endswith(' '):
+                markdown_string += ' '
+            attrs = node['attrs'] if node['attrs'] else ''
+            markdown_string += f"<{node['tag']}{' ' + attrs if attrs else ''}>{content.strip()}</{node['tag']}>"
+            # Ensure proper spacing after the closing tag
+            if not content.endswith(' '):
+                markdown_string += ' '
 
     return markdown_string
diff --git a/Python/domscribe/url_utils.py b/Python/domscribe/url_utils.py
@@ -54,7 +54,7 @@ def process_element(element: Union[SemanticMarkdownAST, List[SemanticMarkdownAST
                 else:
                     element['content'] = [{
                         'type': 'text',
-                        'content': element.get('content', '')[0].get('content', '').strip()
+                        'content': ''.join(item.get('content', '') for item in element.get('content', [])).strip()
                     }]
                 element['href'] = f'[{ref_number}]'  # Use reference number as href
                 element['type'] = 'reflink'  # Change the type to 'reflink'
@@ -70,15 +70,13 @@ def process_element(element: Union[SemanticMarkdownAST, List[SemanticMarkdownAST
     # Add reference links at the end
     if isinstance(processed_ast, list):
         processed_ast.append({'type': 'newline'})
-        processed_ast.append({'type': 'newline'})
+        reference_links = []
         for url, ref_number in url_map.items():
-            processed_ast.append({
-                'type': 'text',
-                'content': f"[{ref_number}]: {url}"
-            })
-            processed_ast.append({'type': 'newline'})
-        # Remove the last newline to match the expected output
-        if processed_ast and processed_ast[-1]['type'] == 'newline':
-            processed_ast.pop()
+            reference_links.append(f"[{ref_number}]: {url}")
+        processed_ast.append({
+            'type': 'text',
+            'content': '\n'.join(reference_links)
+        })
+        processed_ast.append({'type': 'newline'})
 
     return processed_ast
diff --git a/Python/tests/test_converter.py b/Python/tests/test_converter.py
@@ -186,9 +186,9 @@ def test_refify_urls_with_repeated_links():
     '''
     options = {'refify_urls': True}
     expected = '''
-Link 1[1] and Link 2[2]
+[Link 1][1] and [Link 2][2]
 
-Here's Link 1[1] again.
+Here's [Link 1][1] again.
 
 [1]: https://example.com
 [2]: https://example.org
@@ -201,7 +201,7 @@ def test_refify_urls_with_different_text_same_url():
     '''
     options = {'refify_urls': True}
     expected = '''
-First link[1] and Second link[1]
+[First link][1] and [Second link][1]
 
 [1]: https://example.com
 '''.strip() + '\n'