Skip to content

Commit

Permalink
moving
Browse files Browse the repository at this point in the history
  • Loading branch information
slava-vishnyakov committed Aug 1, 2024
1 parent 83ccc30 commit 268d51f
Show file tree
Hide file tree
Showing 5 changed files with 55 additions and 22 deletions.
2 changes: 2 additions & 0 deletions Python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@

[![Python Tests](https://github.com/slava-vishnyakov/domscribe/actions/workflows/python-tests.yml/badge.svg)](https://github.com/slava-vishnyakov/domscribe/actions/workflows/python-tests.yml)

> **Warning**: This is an alpha version of Domscribe. Some tests are still failing, and the API may change in future releases. Use with caution in production environments.

This Python library is a semi-automated port of [dom-to-semantic-markdown](https://github.com/romansky/dom-to-semantic-markdown). It converts HTML to semantic Markdown, preserving the structure and meaning of the original content.

## Why Domscribe?
Expand Down
30 changes: 24 additions & 6 deletions Python/domscribe/html_to_markdown_ast.py
Original file line number Diff line number Diff line change
Expand Up @@ -117,13 +117,31 @@ def debug_log(message: str):
'content': html_to_markdown_ast(child, options, indent_level)
})
else:
unhandled_element_processing = options.get('process_unhandled_element') if options else None
if unhandled_element_processing:
debug_log(f"Processing Unhandled Element: '{child.name}'")
result.extend(unhandled_element_processing(child, options, indent_level))
keep_html = options.get('keep_html', []) if options else []
if child.name in keep_html:
debug_log(f"Preserving HTML Element: '{child.name}'")
attrs = []
for k, v in child.attrs.items():
if k == 'class':
class_value = ' '.join(v) if isinstance(v, list) else v
attrs.append(f'class="{class_value}"')
elif v:
attrs.append(f'{k}="{v}"')
attrs = ' '.join(attrs)
result.append({
'type': 'preservedHtml',
'tag': child.name,
'attrs': attrs,
'content': html_to_markdown_ast(child, options, indent_level + 1)
})
else:
debug_log(f"Generic HTMLElement: '{child.name}'")
result.extend(html_to_markdown_ast(child, options, indent_level + 1))
unhandled_element_processing = options.get('process_unhandled_element') if options else None
if unhandled_element_processing:
debug_log(f"Processing Unhandled Element: '{child.name}'")
result.extend(unhandled_element_processing(child, options, indent_level))
else:
debug_log(f"Generic HTMLElement: '{child.name}'")
result.extend(html_to_markdown_ast(child, options, indent_level + 1))
elif child.string and child.string.strip():
result.append({'type': 'text', 'content': child.string.strip()})

Expand Down
21 changes: 18 additions & 3 deletions Python/domscribe/markdown_ast_to_string.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ def debug_log(message: str):
markdown_string += override_result
continue

if node['type'] in ['text', 'bold', 'italic', 'strikethrough', 'link']:
if node['type'] in ['text', 'bold', 'italic', 'strikethrough', 'link', 'reflink']:
debug_log(f"Processing inline element: {node['type']}")
is_last_whitespace = markdown_string[-1].isspace() if markdown_string else False
is_starts_with_whitespace = node['content'][0].isspace() if node['content'] and isinstance(node['content'], str) else False
Expand All @@ -39,10 +39,13 @@ def debug_log(message: str):
markdown_string += f"~~{node['content']}~~"
elif node['type'] == 'link':
if len(node['content']) == 1 and node['content'][0]['type'] == 'text':
markdown_string += f" [{node['content'][0]['content']}]({node['href']})"
markdown_string += f"[{node['content'][0]['content']}]({node['href']})"
else:
link_content = markdown_ast_to_string(node['content'], options, indent_level + 1)
markdown_string += f" <a href=\"{node['href']}\">{link_content}</a>"
markdown_string += f"<a href=\"{node['href']}\">{link_content}</a>"
elif node['type'] == 'reflink':
link_content = markdown_ast_to_string(node['content'], options, indent_level + 1).strip()
markdown_string += f"[{link_content}]{node['href']}\n"

elif node['type'] == 'heading':
debug_log(f"Processing heading level {node['level']}")
Expand Down Expand Up @@ -128,5 +131,17 @@ def debug_log(message: str):
custom_node_rendering = options.get('render_custom_node')
if custom_node_rendering:
markdown_string += custom_node_rendering(node, options, indent_level)

elif node['type'] == 'preservedHtml':
debug_log(f"Processing preserved HTML: {node['tag']}")
content = markdown_ast_to_string(node['content'], options, indent_level)
# Ensure proper spacing before the opening tag
if markdown_string and not markdown_string.endswith(' '):
markdown_string += ' '
attrs = node['attrs'] if node['attrs'] else ''
markdown_string += f"<{node['tag']}{' ' + attrs if attrs else ''}>{content.strip()}</{node['tag']}>"
# Ensure proper spacing after the closing tag
if not content.endswith(' '):
markdown_string += ' '

return markdown_string
18 changes: 8 additions & 10 deletions Python/domscribe/url_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ def process_element(element: Union[SemanticMarkdownAST, List[SemanticMarkdownAST
else:
element['content'] = [{
'type': 'text',
'content': element.get('content', '')[0].get('content', '').strip()
'content': ''.join(item.get('content', '') for item in element.get('content', [])).strip()
}]
element['href'] = f'[{ref_number}]' # Use reference number as href
element['type'] = 'reflink' # Change the type to 'reflink'
Expand All @@ -70,15 +70,13 @@ def process_element(element: Union[SemanticMarkdownAST, List[SemanticMarkdownAST
# Add reference links at the end
if isinstance(processed_ast, list):
processed_ast.append({'type': 'newline'})
processed_ast.append({'type': 'newline'})
reference_links = []
for url, ref_number in url_map.items():
processed_ast.append({
'type': 'text',
'content': f"[{ref_number}]: {url}"
})
processed_ast.append({'type': 'newline'})
# Remove the last newline to match the expected output
if processed_ast and processed_ast[-1]['type'] == 'newline':
processed_ast.pop()
reference_links.append(f"[{ref_number}]: {url}")
processed_ast.append({
'type': 'text',
'content': '\n'.join(reference_links)
})
processed_ast.append({'type': 'newline'})

return processed_ast
6 changes: 3 additions & 3 deletions Python/tests/test_converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,9 +186,9 @@ def test_refify_urls_with_repeated_links():
'''
options = {'refify_urls': True}
expected = '''
Link 1[1] and Link 2[2]
[Link 1][1] and [Link 2][2]
Here's Link 1[1] again.
Here's [Link 1][1] again.
[1]: https://example.com
[2]: https://example.org
Expand All @@ -201,7 +201,7 @@ def test_refify_urls_with_different_text_same_url():
'''
options = {'refify_urls': True}
expected = '''
First link[1] and Second link[1]
[First link][1] and [Second link][1]
[1]: https://example.com
'''.strip() + '\n'
Expand Down

0 comments on commit 268d51f

Please sign in to comment.