Skip to content

Commit

Permalink
Merge branch 'develop'
Browse files Browse the repository at this point in the history
  • Loading branch information
AlexVonB committed Nov 24, 2024
2 parents b5c724a + fe8a821 commit e935ce8
Show file tree
Hide file tree
Showing 9 changed files with 233 additions and 119 deletions.
4 changes: 2 additions & 2 deletions README.rst
Original file line number Diff line number Diff line change
Expand Up @@ -128,9 +128,9 @@ escape_underscores
Defaults to ``True``.

escape_misc
If set to ``False``, do not escape miscellaneous punctuation characters
If set to ``True``, escape miscellaneous punctuation characters
that sometimes have Markdown significance in text.
Defaults to ``True``.
Defaults to ``False``.

keep_inline_images_in
Images are converted to their alt-text when the images are located inside
Expand Down
153 changes: 105 additions & 48 deletions markdownify/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
whitespace_re = re.compile(r'[\t ]+')
all_whitespace_re = re.compile(r'[\s]+')
all_whitespace_re = re.compile(r'[\t \r\n]+')
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
html_heading_re = re.compile(r'h[1-6]')


Expand Down Expand Up @@ -66,6 +67,23 @@ def _todict(obj):
return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))


def should_remove_whitespace_inside(el):
"""Return to remove whitespace immediately inside a block-level element."""
if not el or not el.name:
return False
if html_heading_re.match(el.name) is not None:
return True
return el.name in ('p', 'blockquote',
'ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th')


def should_remove_whitespace_outside(el):
"""Return to remove whitespace immediately outside a block-level element."""
return should_remove_whitespace_inside(el) or (el and el.name == 'pre')


class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
Expand All @@ -76,7 +94,7 @@ class DefaultOptions:
default_title = False
escape_asterisks = True
escape_underscores = True
escape_misc = True
escape_misc = False
heading_style = UNDERLINED
keep_inline_images_in = []
newline_style = SPACES
Expand Down Expand Up @@ -119,27 +137,23 @@ def process_tag(self, node, convert_as_inline, children_only=False):
if not children_only and (isHeading or isCell):
convert_children_as_inline = True

# Remove whitespace-only textnodes in purely nested nodes
def is_nested_node(el):
return el and el.name in ['ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th']

if is_nested_node(node):
for el in node.children:
# Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent
# - el is the last element in its parent
# - el is adjacent to an nested node
can_extract = (not el.previous_sibling
or not el.next_sibling
or is_nested_node(el.previous_sibling)
or is_nested_node(el.next_sibling))
if (isinstance(el, NavigableString)
and six.text_type(el).strip() == ''
and can_extract):
el.extract()
# Remove whitespace-only textnodes just before, after or
# inside block-level elements.
should_remove_inside = should_remove_whitespace_inside(node)
for el in node.children:
# Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent (block-level)
# - el is the last element in its parent (block-level)
# - el is adjacent to a block-level node
can_extract = (should_remove_inside and (not el.previous_sibling
or not el.next_sibling)
or should_remove_whitespace_outside(el.previous_sibling)
or should_remove_whitespace_outside(el.next_sibling))
if (isinstance(el, NavigableString)
and six.text_type(el).strip() == ''
and can_extract):
el.extract()

# Convert the children first
for el in node.children:
Expand All @@ -148,7 +162,13 @@ def is_nested_node(el):
elif isinstance(el, NavigableString):
text += self.process_text(el)
else:
text += self.process_tag(el, convert_children_as_inline)
text_strip = text.rstrip('\n')
newlines_left = len(text) - len(text_strip)
next_text = self.process_tag(el, convert_children_as_inline)
next_text_strip = next_text.lstrip('\n')
newlines_right = len(next_text) - len(next_text_strip)
newlines = '\n' * max(newlines_left, newlines_right)
text = text_strip + newlines + next_text_strip

if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
Expand All @@ -162,18 +182,26 @@ def process_text(self, el):

# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
text = whitespace_re.sub(' ', text)
if self.options['wrap']:
text = all_whitespace_re.sub(' ', text)
else:
text = newline_whitespace_re.sub('\n', text)
text = whitespace_re.sub(' ', text)

# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
text = self.escape(text)

# remove trailing whitespaces if any of the following condition is true:
# - current text node is the last node in li
# - current text node is followed by an embedded list
if (el.parent.name == 'li'
and (not el.next_sibling
or el.next_sibling.name in ['ul', 'ol'])):
# remove leading whitespace at the start or just after a
# block-level element; remove traliing whitespace at the end
# or just before a block-level element.
if (should_remove_whitespace_outside(el.previous_sibling)
or (should_remove_whitespace_inside(el.parent)
and not el.previous_sibling)):
text = text.lstrip()
if (should_remove_whitespace_outside(el.next_sibling)
or (should_remove_whitespace_inside(el.parent)
and not el.next_sibling)):
text = text.rstrip()

return text
Expand Down Expand Up @@ -208,20 +236,32 @@ def escape(self, text):
if not text:
return ''
if self.options['escape_misc']:
text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
# A sequence of one or more consecutive '-', preceded and
# followed by whitespace or start/end of fragment, might
# be confused with an underline of a header, or with a
# list marker.
text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
# A sequence of up to six consecutive '#', preceded and
# followed by whitespace or start/end of fragment, might
# be confused with an ATX heading.
text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
# '.' or ')' preceded by up to nine digits might be
# confused with a list item.
text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
text)
if self.options['escape_asterisks']:
text = text.replace('*', r'\*')
if self.options['escape_underscores']:
text = text.replace('_', r'\_')
return text

def indent(self, text, level):
return line_beginning_re.sub('\t' * level, text) if text else ''
def indent(self, text, columns):
return line_beginning_re.sub(' ' * columns, text) if text else ''

def underline(self, text, pad_char):
text = (text or '').rstrip()
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

def convert_a(self, el, text, convert_as_inline):
prefix, suffix, text = chomp(text)
Expand All @@ -246,7 +286,7 @@ def convert_a(self, el, text, convert_as_inline):
def convert_blockquote(self, el, text, convert_as_inline):

if convert_as_inline:
return text
return ' ' + text.strip() + ' '

return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''

Expand Down Expand Up @@ -280,10 +320,11 @@ def convert_hn(self, n, el, text, convert_as_inline):
if style == UNDERLINED and n <= 2:
line = '=' if n == 1 else '-'
return self.underline(text, line)
text = all_whitespace_re.sub(' ', text)
hashes = '#' * n
if style == ATX_CLOSED:
return '%s %s %s\n\n' % (hashes, text, hashes)
return '%s %s\n\n' % (hashes, text)
return '\n%s %s %s\n\n' % (hashes, text, hashes)
return '\n%s %s\n\n' % (hashes, text)

def convert_hr(self, el, text, convert_as_inline):
return '\n\n---\n\n'
Expand Down Expand Up @@ -317,8 +358,8 @@ def convert_list(self, el, text, convert_as_inline):
el = el.parent
if nested:
# remove trailing newline if nested
return '\n' + self.indent(text, 1).rstrip()
return text + ('\n' if before_paragraph else '')
return '\n' + text.rstrip()
return '\n\n' + text + ('\n' if before_paragraph else '')

convert_ul = convert_list
convert_ol = convert_list
Expand All @@ -339,17 +380,33 @@ def convert_li(self, el, text, convert_as_inline):
el = el.parent
bullets = self.options['bullets']
bullet = bullets[depth % len(bullets)]
return '%s %s\n' % (bullet, (text or '').strip())
bullet = bullet + ' '
text = (text or '').strip()
text = self.indent(text, len(bullet))
if text:
text = bullet + text[len(bullet):]
return '%s\n' % text

def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return text
return ' ' + text.strip() + ' '
if self.options['wrap']:
text = fill(text,
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
return '%s\n\n' % text if text else ''
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
# replaced by spaces.
lines = text.split('\n')
new_lines = []
for line in lines:
line = line.lstrip()
line_no_trailing = line.rstrip()
trailing = line[len(line_no_trailing):]
line = fill(line,
width=self.options['wrap_width'],
break_long_words=False,
break_on_hyphens=False)
new_lines.append(line + trailing)
text = '\n'.join(new_lines)
return '\n\n%s\n\n' % text if text else ''

def convert_pre(self, el, text, convert_as_inline):
if not text:
Expand Down
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "markdownify"
version = "0.13.1"
version = "0.14.0"
authors = [{name = "Matthew Tretter", email = "[email protected]"}]
description = "Convert HTML to markdown."
readme = "README.rst"
Expand Down
2 changes: 1 addition & 1 deletion tests/test_advanced.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def test_chomp():

def test_nested():
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
assert text == 'This is an [example link](http://example.com/).\n\n'
assert text == '\n\nThis is an [example link](http://example.com/).\n\n'


def test_ignore_comments():
Expand Down
1 change: 1 addition & 0 deletions tests/test_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ def test_soup():

def test_whitespace():
assert md(' a b \t\t c ') == ' a b c '
assert md(' a b \n\n c ') == ' a b\nc '
Loading

0 comments on commit e935ce8

Please sign in to comment.