diff --git a/markdownify/__init__.py b/markdownify/__init__.py index eeaaf74..da04ebf 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -6,6 +6,7 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') +all_whitespace_re = re.compile(r'[\s]+') html_heading_re = re.compile(r'h[1-6]') @@ -83,12 +84,18 @@ class MarkdownConverter(object): if not children_only and isHeading: convert_children_as_inline = True + # Remove whitespace-only textnodes in lists + if node.name in ['ol', 'ul', 'li']: + for el in node.children: + if isinstance(el, NavigableString) and six.text_type(el).strip() == '': + el.extract() + # Convert the children first for el in node.children: if isinstance(el, Comment): continue elif isinstance(el, NavigableString): - text += self.process_text(six.text_type(el)) + text += self.process_text(el) else: text += self.process_tag(el, convert_children_as_inline) @@ -99,7 +106,10 @@ class MarkdownConverter(object): return text - def process_text(self, text): + def process_text(self, el): + text = six.text_type(el) + if el.parent.name == 'li': + return escape(all_whitespace_re.sub(' ', text or '')).rstrip() return escape(whitespace_re.sub(' ', text or '')) def __getattr__(self, attr): @@ -199,6 +209,9 @@ class MarkdownConverter(object): # Ignoring convert_to_inline for list. nested = False + before_paragraph = False + if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']: + before_paragraph = True while el: if el.name == 'li': nested = True @@ -207,7 +220,7 @@ class MarkdownConverter(object): if nested: # remove trailing newline if nested return '\n' + self.indent(text, 1).rstrip() - return '\n' + text + '\n' + return text + ('\n' if before_paragraph else '') convert_ul = convert_list convert_ol = convert_list diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 9d25940..caac0fd 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -2,7 +2,7 @@ from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCOR import re -nested_uls = re.sub(r'\s+', '', """ +nested_uls = """
foo
bar
') == 'foo\n\n\n* a\n* b\n\nbar\n\n' + assert md('foo
bar
') == 'foo\n\n* a\n* b\n\nbar\n\n' def test_nested_uls(): @@ -278,11 +301,11 @@ def test_nested_uls(): Nested ULs should alternate bullet characters. """ - assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n\n' + assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' def test_bullets(): - assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n\n' + assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' def test_img():