From 146104b41fb68a6b6710cf78017d34e5aae3c5a5 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi?=
Date: Wed, 20 Nov 2019 10:37:39 +0100
Subject: [PATCH 1/6] Remove newline-only textnodes outside <pre>

---
 markdownify/__init__.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index 25608bf..f97bcbf 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -59,6 +59,11 @@ class MarkdownConverter(object):
     def process_tag(self, node, children_only=False):
         text = ''
 
+        # Clean newline-only textnodes outside <pre>
+        for el in node.children:
+            if node.name != 'pre' and isinstance(el, NavigableString) and six.text_type(el) == '\n':
+                el.extract()
+
         # Convert the children first
         for el in node.children:
             if isinstance(el, NavigableString):

From 7b788bafd4d4d71dce4733dde8c0d88372f148f9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?R=C3=A9mi?= 
Date: Thu, 21 Nov 2019 09:35:34 +0100
Subject: [PATCH 2/6] Add nested OL test (for newlines) and correct lists
 nesting

---
 markdownify/__init__.py   |  9 +++++++--
 tests/test_conversions.py | 29 +++++++++++++++++++++++++----
 2 files changed, 32 insertions(+), 6 deletions(-)

diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index f97bcbf..b7c9545 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -151,14 +151,19 @@ class MarkdownConverter(object):
 
     def convert_list(self, el, text):
         nested = False
+        before_paragraph = False
+        print(el.name, repr(el.next_sibling), repr(text))
+        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
+            print(el.name, repr(el.next_sibling))
+            before_paragraph = True
         while el:
             if el.name == 'li':
                 nested = True
                 break
             el = el.parent
         if nested:
-            text = '\n' + self.indent(text, 1)
-        return '\n' + text + '\n'
+            text = '\n' + self.indent(text, 1).rstrip()
+        return text + ('\n' if before_paragraph else '')
 
     convert_ul = convert_list
     convert_ol = convert_list
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index dfc8d3c..98065bb 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -2,7 +2,7 @@ from markdownify import markdownify as md, ATX, ATX_CLOSED
 import re
 
 
-nested_uls = re.sub('\s+', '', """
+nested_uls = """
     
  • 1
      @@ -19,7 +19,26 @@ nested_uls = re.sub('\s+', '', """
    • 2
    • 3
    • -
    """) +
""" + +nested_ols = """ +
    +
  1. 1 +
      +
    1. a +
        +
      1. I
      2. +
      3. II
      4. +
      5. III
      6. +
      +
    2. +
    3. b
    4. +
    5. c
    6. +
    +
  2. +
  3. 2
  4. +
  5. 3
  6. + """ def test_a(): @@ -92,6 +111,8 @@ def test_i(): def test_ol(): assert md('
    1. a
    2. b
    ') == '1. a\n2. b\n' +def test_nested_ols(): + assert md(nested_ols) == '1. 1 \n\t1. a \n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n' def test_p(): assert md('

    hello

    ') == 'hello\n\n' @@ -113,11 +134,11 @@ def test_nested_uls(): Nested ULs should alternate bullet characters. """ - assert md(nested_uls) == '* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t\t\n\t+ b\n\t+ c\n\t\n* 2\n* 3\n' + assert md(nested_uls) == '* 1 \n\t+ a \n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' def test_bullets(): - assert md(nested_uls, bullets='-') == '- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t\t\n\t- b\n\t- c\n\t\n- 2\n- 3\n' + assert md(nested_uls, bullets='-') == '- 1 \n\t- a \n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' def test_img(): From 6a0e5d8176294758877a557de075778f97f08d38 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi?= Date: Thu, 21 Nov 2019 09:46:22 +0100 Subject: [PATCH 3/6] Correct inline UL test as paragraphs are followed by two newlines --- tests/test_conversions.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 98065bb..d27b008 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -126,8 +126,7 @@ def test_ul(): assert md('
    • a
    • b
    ') == '* a\n* b\n' def test_inline_ul(): - assert md('

    foo

    • a
    • b

    bar

    ') == 'foo \n* a\n* b\n\nbar' - + assert md('

    foo

    • a
    • b

    bar

    ') == 'foo\n\n* a\n* b\n\nbar\n\n' def test_nested_uls(): """ From d23596706d95600bb7bf15b11f7e0108a2d5afbf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi?= Date: Fri, 22 Nov 2019 11:49:22 +0100 Subject: [PATCH 4/6] Remove debug prints --- markdownify/__init__.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index b7c9545..f0fe118 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -152,9 +152,7 @@ class MarkdownConverter(object): def convert_list(self, el, text): nested = False before_paragraph = False - print(el.name, repr(el.next_sibling), repr(text)) if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']: - print(el.name, repr(el.next_sibling)) before_paragraph = True while el: if el.name == 'li': From 2c7e4a0100e053c18417932cc9611f27a0888685 Mon Sep 17 00:00:00 2001 From: SimonIT Date: Wed, 26 Aug 2020 19:47:11 +0200 Subject: [PATCH 5/6] Fix tests --- tests/test_conversions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 3a75907..07aae57 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -140,8 +140,8 @@ def test_i(): def test_ol(): - assert md('
    1. a
    2. b
    ') == '\n1. a\n2. b\n\n' - assert md('
    1. a
    2. b
    ') == '\n3. a\n4. b\n\n' + assert md('
    1. a
    2. b
    ') == '1. a\n2. b\n' + assert md('
    1. a
    2. b
    ') == '3. a\n4. b\n' def test_nested_ols(): @@ -157,7 +157,7 @@ def test_strong(): def test_ul(): - assert md('
    • a
    • b
    ') == '\n* a\n* b\n\n' + assert md('
    • a
    • b
    ') == '* a\n* b\n' def test_inline_ul(): @@ -169,11 +169,11 @@ def test_nested_uls(): Nested ULs should alternate bullet characters. """ - assert md(nested_uls) == '* 1 \n\t+ a \n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n\n' + assert md(nested_uls) == '* 1 \n\t+ a \n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' def test_bullets(): - assert md(nested_uls, bullets='-') == '- 1 \n\t- a \n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n\n' + assert md(nested_uls, bullets='-') == '- 1 \n\t- a \n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' def test_img(): From 73800ced360d907f262b4d548926cfff9c9f42f5 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 2 May 2021 13:44:09 +0200 Subject: [PATCH 6/6] fixed whitespace issues at nested lists --- markdownify/__init__.py | 17 +++++++++++------ tests/test_conversions.py | 12 ++++++------ 2 files changed, 17 insertions(+), 12 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 1322ac0..da04ebf 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -6,6 +6,7 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') +all_whitespace_re = re.compile(r'[\s]+') html_heading_re = re.compile(r'h[1-6]') @@ -83,17 +84,18 @@ class MarkdownConverter(object): if not children_only and isHeading: convert_children_as_inline = True - # Clean newline-only textnodes outside
    -        for el in node.children:
    -            if node.name != 'pre' and isinstance(el, NavigableString) and six.text_type(el) == '\n':
    -                el.extract()
    +        # Remove whitespace-only textnodes in lists
    +        if node.name in ['ol', 'ul', 'li']:
    +            for el in node.children:
    +                if isinstance(el, NavigableString) and six.text_type(el).strip() == '':
    +                    el.extract()
     
             # Convert the children first
             for el in node.children:
                 if isinstance(el, Comment):
                     continue
                 elif isinstance(el, NavigableString):
    -                text += self.process_text(six.text_type(el))
    +                text += self.process_text(el)
                 else:
                     text += self.process_tag(el, convert_children_as_inline)
     
    @@ -104,7 +106,10 @@ class MarkdownConverter(object):
     
             return text
     
    -    def process_text(self, text):
    +    def process_text(self, el):
    +        text = six.text_type(el)
    +        if el.parent.name == 'li':
    +            return escape(all_whitespace_re.sub(' ', text or '')).rstrip()
             return escape(whitespace_re.sub(' ', text or ''))
     
         def __getattr__(self, attr):
    diff --git a/tests/test_conversions.py b/tests/test_conversions.py
    index 68bb81e..caac0fd 100644
    --- a/tests/test_conversions.py
    +++ b/tests/test_conversions.py
    @@ -276,10 +276,6 @@ def test_ol():
         assert md('
    1. a
    2. b
    ') == '3. a\n4. b\n' -def test_nested_ols(): - assert md(nested_ols) == '1. 1 \n\t1. a \n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n' - - def test_p(): assert md('

    hello

    ') == 'hello\n\n' @@ -292,6 +288,10 @@ def test_ul(): assert md('
    • a
    • b
    ') == '* a\n* b\n' +def test_nested_ols(): + assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n' + + def test_inline_ul(): assert md('

    foo

    • a
    • b

    bar

    ') == 'foo\n\n* a\n* b\n\nbar\n\n' @@ -301,11 +301,11 @@ def test_nested_uls(): Nested ULs should alternate bullet characters. """ - assert md(nested_uls) == '* 1 \n\t+ a \n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' + assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n' def test_bullets(): - assert md(nested_uls, bullets='-') == '- 1 \n\t- a \n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' + assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n' def test_img():