Merge branch 'develop'

bump to version v0.14.1
prevent <hn> to call convert_hn and crash
2024-11-24 23:05:17 +01:00 · 2024-11-24 23:05:02 +01:00 · 2024-11-24 21:20:57 +01:00 · 2024-11-24 21:11:42 +01:00 · 2024-11-24 12:26:53 +01:00 · 2024-11-24 12:26:29 +01:00
9 changed files with 239 additions and 119 deletions
--- a/README.rst
+++ b/README.rst
@@ -128,9 +128,9 @@ escape_underscores
  Defaults to ``True``.

 escape_misc
-  If set to ``False``, do not escape miscellaneous punctuation characters
+  If set to ``True``, escape miscellaneous punctuation characters
  that sometimes have Markdown significance in text.
-  Defaults to ``True``.
+  Defaults to ``False``.

 keep_inline_images_in
  Images are converted to their alt-text when the images are located inside
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -7,7 +7,8 @@ import six
 convert_heading_re = re.compile(r'convert_h(\d+)')
 line_beginning_re = re.compile(r'^', re.MULTILINE)
 whitespace_re = re.compile(r'[\t ]+')
-all_whitespace_re = re.compile(r'[\s]+')
+all_whitespace_re = re.compile(r'[\t \r\n]+')
+newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 html_heading_re = re.compile(r'h[1-6]')


@@ -66,6 +67,23 @@ def _todict(obj):
    return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))


+def should_remove_whitespace_inside(el):
+    """Return whether to remove whitespace immediately inside a block-level element."""
+    if not el or not el.name:
+        return False
+    if html_heading_re.match(el.name) is not None:
+        return True
+    return el.name in ('p', 'blockquote',
+                       'ol', 'ul', 'li',
+                       'table', 'thead', 'tbody', 'tfoot',
+                       'tr', 'td', 'th')
+
+
+def should_remove_whitespace_outside(el):
+    """Return whether to remove whitespace immediately outside a block-level element."""
+    return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
+
+
 class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
@@ -76,7 +94,7 @@ class MarkdownConverter(object):
        default_title = False
        escape_asterisks = True
        escape_underscores = True
-        escape_misc = True
+        escape_misc = False
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
@@ -119,27 +137,23 @@ class MarkdownConverter(object):
        if not children_only and (isHeading or isCell):
            convert_children_as_inline = True

-        # Remove whitespace-only textnodes in purely nested nodes
-        def is_nested_node(el):
-            return el and el.name in ['ol', 'ul', 'li',
-                                      'table', 'thead', 'tbody', 'tfoot',
-                                      'tr', 'td', 'th']
-
-        if is_nested_node(node):
-            for el in node.children:
-                # Only extract (remove) whitespace-only text node if any of the
-                # conditions is true:
-                # - el is the first element in its parent
-                # - el is the last element in its parent
-                # - el is adjacent to an nested node
-                can_extract = (not el.previous_sibling
-                               or not el.next_sibling
-                               or is_nested_node(el.previous_sibling)
-                               or is_nested_node(el.next_sibling))
-                if (isinstance(el, NavigableString)
-                        and six.text_type(el).strip() == ''
-                        and can_extract):
-                    el.extract()
+        # Remove whitespace-only textnodes just before, after or
+        # inside block-level elements.
+        should_remove_inside = should_remove_whitespace_inside(node)
+        for el in node.children:
+            # Only extract (remove) whitespace-only text node if any of the
+            # conditions is true:
+            # - el is the first element in its parent (block-level)
+            # - el is the last element in its parent (block-level)
+            # - el is adjacent to a block-level node
+            can_extract = (should_remove_inside and (not el.previous_sibling
+                                                     or not el.next_sibling)
+                           or should_remove_whitespace_outside(el.previous_sibling)
+                           or should_remove_whitespace_outside(el.next_sibling))
+            if (isinstance(el, NavigableString)
+                    and six.text_type(el).strip() == ''
+                    and can_extract):
+                el.extract()

        # Convert the children first
        for el in node.children:
@@ -148,7 +162,13 @@ class MarkdownConverter(object):
            elif isinstance(el, NavigableString):
                text += self.process_text(el)
            else:
-                text += self.process_tag(el, convert_children_as_inline)
+                text_strip = text.rstrip('\n')
+                newlines_left = len(text) - len(text_strip)
+                next_text = self.process_tag(el, convert_children_as_inline)
+                next_text_strip = next_text.lstrip('\n')
+                newlines_right = len(next_text) - len(next_text_strip)
+                newlines = '\n' * max(newlines_left, newlines_right)
+                text = text_strip + newlines + next_text_strip

        if not children_only:
            convert_fn = getattr(self, 'convert_%s' % node.name, None)
@@ -162,18 +182,26 @@ class MarkdownConverter(object):

        # normalize whitespace if we're not inside a preformatted element
        if not el.find_parent('pre'):
-            text = whitespace_re.sub(' ', text)
+            if self.options['wrap']:
+                text = all_whitespace_re.sub(' ', text)
+            else:
+                text = newline_whitespace_re.sub('\n', text)
+                text = whitespace_re.sub(' ', text)

        # escape special characters if we're not inside a preformatted or code element
        if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
            text = self.escape(text)

-        # remove trailing whitespaces if any of the following condition is true:
-        # - current text node is the last node in li
-        # - current text node is followed by an embedded list
-        if (el.parent.name == 'li'
-                and (not el.next_sibling
-                     or el.next_sibling.name in ['ul', 'ol'])):
+        # remove leading whitespace at the start or just after a
+        # block-level element; remove trailing whitespace at the end
+        # or just before a block-level element.
+        if (should_remove_whitespace_outside(el.previous_sibling)
+                or (should_remove_whitespace_inside(el.parent)
+                    and not el.previous_sibling)):
+            text = text.lstrip()
+        if (should_remove_whitespace_outside(el.next_sibling)
+                or (should_remove_whitespace_inside(el.parent)
+                    and not el.next_sibling)):
            text = text.rstrip()

        return text
@@ -185,7 +213,7 @@ class MarkdownConverter(object):
            n = int(m.group(1))

            def convert_tag(el, text, convert_as_inline):
-                return self.convert_hn(n, el, text, convert_as_inline)
+                return self._convert_hn(n, el, text, convert_as_inline)

            convert_tag.__name__ = 'convert_h%s' % n
            setattr(self, convert_tag.__name__, convert_tag)
@@ -208,20 +236,32 @@ class MarkdownConverter(object):
        if not text:
            return ''
        if self.options['escape_misc']:
-            text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
-            text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
+            text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
+            # A sequence of one or more consecutive '-', preceded and
+            # followed by whitespace or start/end of fragment, might
+            # be confused with an underline of a header, or with a
+            # list marker.
+            text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
+            # A sequence of up to six consecutive '#', preceded and
+            # followed by whitespace or start/end of fragment, might
+            # be confused with an ATX heading.
+            text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
+            # '.' or ')' preceded by up to nine digits might be
+            # confused with a list item.
+            text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
+                          text)
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

-    def indent(self, text, level):
-        return line_beginning_re.sub('\t' * level, text) if text else ''
+    def indent(self, text, columns):
+        return line_beginning_re.sub(' ' * columns, text) if text else ''

    def underline(self, text, pad_char):
        text = (text or '').rstrip()
-        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

    def convert_a(self, el, text, convert_as_inline):
        prefix, suffix, text = chomp(text)
@@ -246,7 +286,7 @@ class MarkdownConverter(object):
    def convert_blockquote(self, el, text, convert_as_inline):

        if convert_as_inline:
-            return text
+            return ' ' + text.strip() + ' '

        return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''

@@ -271,19 +311,24 @@ class MarkdownConverter(object):

    convert_kbd = convert_code

-    def convert_hn(self, n, el, text, convert_as_inline):
+    def _convert_hn(self, n, el, text, convert_as_inline):
+        """ Method name prefixed with _ to prevent <hn> from calling this """
        if convert_as_inline:
            return text

+        # prevent MemoryErrors in case of very large n
+        n = max(1, min(6, n))
+
        style = self.options['heading_style'].lower()
        text = text.strip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
+        text = all_whitespace_re.sub(' ', text)
        hashes = '#' * n
        if style == ATX_CLOSED:
-            return '%s %s %s\n\n' % (hashes, text, hashes)
-        return '%s %s\n\n' % (hashes, text)
+            return '\n%s %s %s\n\n' % (hashes, text, hashes)
+        return '\n%s %s\n\n' % (hashes, text)

    def convert_hr(self, el, text, convert_as_inline):
        return '\n\n---\n\n'
@@ -317,8 +362,8 @@ class MarkdownConverter(object):
            el = el.parent
        if nested:
            # remove trailing newline if nested
-            return '\n' + self.indent(text, 1).rstrip()
-        return text + ('\n' if before_paragraph else '')
+            return '\n' + text.rstrip()
+        return '\n\n' + text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list
@@ -339,17 +384,33 @@ class MarkdownConverter(object):
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
-        return '%s %s\n' % (bullet, (text or '').strip())
+        bullet = bullet + ' '
+        text = (text or '').strip()
+        text = self.indent(text, len(bullet))
+        if text:
+            text = bullet + text[len(bullet):]
+        return '%s\n' % text

    def convert_p(self, el, text, convert_as_inline):
        if convert_as_inline:
-            return text
+            return ' ' + text.strip() + ' '
        if self.options['wrap']:
-            text = fill(text,
-                        width=self.options['wrap_width'],
-                        break_long_words=False,
-                        break_on_hyphens=False)
-        return '%s\n\n' % text if text else ''
+            # Preserve newlines (and preceding whitespace) resulting
+            # from <br> tags.  Newlines in the input have already been
+            # replaced by spaces.
+            lines = text.split('\n')
+            new_lines = []
+            for line in lines:
+                line = line.lstrip()
+                line_no_trailing = line.rstrip()
+                trailing = line[len(line_no_trailing):]
+                line = fill(line,
+                            width=self.options['wrap_width'],
+                            break_long_words=False,
+                            break_on_hyphens=False)
+                new_lines.append(line + trailing)
+            text = '\n'.join(new_lines)
+        return '\n\n%s\n\n' % text if text else ''

    def convert_pre(self, el, text, convert_as_inline):
        if not text:
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "markdownify"
-version = "0.13.1"
+version = "0.14.1"
 authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 description = "Convert HTML to markdown."
 readme = "README.rst"
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -14,7 +14,7 @@ def test_chomp():

 def test_nested():
    text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
-    assert text == 'This is an [example link](http://example.com/).\n\n'
+    assert text == '\n\nThis is an [example link](http://example.com/).\n\n'


 def test_ignore_comments():
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -11,3 +11,4 @@ def test_soup():

 def test_whitespace():
    assert md(' a  b \t\t c ') == ' a b c '
+    assert md(' a  b \n\n c ') == ' a b\nc '
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
+from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE


 def inline_tests(tag, markup):
@@ -66,7 +66,7 @@ def test_blockquote_with_paragraph():

 def test_blockquote_nested():
    text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
-    assert text == '\n> And she was like \n> > Hello\n\n'
+    assert text == '\n> And she was like\n> > Hello\n\n'


 def test_br():
@@ -112,36 +112,41 @@ def test_em():


 def test_header_with_space():
-    assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
-    assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
-    assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
-    assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
-    assert md('<h5>\n\nHello   \n\n</h5>') == '##### Hello\n\n'
+    assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
+    assert md('<h3>Hello\n\n\nWorld</h3>') == '\n### Hello World\n\n'
+    assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
+    assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
+    assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
+    assert md('<h5>\n\nHello   \n\n</h5>') == '\n##### Hello\n\n'


 def test_h1():
-    assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
+    assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'


 def test_h2():
-    assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
+    assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'


 def test_hn():
-    assert md('<h3>Hello</h3>') == '### Hello\n\n'
-    assert md('<h4>Hello</h4>') == '#### Hello\n\n'
-    assert md('<h5>Hello</h5>') == '##### Hello\n\n'
-    assert md('<h6>Hello</h6>') == '###### Hello\n\n'
+    assert md('<h3>Hello</h3>') == '\n### Hello\n\n'
+    assert md('<h4>Hello</h4>') == '\n#### Hello\n\n'
+    assert md('<h5>Hello</h5>') == '\n##### Hello\n\n'
+    assert md('<h6>Hello</h6>') == '\n###### Hello\n\n'
+    assert md('<h10>Hello</h10>') == md('<h6>Hello</h6>')
+    assert md('<hn>Hello</hn>') == md('Hello')


 def test_hn_chained():
-    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
-    assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
+    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n'
+    assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
+    assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
+    assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'


 def test_hn_nested_tag_heading_style():
-    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
-    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
+    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n# A P C #\n\n'
+    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n# A P C\n\n'


 def test_hn_nested_simple_tag():
@@ -157,12 +162,12 @@ def test_hn_nested_simple_tag():
    ]

    for tag, markdown in tag_to_markdown:
-        assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
+        assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n### A ' + markdown + ' B\n\n'

-    assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
+    assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n### A B\n\n'

    # Nested lists not supported
-    # assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
+    # assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'


 def test_hn_nested_img():
@@ -172,18 +177,18 @@ def test_hn_nested_img():
        ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
    ]
    for image_attributes, markdown, title in image_attributes_to_markdown:
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'


 def test_hn_atx_headings():
-    assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
-    assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
+    assert md('<h1>Hello</h1>', heading_style=ATX) == '\n# Hello\n\n'
+    assert md('<h2>Hello</h2>', heading_style=ATX) == '\n## Hello\n\n'


 def test_hn_atx_closed_headings():
-    assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
-    assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
+    assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n# Hello #\n\n'
+    assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n'


 def test_head():
@@ -193,7 +198,7 @@ def test_head():
 def test_hr():
    assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
    assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
-    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
+    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n---\n\nWorld\n\n'


 def test_i():
@@ -210,12 +215,23 @@ def test_kbd():


 def test_p():
-    assert md('<p>hello</p>') == 'hello\n\n'
-    assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
-    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
-    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
-    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
-    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
+    assert md('<p>hello</p>') == '\n\nhello\n\n'
+    assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
+    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345  \n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345  \n67890\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901  \n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901  \n12345\n\n'
+    assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n'
+    assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012  \n67890\n\n'
+    assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'


 def test_pre():
@@ -289,3 +305,13 @@ def test_lang_callback():
    assert md('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n    foo\nbar\n```\n'
    assert md('<pre class="javascript"><code>test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
    assert md('<pre class="javascript"><code class="javascript">test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
+
+
+def test_spaces():
+    assert md('<p> a b </p> <p> c d </p>') == '\n\na b\n\nc d\n\n'
+    assert md('<p> <i>a</i> </p>') == '\n\n*a*\n\n'
+    assert md('test <p> again </p>') == 'test\n\nagain\n\n'
+    assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
+    assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
+    assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
+    assert md('test <pre> foo </pre> bar') == 'test\n```\n foo \n```\nbar'
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,3 +1,5 @@
+import warnings
+from bs4 import MarkupResemblesLocatorWarning
 from markdownify import markdownify as md


@@ -12,7 +14,7 @@ def test_underscore():


 def test_xml_entities():
-    assert md('&amp;') == r'\&'
+    assert md('&amp;', escape_misc=True) == r'\&'


 def test_named_entities():
@@ -25,23 +27,49 @@ def test_hexadecimal_entities():


 def test_single_escaping_entities():
-    assert md('&amp;amp;') == r'\&amp;'
+    assert md('&amp;amp;', escape_misc=True) == r'\&amp;'


-def text_misc():
-    assert md('\\*') == r'\\\*'
-    assert md('<foo>') == r'\<foo\>'
-    assert md('# foo') == r'\# foo'
-    assert md('> foo') == r'\> foo'
-    assert md('~~foo~~') == r'\~\~foo\~\~'
-    assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
-    assert md('---\n') == '\\-\\-\\-\n'
-    assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
-    assert md('`x`') == r'\`x\`'
-    assert md('[text](link)') == r'\[text](link)'
-    assert md('1. x') == r'1\. x'
-    assert md('not a number. x') == r'not a number. x'
-    assert md('1) x') == r'1\) x'
-    assert md('not a number) x') == r'not a number) x'
-    assert md('|not table|') == r'\|not table\|'
-    assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
+def test_misc():
+    # ignore the bs4 warning that "1.2" or "*" looks like a filename
+    warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
+
+    assert md('\\*', escape_misc=True) == r'\\\*'
+    assert md('&lt;foo>', escape_misc=True) == r'\<foo\>'
+    assert md('# foo', escape_misc=True) == r'\# foo'
+    assert md('#5', escape_misc=True) == r'#5'
+    assert md('5#', escape_misc=True) == '5#'
+    assert md('####### foo', escape_misc=True) == r'####### foo'
+    assert md('> foo', escape_misc=True) == r'\> foo'
+    assert md('~~foo~~', escape_misc=True) == r'\~\~foo\~\~'
+    assert md('foo\n===\n', escape_misc=True) == 'foo\n\\=\\=\\=\n'
+    assert md('---\n', escape_misc=True) == '\\---\n'
+    assert md('- test', escape_misc=True) == r'\- test'
+    assert md('x - y', escape_misc=True) == r'x \- y'
+    assert md('test-case', escape_misc=True) == 'test-case'
+    assert md('x-', escape_misc=True) == 'x-'
+    assert md('-y', escape_misc=True) == '-y'
+    assert md('+ x\n+ y\n', escape_misc=True) == '\\+ x\n\\+ y\n'
+    assert md('`x`', escape_misc=True) == r'\`x\`'
+    assert md('[text](link)', escape_misc=True) == r'\[text](link)'
+    assert md('1. x', escape_misc=True) == r'1\. x'
+    # assert md('1<span>.</span> x', escape_misc=True) == r'1\. x'
+    assert md('<span>1.</span> x', escape_misc=True) == r'1\. x'
+    assert md(' 1. x', escape_misc=True) == r' 1\. x'
+    assert md('123456789. x', escape_misc=True) == r'123456789\. x'
+    assert md('1234567890. x', escape_misc=True) == r'1234567890. x'
+    assert md('A1. x', escape_misc=True) == r'A1. x'
+    assert md('1.2', escape_misc=True) == r'1.2'
+    assert md('not a number. x', escape_misc=True) == r'not a number. x'
+    assert md('1) x', escape_misc=True) == r'1\) x'
+    # assert md('1<span>)</span> x', escape_misc=True) == r'1\) x'
+    assert md('<span>1)</span> x', escape_misc=True) == r'1\) x'
+    assert md(' 1) x', escape_misc=True) == r' 1\) x'
+    assert md('123456789) x', escape_misc=True) == r'123456789\) x'
+    assert md('1234567890) x', escape_misc=True) == r'1234567890) x'
+    assert md('(1) x', escape_misc=True) == r'(1) x'
+    assert md('A1) x', escape_misc=True) == r'A1) x'
+    assert md('1)x', escape_misc=True) == r'1)x'
+    assert md('not a number) x', escape_misc=True) == r'not a number) x'
+    assert md('|not table|', escape_misc=True) == r'\|not table\|'
+    assert md(r'\ &lt;foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -41,19 +41,21 @@ nested_ols = """


 def test_ol():
-    assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
-    assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
+    assert md('<ol><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '\n\n3. a\n4. b\n'
+    assert md('foo<ol start="3"><li>a</li><li>b</li></ol>bar') == 'foo\n\n3. a\n4. b\n\nbar'
+    assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="1234"><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ol>') == '\n\n1234. first para\n      \n      second para\n1235. third para\n      \n      fourth para\n'


 def test_nested_ols():
-    assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
+    assert md(nested_ols) == '\n\n1. 1\n   1. a\n      1. I\n      2. II\n      3. III\n   2. b\n   3. c\n2. 2\n3. 3\n'


 def test_ul():
-    assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
+    assert md('<ul><li>a</li><li>b</li></ul>') == '\n\n* a\n* b\n'
    assert md("""<ul>
     <li>
             a
@@ -61,11 +63,13 @@ def test_ul():
     <li> b </li>
     <li>   c
     </li>
- </ul>""") == '* a\n* b\n* c\n'
+ </ul>""") == '\n\n* a\n* b\n* c\n'
+    assert md('<ul><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ul>') == '\n\n* first para\n  \n  second para\n* third para\n  \n  fourth para\n'


 def test_inline_ul():
-    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n* a\n* b\n\nbar\n\n'
+    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n'
+    assert md('foo<ul><li>bar</li></ul>baz') == 'foo\n\n* bar\n\nbaz'


 def test_nested_uls():
@@ -73,12 +77,12 @@ def test_nested_uls():
    Nested ULs should alternate bullet characters.

    """
-    assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
+    assert md(nested_uls) == '\n\n* 1\n  + a\n    - I\n    - II\n    - III\n  + b\n  + c\n* 2\n* 3\n'


 def test_bullets():
-    assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
+    assert md(nested_uls, bullets='-') == '\n\n- 1\n  - a\n    - I\n    - II\n    - III\n  - b\n  - c\n- 2\n- 3\n'


 def test_li_text():
-    assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar  </li><li>foo <b>bar</b>   <i>space</i>.</ul>') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
+    assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar  </li><li>foo <b>bar</b>   <i>space</i>.</ul>') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -242,7 +242,7 @@ def test_table():
    assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
-    assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith  Jackson | 50 |\n| Eve | Jackson  Smith | 94 |\n\n'
+    assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
    assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
Author	SHA1	Message	Date
AlexVonB	8f70e3952f	Merge branch 'develop'	2024-11-24 23:05:17 +01:00
AlexVonB	6258f5c38b	bump to version v0.14.1	2024-11-24 23:05:02 +01:00
AlexVonB	3466061ca9	prevent `<hn>` from calling convert_hn and crashing; fixes #142	2024-11-24 21:20:57 +01:00
AlexVonB	9595618796	prevent very large headline prefixes for example: `<h9999999>` could crash the conversion. fixes #143	2024-11-24 21:11:42 +01:00
AlexVonB	e935ce819e	Merge branch 'develop'	2024-11-24 12:26:53 +01:00
AlexVonB	fe8a821a20	bump to version v0.14.0	2024-11-24 12:26:29 +01:00
AlexVonB	54c7ca9937	renamed functions that return boolean	2024-11-24 12:10:57 +01:00
AlexVonB	19780834af	Merge branch 'alfonsrv-fix-pr-118' into jsm28-list-indentation	2024-11-24 12:07:59 +01:00
AlexVonB	9202027e26	ignore bs4 warnings in tests	2024-11-24 12:00:27 +01:00
AlexVonB	9bf4ff14b9	Merge branch 'jsm28-selective-escaping' into jsm28-list-indentation	2024-11-20 14:16:06 +01:00
alfonsrv	7ff4d835ae	Set escape_misc to False by default to improve backwards compatibility	2024-10-09 18:55:50 +02:00
Joseph Myers	c13bdd5c14	Fix logic for indentation inside list items This fixes problems with the markdownify logic for indentation inside list items. This PR uses a branch building on that for #120, #150 and #151, so those three PRs should be merged first before merging this one. There is limited logic in markdownify for handling indentation in the case of nested lists. There are two major problems with this logic: * As it's in `convert_list`, causing a list to be indented when inside another list, it does not add indentation for any other elements such as paragraphs that may be found inside list items (or `<pre>`, `<blockquote>`, etc.), so such elements are wrongly not indented and terminate the list in the output. * It uses fixed indentation of one tab. Following CommonMark, a tab in Markdown is considered equivalent to four spaces, which is not sufficient indentation in ordered list items with a number of three or more digits. Fix both of these issues by making `convert_li` handle indentation for the contents of `<li>`, based on the length of the list item marker, rather than doing it in `convert_list` at all.	2024-10-03 21:04:40 +00:00
Joseph Myers	340aecbe98	More thorough cleanup of input whitespace This improves the markdownify logic for cleaning up input whitespace that has no semantic significance in HTML. This PR uses a branch based on that for #150 (which in turn is based on that for #120) to avoid conflicts with those fixes. The suggested order of merging is just first to merge #120, then the rest of #150, then the rest of this PR. Whitespace in HTML input isn't generally significant before or after block-level elements, or at the start or end of such an element other than `<pre>`. There is some limited logic in markdownify for removing it, (a) for whitespace-only nodes in conjunction with a limited list of elements (and with questionable logic that only removes whitespace adjacent to such an element when also inside such an element) and (b) only for trailing whitespace, in certain places in relation to lists. Replace both those places with more thorough logic using a common list of block-level elements (which could be expanded more). In general, this reduces the number of unnecessary blank lines in output from markdownify (sometimes lines with just a newline, sometimes lines containing a space as well as that newline). There are open issues about cases where propagating such input whitespace to the output actually results in badly formed Markdown output (wrongly indented output), but #120 (which this builds on) fixes those issues, sometimes leaving unnecessary lines with just a space on them in the output, which are dealt with fully by the present PR. There are a few testcases that are affected because they were relying on such whitespace for good output from bad HTML input that used `<p>` or `<blockquote>` inside header tags. 
To keep reasonable output in those cases of bad input now that input whitespace adjacent to those two tags is ignored, make the `<p>` and `<blockquote>` output explicitly include leading and trailing spaces if `convert_as_inline`; such explicit spaces seem the best that can be done for such bad input. Given those fixes, all the remaining changes needed to the expectations of existing tests seem like improvements (removing useless spaces or newlines from the output).	2024-10-03 20:16:23 +00:00
Joseph Myers	c2ffe46e85	Fix whitespace issues around wrapping This fixes various issues relating to how input whitespace is handled and how wrapping handles whitespace resulting from hard line breaks. This PR uses a branch based on that for #120 to avoid conflicts with the fixes and associated test changes there. My suggestion is thus first to merge #120 (which fixes two open issues), then to merge the remaining changes from this PR. Wrapping paragraphs has the effect of losing all newlines including those from `<br>` tags, contrary to HTML semantics (wrapping should be a matter of pretty-printing the output; input whitespace from the HTML input should be normalized, but `<br>` should remain as a hard line break). To fix this, we need to wrap the portions of a paragraph between hard line breaks separately. For this to work, ensure that when wrapping, all input whitespace is normalized at an early stage, including turning newlines into spaces. (Only ASCII whitespace is handled this way; `\s` is not used as it's not clear Unicode whitespace should get such normalization.) When not wrapping, there is still too much input whitespace preservation. If the input contains a blank line, that ends up as a paragraph break in the output, or breaks the header formatting when appearing in a header tag, though in terms of HTML semantics such a blank line is no different from a space. In the case of an ATX header, even a single newline appearing in the output breaks the Markdown. Thus, when not wrapping, arrange for input whitespace containing at least one `\r` or `\n` to be normalized to a single newline, and in the ATX header case, normalize to a space. Fixes #130 (probably, not sure exactly what the HTML input there is) Fixes #88 (a related case, anyway; the actual input in #88 has already been fixed)	2024-10-03 00:30:50 +00:00
Joseph Myers	a369e07211	More selective escaping of `-#.)` (alternative approach) This is a partial alternative to #122 (open since April) for more selective escaping of some special characters. Here, we fix the test function naming (as noted in that PR) so the tests are actually run (and fix some incorrect test assertions so they pass). We also make escaping of `-#.)` (the most common cases of unnecessary escaping in my use case) more selective, while still being conservatively safe in escaping all cases of those characters that might have Markdown significance (including in the presence of wrapping, unlike in #122). (Being conservatively safe doesn't include the cases where `.` or `)` start a fragment, where the existing code already was not conservatively safe.) There are certainly more cases where the code could also be made more selective while remaining conservatively safe (including in the presence of wrapping), so this is not a complete replacement for #122, but by fixing some of the most common cases in a safe way, and getting the tests actually running, I hope this allows progress to be made where the previous attempt appears to have stalled, while still allowing further incremental progress with appropriately safe logic for other characters where useful.	2024-10-02 21:59:39 +00:00
Joseph Myers	4399ee75db	Merge branch 'develop' into para-newlines-92-98	2024-09-30 18:05:32 +00:00
Joseph Myers	60d86663d7	More carefully separate inline text from block content There are various cases in which inline text fails to be separated by (sufficiently many) newlines from adjacent block content. A paragraph needs a blank line (two newlines) separating it from prior text, as does an underlined header; an ATX header needs a single newline separating it from prior text. A list needs at least one newline separating it from prior text, but in general two newlines (for an ordered list starting other than at 1, which will only be recognized given a blank line before). To avoid accumulation of more newlines than necessary, take care when concatenating the results of converting consecutive tags to remove redundant newlines (keeping the greater of the number ending the prior text and the number starting the subsequent text). This is thus an alternative to #108 that tries to avoid the excess newline accumulation that was a concern there, as well as fixing more cases than just paragraphs, and updating tests. Fixes #92 Fixes #98	2024-04-09 16:54:33 +00:00