More carefully separate inline text from block content
There are various cases in which inline text fails to be separated by sufficiently many newlines from adjacent block content. A paragraph needs a blank line (two newlines) separating it from prior text, as does an underlined (Setext-style) header; an ATX header needs only a single newline separating it from prior text. A list needs at least one newline separating it from prior text, but in general it needs two: an ordered list that starts at a number other than 1 is only recognized as a list when a blank line precedes it.

To avoid accumulating more newlines than necessary, redundant newlines are removed when concatenating the results of converting consecutive tags: at each join, only the greater of the number of newlines ending the prior text and the number of newlines starting the subsequent text is kept.

This is thus an alternative to #108 that tries to avoid the excess newline accumulation that was a concern there, while also fixing more cases than just paragraphs and updating the tests accordingly.

Fixes #92
Fixes #98
This commit is contained in:
@@ -143,7 +143,13 @@ class MarkdownConverter(object):
|
||||
elif isinstance(el, NavigableString):
|
||||
text += self.process_text(el)
|
||||
else:
|
||||
text += self.process_tag(el, convert_children_as_inline)
|
||||
text_strip = text.rstrip('\n')
|
||||
newlines_left = len(text) - len(text_strip)
|
||||
next_text = self.process_tag(el, convert_children_as_inline)
|
||||
next_text_strip = next_text.lstrip('\n')
|
||||
newlines_right = len(next_text) - len(next_text_strip)
|
||||
newlines = '\n' * max(newlines_left, newlines_right)
|
||||
text = text_strip + newlines + next_text_strip
|
||||
|
||||
if not children_only:
|
||||
convert_fn = getattr(self, 'convert_%s' % node.name, None)
|
||||
@@ -216,7 +222,7 @@ class MarkdownConverter(object):
|
||||
|
||||
def underline(self, text, pad_char):
|
||||
text = (text or '').rstrip()
|
||||
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
|
||||
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
|
||||
|
||||
def convert_a(self, el, text, convert_as_inline):
|
||||
prefix, suffix, text = chomp(text)
|
||||
@@ -277,8 +283,8 @@ class MarkdownConverter(object):
|
||||
return self.underline(text, line)
|
||||
hashes = '#' * n
|
||||
if style == ATX_CLOSED:
|
||||
return '%s %s %s\n\n' % (hashes, text, hashes)
|
||||
return '%s %s\n\n' % (hashes, text)
|
||||
return '\n%s %s %s\n\n' % (hashes, text, hashes)
|
||||
return '\n%s %s\n\n' % (hashes, text)
|
||||
|
||||
def convert_hr(self, el, text, convert_as_inline):
|
||||
return '\n\n---\n\n'
|
||||
@@ -313,7 +319,7 @@ class MarkdownConverter(object):
|
||||
if nested:
|
||||
# remove trailing newline if nested
|
||||
return '\n' + self.indent(text, 1).rstrip()
|
||||
return text + ('\n' if before_paragraph else '')
|
||||
return '\n\n' + text + ('\n' if before_paragraph else '')
|
||||
|
||||
convert_ul = convert_list
|
||||
convert_ol = convert_list
|
||||
@@ -344,7 +350,7 @@ class MarkdownConverter(object):
|
||||
width=self.options['wrap_width'],
|
||||
break_long_words=False,
|
||||
break_on_hyphens=False)
|
||||
return '%s\n\n' % text if text else ''
|
||||
return '\n\n%s\n\n' % text if text else ''
|
||||
|
||||
def convert_pre(self, el, text, convert_as_inline):
|
||||
if not text:
|
||||
|
||||
@@ -14,7 +14,7 @@ def test_chomp():
|
||||
|
||||
def test_nested():
|
||||
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
|
||||
assert text == 'This is an [example link](http://example.com/).\n\n'
|
||||
assert text == '\n\nThis is an [example link](http://example.com/).\n\n'
|
||||
|
||||
|
||||
def test_ignore_comments():
|
||||
|
||||
@@ -112,36 +112,38 @@ def test_em():
|
||||
|
||||
|
||||
def test_header_with_space():
|
||||
assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
|
||||
assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
|
||||
assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello \n\n</h5>') == '##### Hello\n\n'
|
||||
assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
|
||||
assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
|
||||
assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello \n\n</h5>') == '\n##### Hello\n\n'
|
||||
|
||||
|
||||
def test_h1():
|
||||
assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
|
||||
assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'
|
||||
|
||||
|
||||
def test_h2():
|
||||
assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
|
||||
assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'
|
||||
|
||||
|
||||
def test_hn():
|
||||
assert md('<h3>Hello</h3>') == '### Hello\n\n'
|
||||
assert md('<h4>Hello</h4>') == '#### Hello\n\n'
|
||||
assert md('<h5>Hello</h5>') == '##### Hello\n\n'
|
||||
assert md('<h6>Hello</h6>') == '###### Hello\n\n'
|
||||
assert md('<h3>Hello</h3>') == '\n### Hello\n\n'
|
||||
assert md('<h4>Hello</h4>') == '\n#### Hello\n\n'
|
||||
assert md('<h5>Hello</h5>') == '\n##### Hello\n\n'
|
||||
assert md('<h6>Hello</h6>') == '\n###### Hello\n\n'
|
||||
|
||||
|
||||
def test_hn_chained():
|
||||
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
|
||||
assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
|
||||
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n\n## Second\n\n\n### Third\n\n'
|
||||
assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
|
||||
assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
|
||||
assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'
|
||||
|
||||
|
||||
def test_hn_nested_tag_heading_style():
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n# A P C #\n\n'
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n# A P C\n\n'
|
||||
|
||||
|
||||
def test_hn_nested_simple_tag():
|
||||
@@ -157,12 +159,12 @@ def test_hn_nested_simple_tag():
|
||||
]
|
||||
|
||||
for tag, markdown in tag_to_markdown:
|
||||
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
|
||||
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n### A ' + markdown + ' B\n\n'
|
||||
|
||||
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
|
||||
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n### A B\n\n'
|
||||
|
||||
# Nested lists not supported
|
||||
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
|
||||
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'
|
||||
|
||||
|
||||
def test_hn_nested_img():
|
||||
@@ -172,18 +174,18 @@ def test_hn_nested_img():
|
||||
("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
|
||||
]
|
||||
for image_attributes, markdown, title in image_attributes_to_markdown:
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A  B\n\n'
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A ' + markdown + ' B\n\n'
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A  B\n\n'
|
||||
|
||||
|
||||
def test_hn_atx_headings():
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX) == '\n# Hello\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX) == '\n## Hello\n\n'
|
||||
|
||||
|
||||
def test_hn_atx_closed_headings():
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n# Hello #\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n'
|
||||
|
||||
|
||||
def test_head():
|
||||
@@ -193,7 +195,7 @@ def test_head():
|
||||
def test_hr():
|
||||
assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
|
||||
assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
|
||||
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
|
||||
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n\n---\n\n\nWorld\n\n'
|
||||
|
||||
|
||||
def test_i():
|
||||
@@ -210,12 +212,13 @@ def test_kbd():
|
||||
|
||||
|
||||
def test_p():
|
||||
assert md('<p>hello</p>') == 'hello\n\n'
|
||||
assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
|
||||
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
|
||||
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
|
||||
assert md('<p>hello</p>') == '\n\nhello\n\n'
|
||||
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
|
||||
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
|
||||
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
|
||||
assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'
|
||||
|
||||
|
||||
def test_pre():
|
||||
|
||||
@@ -41,16 +41,17 @@ nested_ols = """
|
||||
|
||||
|
||||
def test_ol():
|
||||
assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
|
||||
assert md('<ol><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
|
||||
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '\n\n3. a\n4. b\n'
|
||||
assert md('foo<ol start="3"><li>a</li><li>b</li></ol>bar') == 'foo\n\n3. a\n4. b\n\nbar'
|
||||
|
||||
|
||||
def test_nested_ols():
|
||||
assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
|
||||
assert md(nested_ols) == '\n\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
|
||||
|
||||
|
||||
def test_ul():
|
||||
assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
|
||||
assert md('<ul><li>a</li><li>b</li></ul>') == '\n\n* a\n* b\n'
|
||||
assert md("""<ul>
|
||||
<li>
|
||||
a
|
||||
@@ -58,11 +59,12 @@ def test_ul():
|
||||
<li> b </li>
|
||||
<li> c
|
||||
</li>
|
||||
</ul>""") == '* a\n* b\n* c\n'
|
||||
</ul>""") == '\n\n* a\n* b\n* c\n'
|
||||
|
||||
|
||||
def test_inline_ul():
|
||||
assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n* a\n* b\n\nbar\n\n'
|
||||
assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n'
|
||||
assert md('foo<ul><li>bar</li></ul>baz') == 'foo\n\n* bar\n\nbaz'
|
||||
|
||||
|
||||
def test_nested_uls():
|
||||
@@ -70,12 +72,12 @@ def test_nested_uls():
|
||||
Nested ULs should alternate bullet characters.
|
||||
|
||||
"""
|
||||
assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
|
||||
assert md(nested_uls) == '\n\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
|
||||
|
||||
|
||||
def test_bullets():
|
||||
assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
|
||||
assert md(nested_uls, bullets='-') == '\n\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
|
||||
|
||||
|
||||
def test_li_text():
|
||||
assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar </li><li>foo <b>bar</b> <i>space</i>.</ul>') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
|
||||
assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar </li><li>foo <b>bar</b> <i>space</i>.</ul>') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
|
||||
|
||||
Reference in New Issue
Block a user