diff --git a/markdownify/__init__.py b/markdownify/__init__.py index efb2d15..a37f870 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -7,7 +7,8 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') -all_whitespace_re = re.compile(r'[\s]+') +all_whitespace_re = re.compile(r'[\t \r\n]+') +newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') html_heading_re = re.compile(r'h[1-6]') @@ -168,7 +169,11 @@ class MarkdownConverter(object): # normalize whitespace if we're not inside a preformatted element if not el.find_parent('pre'): - text = whitespace_re.sub(' ', text) + if self.options['wrap']: + text = all_whitespace_re.sub(' ', text) + else: + text = newline_whitespace_re.sub('\n', text) + text = whitespace_re.sub(' ', text) # escape special characters if we're not inside a preformatted or code element if not el.find_parent(['pre', 'code', 'kbd', 'samp']): @@ -286,6 +291,7 @@ class MarkdownConverter(object): if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) + text = all_whitespace_re.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: return '\n%s %s %s\n\n' % (hashes, text, hashes) @@ -351,10 +357,21 @@ class MarkdownConverter(object): if convert_as_inline: return text if self.options['wrap']: - text = fill(text, - width=self.options['wrap_width'], - break_long_words=False, - break_on_hyphens=False) + # Preserve newlines (and preceding whitespace) resulting + # from
tags. Newlines in the input have already been + # replaced by spaces. + lines = text.split('\n') + new_lines = [] + for line in lines: + line = line.lstrip() + line_no_trailing = line.rstrip() + trailing = line[len(line_no_trailing):] + line = fill(line, + width=self.options['wrap_width'], + break_long_words=False, + break_on_hyphens=False) + new_lines.append(line + trailing) + text = '\n'.join(new_lines) return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, convert_as_inline): diff --git a/tests/test_basic.py b/tests/test_basic.py index bf25ee0..66f8b6c 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -11,3 +11,4 @@ def test_soup(): def test_whitespace(): assert md(' a b \t\t c ') == ' a b c ' + assert md(' a b \n\n c ') == ' a b\nc ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index baa294b..9c1edc3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE def inline_tests(tag, markup): @@ -113,6 +113,7 @@ def test_em(): def test_header_with_space(): assert md('

\n\nHello

') == '\n### Hello\n\n' + assert md('

Hello\n\n\nWorld

') == '\n### Hello World\n\n' assert md('

\n\nHello

') == '\n#### Hello\n\n' assert md('
\n\nHello
') == '\n##### Hello\n\n' assert md('
\n\nHello\n\n
') == '\n##### Hello\n\n' @@ -174,7 +175,7 @@ def test_hn_nested_img(): ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: - assert md('

A B

') == '\n### A ' + markdown + ' B\n\n' + assert md('

A B

') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' assert md('

A B

', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' @@ -214,10 +215,20 @@ def test_kbd(): def test_p(): assert md('

hello

') == '\n\nhello\n\n' assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' + assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' + assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n' + assert md('

12345
67890

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n' assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' + assert md('

12345678901
12345

', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' + assert md('

1234 5678 9012
67890

', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' assert md('First

Second

Third

Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' diff --git a/tests/test_tables.py b/tests/test_tables.py index 594e5bf..fc6eee6 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -242,7 +242,7 @@ def test_table(): assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' + assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'