From 1b3136ad04179736755f1fb71b5023ca97083a32 Mon Sep 17 00:00:00 2001 From: SimonIT Date: Mon, 31 Aug 2020 13:15:10 +0200 Subject: [PATCH 01/41] Fix parsing corrupt html --- markdownify/__init__.py | 8 +------- tests/test_conversions.py | 4 ++++ 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 33d5b8f..2d5daf1 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -6,8 +6,6 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\r\n\s\t ]+') -FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__' -wrapped = '
%%s
' % FRAGMENT_ID # Heading styles @@ -62,12 +60,8 @@ class MarkdownConverter(object): ' convert, but not both.') def convert(self, html): - # We want to take advantage of the html5 parsing, but we don't actually - # want a full document. Therefore, we'll mark our fragment with an id, - # create the document, and extract the element with the id. - html = wrapped % html soup = BeautifulSoup(html, 'html.parser') - return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True) + return self.process_tag(soup, children_only=True) def process_tag(self, node, children_only=False): text = '' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 9e7be24..12ce36b 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -157,3 +157,7 @@ def test_bullets(): def test_img(): assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")' assert md('Alt text') == '![Alt text](/path/to/img.jpg)' + + +def test_div(): + assert md('Hello World') == 'Hello World' From 25d68b4265071402f3addf0f35db040ae87c5864 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Tue, 1 Sep 2020 18:09:24 +0200 Subject: [PATCH 02/41] Bump version 0.5.3 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 06ab404..a3cf276 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.5.2', + '__version__': '0.5.3', } From d558617cd769d5f01693adb929ea044e91d55b02 Mon Sep 17 00:00:00 2001 From: Igor Dvorkin Date: Sun, 15 Nov 2020 09:04:22 -0800 Subject: [PATCH 03/41] Add support for headings that include nested block elements --- markdownify/__init__.py | 64 +++++++++++++++++++++++++++------------ tests/test_conversions.py | 8 +++++ 2 files changed, 52 insertions(+), 20 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 2d5daf1..0a376a7 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -61,22 +61,28 @@ class MarkdownConverter(object): def convert(self, html): soup = BeautifulSoup(html, 'html.parser') - return self.process_tag(soup, children_only=True) + return self.process_tag(soup, convert_as_inline=False, children_only=True) - def process_tag(self, node, children_only=False): + def process_tag(self, node, convert_as_inline, children_only=False): text = '' + # markdown headings can't include block elements (elements w/newlines) + isHeading = node.name.startswith('h') + convert_children_as_inline = convert_as_inline + + if not children_only and isHeading: + convert_children_as_inline = True # Convert the children first for el in node.children: if isinstance(el, NavigableString): text += self.process_text(six.text_type(el)) else: - text += self.process_tag(el) + text += self.process_tag(el, convert_children_as_inline) if not children_only: convert_fn = getattr(self, 'convert_%s' % node.name, None) if convert_fn and self.should_convert_tag(node.name): - text = convert_fn(node, text) + text = convert_fn(node, text, convert_as_inline) return text @@ -89,8 +95,8 @@ class MarkdownConverter(object): if m: n = int(m.group(1)) - def convert_tag(el, text): - return self.convert_hn(n, el, text) + def convert_tag(el, text, convert_as_inline): + return self.convert_hn(n, el, text, convert_as_inline) convert_tag.__name__ = 'convert_h%s' % n setattr(self, convert_tag.__name__, convert_tag) @@ -116,10 +122,12 @@ class MarkdownConverter(object): text = (text or '').rstrip() return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' - def convert_a(self, el, text): + def convert_a(self, el, text, convert_as_inline): prefix, suffix, text = chomp(text) if not text: return '' + if convert_as_inline: + return text href = el.get('href') title = el.get('title') if self.options['autolinks'] and text == href and not title: @@ -128,22 +136,32 @@ class MarkdownConverter(object): title_part = ' "%s"' % title.replace('"', r'\"') if title else '' return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text - def convert_b(self, el, text): - return self.convert_strong(el, text) + def convert_b(self, el, text, convert_as_inline): + return self.convert_strong(el, text, convert_as_inline) + + def convert_blockquote(self, el, text, convert_as_inline): + + if convert_as_inline: + return text - def convert_blockquote(self, el, text): return '\n' + line_beginning_re.sub('> ', text) if text else '' - def convert_br(self, el, text): + def convert_br(self, el, text, convert_as_inline): + if convert_as_inline: + return "" + return ' \n' - def convert_em(self, el, text): + def convert_em(self, el, text, convert_as_inline): prefix, suffix, text = chomp(text) if not text: return '' return '%s*%s*%s' % (prefix, text, suffix) - def convert_hn(self, n, el, text): + def convert_hn(self, n, el, text, convert_as_inline): + if convert_as_inline: + return text + style = self.options['heading_style'] text = text.rstrip() if style == UNDERLINED and n <= 2: @@ -154,10 +172,14 @@ class MarkdownConverter(object): return '%s %s %s\n\n' % (hashes, text, hashes) return '%s %s\n\n' % (hashes, text) - def convert_i(self, el, text): - return self.convert_em(el, text) + def convert_i(self, el, text, convert_as_inline): + return self.convert_em(el, text, convert_as_inline) + + def convert_list(self, el, text, convert_as_inline): + + # Converting a list to inline is undefined. + # Ignoring convert_to_inline for list. - def convert_list(self, el, text): nested = False while el: if el.name == 'li': @@ -172,7 +194,7 @@ class MarkdownConverter(object): convert_ul = convert_list convert_ol = convert_list - def convert_li(self, el, text): + def convert_li(self, el, text, convert_as_inline): parent = el.parent if parent is not None and parent.name == 'ol': if parent.get("start"): @@ -190,16 +212,18 @@ class MarkdownConverter(object): bullet = bullets[depth % len(bullets)] return '%s %s\n' % (bullet, text or '') - def convert_p(self, el, text): + def convert_p(self, el, text, convert_as_inline): + if convert_as_inline: + return text return '%s\n\n' % text if text else '' - def convert_strong(self, el, text): + def convert_strong(self, el, text, convert_as_inline): prefix, suffix, text = chomp(text) if not text: return '' return '%s**%s**%s' % (prefix, text, suffix) - def convert_img(self, el, text): + def convert_img(self, el, text, convert_as_inline): alt = el.attrs.get('alt', None) or '' src = el.attrs.get('src', None) or '' title = el.attrs.get('title', None) or '' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 12ce36b..65dbfd2 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -107,6 +107,14 @@ def test_hn(): assert md('
Hello
') == '###### Hello\n\n' +def test_hn_nested_tag(): + assert md('

A Bold C

') == '### A **Bold** C\n\n' + assert md('

A

P

C

') == '### A P C\n\n' + assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' + assert md('

A

P

C

', heading_style=ATX) == '# A P C\n\n' + assert md('

A
BQ
C

') == '### A BQ C\n\n' + + def test_atx_headings(): assert md('

Hello

', heading_style=ATX) == '# Hello\n\n' assert md('

Hello

', heading_style=ATX) == '## Hello\n\n' From 7780f82c302483a5537175f435d271d66cfc4d84 Mon Sep 17 00:00:00 2001 From: Igor Dvorkin Date: Fri, 11 Dec 2020 16:54:14 -0800 Subject: [PATCH 04/41] Using a regexp to determine if a tag is a heading. --- markdownify/__init__.py | 3 ++- tests/test_conversions.py | 8 ++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 0a376a7..cb12d43 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -6,6 +6,7 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\r\n\s\t ]+') +html_heading_re = re.compile(r'h[1-6]') # Heading styles @@ -66,7 +67,7 @@ class MarkdownConverter(object): def process_tag(self, node, convert_as_inline, children_only=False): text = '' # markdown headings can't include block elements (elements w/newlines) - isHeading = node.name.startswith('h') + isHeading = html_heading_re.match(node.name) is not None convert_children_as_inline = convert_as_inline if not children_only and isHeading: diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 65dbfd2..ab1ce05 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -115,6 +115,14 @@ def test_hn_nested_tag(): assert md('

A
BQ
C

') == '### A BQ C\n\n' +def test_hr(): + assert md('
hr') == 'hr' + + +def test_head(): + assert md('head') == 'head' + + def test_atx_headings(): assert md('

Hello

', heading_style=ATX) == '# Hello\n\n' assert md('

Hello

', heading_style=ATX) == '## Hello\n\n' From 05ea8dc58ab35e1d3e8bae5b84df77bd4e3dc14d Mon Sep 17 00:00:00 2001 From: Igor Dvorkin Date: Sun, 13 Dec 2020 17:39:08 +0000 Subject: [PATCH 05/41] Add many tests and support image tag --- markdownify/__init__.py | 3 +++ tests/test_conversions.py | 38 ++++++++++++++++++++++++++++++++++---- 2 files changed, 37 insertions(+), 4 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index cb12d43..2cd8fc8 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -229,6 +229,9 @@ class MarkdownConverter(object): src = el.attrs.get('src', None) or '' title = el.attrs.get('title', None) or '' title_part = ' "%s"' % title.replace('"', r'\"') if title else '' + if convert_as_inline: + return alt + return '![%s](%s%s)' % (alt, src, title_part) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index ab1ce05..f5fc1c2 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -107,12 +107,42 @@ def test_hn(): assert md('
Hello
') == '###### Hello\n\n' -def test_hn_nested_tag(): - assert md('

A Bold C

') == '### A **Bold** C\n\n' - assert md('

A

P

C

') == '### A P C\n\n' +def test_hn_nested_tag_heading_style(): assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' assert md('

A

P

C

', heading_style=ATX) == '# A P C\n\n' - assert md('

A
BQ
C

') == '### A BQ C\n\n' + + +def test_hn_nested_simple_tag(): + tag_to_markdown = [ + ("strong", "**strong**"), + ("b", "**b**"), + ("em", "*em*"), + ("i", "*i*"), + ("p", "p"), + ("a", "a"), + ("div", "div"), + ("blockquote", "blockquote"), + ] + + for tag, markdown in tag_to_markdown: + assert md('

A <' + tag + '>' + tag + ' B

') == '### A ' + markdown + ' B\n\n' + + assert md('

A
B

', heading_style=ATX) == '### A B\n\n' + + # Nested lists not supported + # assert md('

A
  • li1
  • l2

', heading_style=ATX) == '### A li1 li2 B\n\n' + + +def test_hn_nested_img(): + assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")' + assert md('Alt text') == '![Alt text](/path/to/img.jpg)' + image_attributes_to_markdown = [ + ("", ""), + ("alt='Alt Text'", "Alt Text"), + ("alt='Alt Text' title='Optional title'", "Alt Text"), + ] + for image_attributes, markdown in image_attributes_to_markdown: + assert md('

A B

') == '### A ' + markdown + ' B\n\n' def test_hr(): From 3544322ed238dcada6f69a86e39380279fcefafc Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 13 Dec 2020 23:41:56 +0100 Subject: [PATCH 06/41] Bump Version 0.6.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index a3cf276..8740fc4 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.5.3', + '__version__': '0.6.0', } From 4f8937810b1484a7bcbaf1d45df79433725e20a2 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Tue, 29 Dec 2020 10:28:50 +0100 Subject: [PATCH 07/41] dont replace newlines and tabs with spaces this should fix #17, as all leading new lines were replaced with a single space, which in turn was rendered before the # of a headline --- markdownify/__init__.py | 2 +- tests/test_basic.py | 2 +- tests/test_conversions.py | 4 ++++ 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 2cd8fc8..8ca1904 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -5,7 +5,7 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) -whitespace_re = re.compile(r'[\r\n\s\t ]+') +whitespace_re = re.compile(r'[\t ]+') html_heading_re = re.compile(r'h[1-6]') diff --git a/tests/test_basic.py b/tests/test_basic.py index 78775b6..bf25ee0 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -10,4 +10,4 @@ def test_soup(): def test_whitespace(): - assert md(' a b \n\n c ') == ' a b c ' + assert md(' a b \t\t c ') == ' a b c ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index f5fc1c2..657c016 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -106,6 +106,10 @@ def test_hn(): assert md('

Hello

') == '### Hello\n\n' assert md('
Hello
') == '###### Hello\n\n' +def test_hn_chained(): + assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n' + assert md('X

First

', heading_style=ATX) == 'X# First\n\n' + def test_hn_nested_tag_heading_style(): assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' From 453b6040962713ef90f6272c6984a5a19ec9d636 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sat, 2 Jan 2021 17:22:27 +0100 Subject: [PATCH 08/41] Fixing autolinks When checking a links href and text for equality, first un-escape the underscores in the text -- because six escapes them. This should fix #29. --- markdownify/__init__.py | 3 ++- tests/test_conversions.py | 6 +++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 2cd8fc8..c2e2ec0 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -131,7 +131,8 @@ class MarkdownConverter(object): return text href = el.get('href') title = el.get('title') - if self.options['autolinks'] and text == href and not title: + # For the replacement see #29: text nodes underscores are escaped + if self.options['autolinks'] and text.replace(r'\_', '_') == href and not title: # Shortcut syntax return '<%s>' % href title_part = ' "%s"' % title.replace('"', r'\"') if title else '' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index f5fc1c2..2896322 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -34,7 +34,11 @@ def test_chomp(): def test_a(): - assert md('Google') == '[Google](http://google.com)' + assert md('Google') == '[Google](https://google.com)' + assert md('https://google.com', autolinks=False) == '[https://google.com](https://google.com)' + assert md('https://google.com') == '' + assert md('https://community.kde.org/Get_Involved') == '' + assert md('https://community.kde.org/Get_Involved', autolinks=False) == '[https://community.kde.org/Get\\_Involved](https://community.kde.org/Get_Involved)' def test_a_spaces(): From b7e1ab889d986330009fe6092723bc836e2f70a0 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Mon, 4 Jan 2021 10:21:27 +0100 Subject: [PATCH 09/41] bump to v0.6.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 8740fc4..ec7dea2 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.6.0', + '__version__': '0.6.1', } From 77d1e99bd5246193138d7646882f3e72c04ce26f Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Tue, 12 Jan 2021 22:42:06 +0100 Subject: [PATCH 10/41] satisfy linter --- tests/test_conversions.py | 1 + 1 file changed, 1 insertion(+) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 4cf05ff..edaefbc 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -110,6 +110,7 @@ def test_hn(): assert md('

Hello

') == '### Hello\n\n' assert md('
Hello
') == '###### Hello\n\n' + def test_hn_chained(): assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n' assert md('X

First

', heading_style=ATX) == 'X# First\n\n' From 321e9eb5f64ee15474d2853efee5ed3cffe47a64 Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Fri, 5 Feb 2021 19:38:24 +0000 Subject: [PATCH 11/41] Add ignore comment tags --- markdownify/__init__.py | 6 ++++-- tests/test_advanced.py | 8 ++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 6d93e47..5c008d3 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -1,4 +1,4 @@ -from bs4 import BeautifulSoup, NavigableString +from bs4 import BeautifulSoup, NavigableString, Comment import re import six @@ -75,7 +75,9 @@ class MarkdownConverter(object): # Convert the children first for el in node.children: - if isinstance(el, NavigableString): + if isinstance(el, Comment): + continue + elif isinstance(el, NavigableString): text += self.process_text(six.text_type(el)) else: text += self.process_tag(el, convert_children_as_inline) diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 4c480d7..2f8aeb1 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -4,3 +4,11 @@ from markdownify import markdownify as md def test_nested(): text = md('

This is an example link.

') assert text == 'This is an [example link](http://example.com/).\n\n' + +def test_ignore_comments(): + text = md("") + assert text == "" + +def test_ignore_comments_with_other_tags(): + text = md("example link") + assert text == "[example link](http://example.com/)" \ No newline at end of file From 457454c713ce03752ca96309a796e66275edc9dd Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Fri, 5 Feb 2021 19:41:43 +0000 Subject: [PATCH 12/41] Add new line at the end of file --- tests/test_advanced.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 2f8aeb1..7ee61d2 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -5,10 +5,12 @@ def test_nested(): text = md('

This is an example link.

') assert text == 'This is an [example link](http://example.com/).\n\n' + def test_ignore_comments(): text = md("") assert text == "" + def test_ignore_comments_with_other_tags(): text = md("example link") - assert text == "[example link](http://example.com/)" \ No newline at end of file + assert text == "[example link](http://example.com/)" From 73f7644c0d998ad9621099c2d93974033f5fbda8 Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Mon, 8 Feb 2021 16:56:10 +0000 Subject: [PATCH 13/41] Add basic support for HTML tables --- markdownify/__init__.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 5c008d3..5fdcbf3 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -237,6 +237,24 @@ class MarkdownConverter(object): return '![%s](%s%s)' % (alt, src, title_part) + def convert_table(self, el, text, convert_as_inline): + rows = el.find_all('tr') + text_data = [] + for row in rows: + headers = row.find_all('th') + columns = row.find_all('td') + if len(headers) > 0: + headers = [head.text.strip() for head in headers] + headers = [head for head in headers if head] + text_data.append(' | '.join(headers)) + text_data.append(' | '.join(['---'] * len(headers))) + elif len(columns) > 0: + columns = [colm.text.strip() for colm in columns] + text_data.append(' | '.join([colm for colm in columns if colm])) + else: + continue + return '\n'.join(text_data) + def markdownify(html, **options): return MarkdownConverter(**options).convert(html) From db96eeb7852f644e4269aa7d3b4372b58c7e4fb4 Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Mon, 8 Feb 2021 16:56:41 +0000 Subject: [PATCH 14/41] Add tests for basic and thead/tbody tables --- tests/test_conversions.py | 56 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 56 insertions(+) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index edaefbc..f274324 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -22,6 +22,52 @@ nested_uls = re.sub(r'\s+', '', """ """) +table = re.sub(r'\s+', '', """ + + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
+""") + + +table_head_body = re.sub(r'\s+', '', """ + + + + + + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
+""") + + def test_chomp(): assert md(' ') == ' ' assert md(' ') == ' ' @@ -31,6 +77,11 @@ def test_chomp(): assert md(' s ') == ' **s** ' assert md(' s ') == ' **s** ' assert md(' s ') == ' **s** ' + assert md('bold with br
italic') == '**bold with br***italic*' + + +def test_chomp_ext(): + assert md('bold with br
italic') == '**bold with br***italic*' def test_a(): @@ -216,3 +267,8 @@ def test_img(): def test_div(): assert md('Hello World') == 'Hello World' + + +def test_table(): + assert md(table) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' + assert md(table_head_body) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' From 292d64bbf4015e800282149249a5a3c3c1a394ef Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Mon, 8 Feb 2021 19:26:27 +0000 Subject: [PATCH 15/41] Remove unnecessary tests --- tests/test_conversions.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index f274324..8c5e369 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -77,11 +77,6 @@ def test_chomp(): assert md(' s ') == ' **s** ' assert md(' s ') == ' **s** ' assert md(' s ') == ' **s** ' - assert md('bold with br
italic') == '**bold with br***italic*' - - -def test_chomp_ext(): - assert md('bold with br
italic') == '**bold with br***italic*' def test_a(): From a152c5b7068bab9289e1650f1ca6c280011c8a2a Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Mon, 8 Feb 2021 19:32:35 +0000 Subject: [PATCH 16/41] Fix lint --- tests/test_conversions.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 8c5e369..cff19bd 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -26,17 +26,17 @@ table = re.sub(r'\s+', '', """ - + - + - +
FirstnameLastnameLastname Age
JillSmithSmith 50
EveJacksonJackson 94
@@ -55,12 +55,12 @@ table_head_body = re.sub(r'\s+', '', """ Jill - Smith + Smith 50 Eve - Jackson + Jackson 94 From 8c28ade348d766705513e07ecb700718a1eb7f2c Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Mon, 8 Feb 2021 20:50:15 +0000 Subject: [PATCH 17/41] Remove empty header validation to allow empty header --- markdownify/__init__.py | 3 +-- tests/test_conversions.py | 25 +++++++++++++++++++++++++ 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 5fdcbf3..fcdc32a 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -245,12 +245,11 @@ class MarkdownConverter(object): columns = row.find_all('td') if len(headers) > 0: headers = [head.text.strip() for head in headers] - headers = [head for head in headers if head] text_data.append(' | '.join(headers)) text_data.append(' | '.join(['---'] * len(headers))) elif len(columns) > 0: columns = [colm.text.strip() for colm in columns] - text_data.append(' | '.join([colm for colm in columns if colm])) + text_data.append(' | '.join(columns)) else: continue return '\n'.join(text_data) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index cff19bd..2d2e825 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -67,6 +67,30 @@ table_head_body = re.sub(r'\s+', '', """ """) +table_missing_header = re.sub(r'\s+', '', """ + + + + + + + + + + + + + + + + + + + + +
LastnameAge
JillSmith50
EveJackson94
+""") + def test_chomp(): assert md(' ') == ' ' @@ -267,3 +291,4 @@ def test_div(): def test_table(): assert md(table) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' assert md(table_head_body) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' + assert md(table_missing_header) == ' | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' From de6f91af0e10e7ecaa8a758d216f24a15b99a44a Mon Sep 17 00:00:00 2001 From: Bruno Miguens Date: Mon, 8 Feb 2021 20:56:18 +0000 Subject: [PATCH 18/41] Revert header validation and leave possibility to empty column --- markdownify/__init__.py | 1 + tests/test_conversions.py | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index fcdc32a..3bda85e 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -245,6 +245,7 @@ class MarkdownConverter(object): columns = row.find_all('td') if len(headers) > 0: headers = [head.text.strip() for head in headers] + headers = [head for head in headers if head] text_data.append(' | '.join(headers)) text_data.append(' | '.join(['---'] * len(headers))) elif len(columns) > 0: diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 2d2e825..de3307f 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -67,11 +67,11 @@ table_head_body = re.sub(r'\s+', '', """ """) -table_missing_header = re.sub(r'\s+', '', """ +table_missing_text = re.sub(r'\s+', '', """ - + @@ -79,7 +79,7 @@ table_missing_header = re.sub(r'\s+', '', """ - + @@ -291,4 +291,4 @@ def test_div(): def test_table(): assert md(table) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' assert md(table_head_body) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' - assert md(table_missing_header) == ' | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' + assert md(table_missing_text) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | | 50\nEve | Jackson | 94' From f093843f4018c39f82eda6ee327f626071bf3db5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 15 Feb 2021 16:19:19 +0100 Subject: [PATCH 19/41] Allow for a custom strong or emphasis symbol --- markdownify/__init__.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 5c008d3..4542f9a 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -46,6 +46,7 @@ class MarkdownConverter(object): autolinks = True heading_style = UNDERLINED bullets = '*+-' # An iterable of bullet types. + strong_em_symbol = '*' class Options(DefaultOptions): pass @@ -157,10 +158,11 @@ class MarkdownConverter(object): return ' \n' def convert_em(self, el, text, convert_as_inline): + em_tag = self.options['strong_em_symbol'] prefix, suffix, text = chomp(text) if not text: return '' - return '%s*%s*%s' % (prefix, text, suffix) + return '%s%s%s%s%s' % (prefix, em_tag, text, em_tag, suffix) def convert_hn(self, n, el, text, convert_as_inline): if convert_as_inline: @@ -222,10 +224,11 @@ class MarkdownConverter(object): return '%s\n\n' % text if text else '' def convert_strong(self, el, text, convert_as_inline): + strong_tag = 2 * self.options['strong_em_symbol'] prefix, suffix, text = chomp(text) if not text: return '' - return '%s**%s**%s' % (prefix, text, suffix) + return '%s%s%s%s%s' % (prefix, strong_tag, text, strong_tag, suffix) def convert_img(self, el, text, convert_as_inline): alt = el.attrs.get('alt', None) or '' From b3ac4606a6697c97a08c09dc3c54d98af84eaf59 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 15 Feb 2021 16:29:14 +0100 Subject: [PATCH 20/41] Allow for the use of backslash for newlines --- markdownify/__init__.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 4542f9a..59aa694 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -47,6 +47,7 @@ class MarkdownConverter(object): heading_style = UNDERLINED bullets = '*+-' # An iterable of bullet types. strong_em_symbol = '*' + newline = 'spaces' class Options(DefaultOptions): pass @@ -155,7 +156,10 @@ class MarkdownConverter(object): if convert_as_inline: return "" - return ' \n' + if self.options['newline'] == 'backslash': + return '\\\n' + else: + return ' \n' def convert_em(self, el, text, convert_as_inline): em_tag = self.options['strong_em_symbol'] From 29a4e551f772fb83f2c22d166bdce2ab24485de0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 15 Feb 2021 16:37:13 +0100 Subject: [PATCH 21/41] Update README with the two new options --- README.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/README.rst b/README.rst index 4d21411..0988ef1 100644 --- a/README.rst +++ b/README.rst @@ -75,6 +75,12 @@ bullets lists are nested. Otherwise, the bullet will alternate based on nesting level. Defaults to ``'*+-'``. +strong_em_symbol + In markdown, both `*` and `_` are used to encode **strong** or *emphasized* texts. The preferred symbol can be passed through this argument, which defaults to `*`. + +newline + Defines the style of marking linebreaks (`
`) in markdown. The default value `'spaces'` of this option means the regular ' \n' will be used (i.e. two spaces and a newline), while `'backslash'` will convert a linebreak to `''\\\n'` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of converters. + Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. From a79ed44ec38dc71e9739c5e95463877f4d9fb788 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 15 Feb 2021 16:51:20 +0100 Subject: [PATCH 22/41] Fix code ticks in README --- README.rst | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.rst b/README.rst index 0988ef1..eac2a00 100644 --- a/README.rst +++ b/README.rst @@ -76,10 +76,10 @@ bullets level. Defaults to ``'*+-'``. strong_em_symbol - In markdown, both `*` and `_` are used to encode **strong** or *emphasized* texts. The preferred symbol can be passed through this argument, which defaults to `*`. + In markdown, both ``*`` and ``_`` are used to encode **strong** or *emphasized* texts. The preferred symbol can be passed through this argument, that defaults to ``*``. newline - Defines the style of marking linebreaks (`
`) in markdown. The default value `'spaces'` of this option means the regular ' \n' will be used (i.e. two spaces and a newline), while `'backslash'` will convert a linebreak to `''\\\n'` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of converters. + Defines the style of marking linebreaks (``
``) in markdown. The default value ``'spaces'`` of this option means the regular `` \n`` will be used (i.e. two spaces and a newline), while ``'backslash'`` will convert a linebreak to ``\\n`` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. From f320cf87ffa92fb0499eff1bbaf5db5c1fddf564 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 21 Feb 2021 20:53:41 +0100 Subject: [PATCH 23/41] closing #25 and #18 Adds newlines after blockquotes, allowing for paragraphs after a blockquote. Due to merging problems with @lucafrance 's code I had to quickly copy and paste their code. Thanks for the contribution! --- markdownify/__init__.py | 2 +- tests/test_conversions.py | 10 +++++++--- 2 files changed, 8 insertions(+), 4 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 5c008d3..c9bc9a2 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -148,7 +148,7 @@ class MarkdownConverter(object): if convert_as_inline: return text - return '\n' + line_beginning_re.sub('> ', text) if text else '' + return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else '' def convert_br(self, el, text, convert_as_inline): if convert_as_inline: diff --git a/tests/test_conversions.py b/tests/test_conversions.py index edaefbc..a32bf65 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -75,12 +75,16 @@ def test_b_spaces(): def test_blockquote(): - assert md('
Hello
').strip() == '> Hello' + assert md('
Hello
') == '\n> Hello\n\n' + + +def test_blockquote_with_paragraph(): + assert md('
Hello

handsome

') == '\n> Hello\n\nhandsome\n\n' def test_nested_blockquote(): - text = md('
And she was like
Hello
').strip() - assert text == '> And she was like \n> > Hello' + text = md('
And she was like
Hello
') + assert text == '\n> And she was like \n> > Hello\n> \n> \n\n' def test_br(): From ed406d3206d6509b9c8c349fd92ebacaf5768729 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 21 Feb 2021 20:57:57 +0100 Subject: [PATCH 24/41] bump to v0.6.4 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ec7dea2..498879d 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.6.1', + '__version__': '0.6.4', } From 99365de66946f31e9bc85d4073b56de8016c638b Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 21 Feb 2021 23:06:21 +0100 Subject: [PATCH 25/41] upgrading code for python 3.x closes #38 --- .github/workflows/python-app.yml | 2 +- setup.py | 11 +++++++---- tests/test_escaping.py | 2 +- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 41240f8..de87672 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -23,7 +23,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8==2.5.4 pytest + pip install flake8==3.8.4 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | diff --git a/setup.py b/setup.py index 498879d..9488cd2 100644 --- a/setup.py +++ b/setup.py @@ -50,7 +50,7 @@ class LintCommand(Command): yield "%s.py" % filename def run(self): - from flake8.engine import get_style_guide + from flake8.api.legacy import get_style_guide flake8_style = get_style_guide(config_file='setup.cfg') paths = self.distribution_files() report = flake8_style.check_files(paths) @@ -70,13 +70,13 @@ setup( zip_safe=False, include_package_data=True, setup_requires=[ - 'flake8', + 'flake8>=3.8,<4', ], tests_require=[ - 'pytest', + 'pytest>=6.2,<7', ], install_requires=[ - 'beautifulsoup4', 'six' + 'beautifulsoup4>=4.9,<5', 'six>=1.15,<2' ], classifiers=[ 'Environment :: Web Environment', @@ -87,6 +87,9 @@ setup( 'Programming Language :: Python :: 2.5', 'Programming Language :: Python :: 2.6', 'Programming Language :: Python :: 2.7', + 'Programming Language :: Python :: 3.6', + 'Programming Language :: Python :: 3.7', + 'Programming Language :: Python :: 3.8', 'Topic :: Utilities' ], cmdclass={ diff --git a/tests/test_escaping.py b/tests/test_escaping.py index 9b0d4fa..23a828c 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -2,7 +2,7 @@ from markdownify import markdownify as md def test_underscore(): - assert md('_hey_dude_') == '\_hey\_dude\_' + assert md('_hey_dude_') == r'\_hey\_dude\_' def test_xml_entities(): From fd293a9714f41d470f0cd767401ee194225ea13c Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 21 Feb 2021 23:08:49 +0100 Subject: [PATCH 26/41] use python 3.8 instead of 3.6 --- .github/workflows/python-app.yml | 4 ++-- .github/workflows/python-publish.yml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index de87672..000c0b2 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,10 +16,10 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.8 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.8 - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 1a03a7b..9e3a349 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install dependencies run: | python -m pip install --upgrade pip From a59e4b9f48c87bf2633b02bba9d46869c5355613 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 21 Feb 2021 23:09:44 +0100 Subject: [PATCH 27/41] bump to v0.6.5 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 9488cd2..848f8bc 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.6.4', + '__version__': '0.6.5', } From 8da0bdf998d6792016e8ef96fc6452a4be15b6dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 10:28:46 +0200 Subject: [PATCH 28/41] Test strong_em_symbol --- tests/test_conversions.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index edaefbc..fef2203 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -65,6 +65,7 @@ def test_a_no_autolinks(): def test_b(): assert md('Hello') == '**Hello**' + assert md('Hello', strong_em_symbol='_') == '__Hello__' def test_b_spaces(): @@ -89,6 +90,7 @@ def test_br(): def test_em(): assert md('Hello') == '*Hello*' + assert md('Hello', strong_em_symbol='_') == '_Hello_' def test_em_spaces(): @@ -174,6 +176,7 @@ def test_atx_closed_headings(): def test_i(): assert md('Hello') == '*Hello*' + assert md('Hello', strong_em_symbol='_') == '_Hello_' def test_ol(): @@ -187,6 +190,7 @@ def test_p(): def test_strong(): assert md('Hello') == '**Hello**' + assert md('Hello', strong_em_symbol='_') == '__Hello__' def test_ul(): From c04ec855dd5c4ed3697d219d99209da4528fa3f8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 10:44:20 +0200 Subject: [PATCH 29/41] Change option to newline_style and use variables like heading_style does --- README.rst | 4 ++-- markdownify/__init__.py | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/README.rst b/README.rst index eac2a00..6ab4911 100644 --- a/README.rst +++ b/README.rst @@ -78,8 +78,8 @@ bullets strong_em_symbol In markdown, both ``*`` and ``_`` are used to encode **strong** or *emphasized* texts. The preferred symbol can be passed through this argument, that defaults to ``*``. -newline - Defines the style of marking linebreaks (``
``) in markdown. The default value ``'spaces'`` of this option means the regular `` \n`` will be used (i.e. two spaces and a newline), while ``'backslash'`` will convert a linebreak to ``\\n`` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. +newline_style + Defines the style of marking linebreaks (``
``) in markdown. The default value ``SPACES`` of this option means the regular `` \n`` will be used (i.e. two spaces and a newline), while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 59aa694..5aa6f91 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -15,6 +15,9 @@ ATX_CLOSED = 'atx_closed' UNDERLINED = 'underlined' SETEXT = UNDERLINED +# Newline style +SPACES = 'spaces' +BACKSLASH = 'backslash' def escape(text): if not text: @@ -47,7 +50,7 @@ class MarkdownConverter(object): heading_style = UNDERLINED bullets = '*+-' # An iterable of bullet types. strong_em_symbol = '*' - newline = 'spaces' + newline_style = SPACES class Options(DefaultOptions): pass @@ -156,7 +159,7 @@ class MarkdownConverter(object): if convert_as_inline: return "" - if self.options['newline'] == 'backslash': + if self.options['newline_style'] == BACKSLASH: return '\\\n' else: return ' \n' From 16dbc471b989847eeae685a0e26fc924e23cc174 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 10:47:55 +0200 Subject: [PATCH 30/41] Test newline_style --- tests/test_conversions.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index fef2203..5f2ada7 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH import re @@ -86,6 +86,7 @@ def test_nested_blockquote(): def test_br(): assert md('a
b
c') == 'a \nb \nc' + assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' def test_em(): From 7ee87b1d3260250e2654c18056b1c55e18ca009e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 10:50:23 +0200 Subject: [PATCH 31/41] Use .lower() on _style option fetching --- markdownify/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 5aa6f91..a23964c 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -159,7 +159,7 @@ class MarkdownConverter(object): if convert_as_inline: return "" - if self.options['newline_style'] == BACKSLASH: + if self.options['newline_style'].lower() == BACKSLASH: return '\\\n' else: return ' \n' @@ -175,7 +175,7 @@ class MarkdownConverter(object): if convert_as_inline: return text - style = self.options['heading_style'] + style = self.options['heading_style'].lower() text = text.rstrip() if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' From 650f377b645b85c460caba99cde0e91bb76a90e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 11:13:19 +0200 Subject: [PATCH 32/41] Fix linting --- markdownify/__init__.py | 1 + 1 file changed, 1 insertion(+) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index a23964c..08819aa 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -19,6 +19,7 @@ SETEXT = UNDERLINED SPACES = 'spaces' BACKSLASH = 'backslash' + def escape(text): if not text: return '' From 5580b0b51d57d21d99e9cf27cbdaff15e72fbcf5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 11:13:52 +0200 Subject: [PATCH 33/41] Update README.rst --- README.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.rst b/README.rst index 6ab4911..a19fabe 100644 --- a/README.rst +++ b/README.rst @@ -79,7 +79,7 @@ strong_em_symbol In markdown, both ``*`` and ``_`` are used to encode **strong** or *emphasized* texts. The preferred symbol can be passed through this argument, that defaults to ``*``. newline_style - Defines the style of marking linebreaks (``
``) in markdown. The default value ``SPACES`` of this option means the regular `` \n`` will be used (i.e. two spaces and a newline), while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. + Defines the style of marking linebreaks (``
``) in markdown. The default value ``SPACES`` of this option will adopt the usual two spaces and a newline, while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. From e877602a5e988a665b9160158a020f8f825233c1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Mon, 5 Apr 2021 11:28:42 +0200 Subject: [PATCH 34/41] Separate the strong_em_symbol and newline style tests --- tests/test_conversions.py | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 5f2ada7..d07d487 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -65,7 +65,6 @@ def test_a_no_autolinks(): def test_b(): assert md('Hello') == '**Hello**' - assert md('Hello', strong_em_symbol='_') == '__Hello__' def test_b_spaces(): @@ -86,12 +85,10 @@ def test_nested_blockquote(): def test_br(): assert md('a
b
c') == 'a \nb \nc' - assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' def test_em(): assert md('Hello') == '*Hello*' - assert md('Hello', strong_em_symbol='_') == '_Hello_' def test_em_spaces(): @@ -177,7 +174,6 @@ def test_atx_closed_headings(): def test_i(): assert md('Hello') == '*Hello*' - assert md('Hello', strong_em_symbol='_') == '_Hello_' def test_ol(): @@ -191,7 +187,6 @@ def test_p(): def test_strong(): assert md('Hello') == '**Hello**' - assert md('Hello', strong_em_symbol='_') == '__Hello__' def test_ul(): @@ -221,3 +216,14 @@ def test_img(): def test_div(): assert md('Hello World') == 'Hello World' + + +def test_strong_em_symbol(): + assert md('Hello', strong_em_symbol='_') == '__Hello__' + assert md('Hello', strong_em_symbol='_') == '__Hello__' + assert md('Hello', strong_em_symbol='_') == '_Hello_' + assert md('Hello', strong_em_symbol='_') == '_Hello_' + + +def test_newline_style(): + assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' From 29c794e17d8a04ff879ac2c6e520d74b12e0e250 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9=20van=20Delft?= Date: Sun, 18 Apr 2021 18:13:29 +0200 Subject: [PATCH 35/41] Introduce OPTIONs for `strong_em_symbol` --- README.rst | 10 ++++++++-- markdownify/__init__.py | 6 +++++- tests/test_conversions.py | 10 +++++----- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/README.rst b/README.rst index a19fabe..1e245c1 100644 --- a/README.rst +++ b/README.rst @@ -76,10 +76,16 @@ bullets level. Defaults to ``'*+-'``. strong_em_symbol - In markdown, both ``*`` and ``_`` are used to encode **strong** or *emphasized* texts. The preferred symbol can be passed through this argument, that defaults to ``*``. + In markdown, both ``*`` and ``_`` are used to encode **strong** or + *emphasized* texts. Either of these symbols can be chosen by the options + ``ASTERISK`` (default) or ``UNDERSCORE`` respectively. newline_style - Defines the style of marking linebreaks (``
``) in markdown. The default value ``SPACES`` of this option will adopt the usual two spaces and a newline, while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash an a newline). While the latter convention is non-standard, it is commonly preferred and supported by a lot of interpreters. + Defines the style of marking linebreaks (``
``) in markdown. The default + value ``SPACES`` of this option will adopt the usual two spaces and a newline, + while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash an a + newline). While the latter convention is non-standard, it is commonly + preferred and supported by a lot of interpreters. Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 08819aa..6f90d73 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -19,6 +19,10 @@ SETEXT = UNDERLINED SPACES = 'spaces' BACKSLASH = 'backslash' +# Strong and emphasis style +ASTERISK = '*' +UNDERSCORE = '_' + def escape(text): if not text: @@ -50,7 +54,7 @@ class MarkdownConverter(object): autolinks = True heading_style = UNDERLINED bullets = '*+-' # An iterable of bullet types. - strong_em_symbol = '*' + strong_em_symbol = ASTERISK newline_style = SPACES class Options(DefaultOptions): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index d07d487..e974c78 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE import re @@ -219,10 +219,10 @@ def test_div(): def test_strong_em_symbol(): - assert md('Hello', strong_em_symbol='_') == '__Hello__' - assert md('Hello', strong_em_symbol='_') == '__Hello__' - assert md('Hello', strong_em_symbol='_') == '_Hello_' - assert md('Hello', strong_em_symbol='_') == '_Hello_' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' def test_newline_style(): From d4882b86b9c308699fa73dfe79747799f19c5192 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Thu, 22 Apr 2021 12:12:51 +0200 Subject: [PATCH 36/41] bump to v0.6.6 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 848f8bc..db71182 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.6.5', + '__version__': '0.6.6', } From e1dbbfad42f6cb36da9a4d93710f8693ea82c374 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Thu, 22 Apr 2021 12:36:11 +0200 Subject: [PATCH 37/41] guard table lines with pipes, resolves the empty header problem --- markdownify/__init__.py | 7 +++---- tests/test_conversions.py | 8 ++++---- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 3bda85e..8200ca7 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -245,12 +245,11 @@ class MarkdownConverter(object): columns = row.find_all('td') if len(headers) > 0: headers = [head.text.strip() for head in headers] - headers = [head for head in headers if head] - text_data.append(' | '.join(headers)) - text_data.append(' | '.join(['---'] * len(headers))) + text_data.append('| ' + ' | '.join(headers) + ' |') + text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |') elif len(columns) > 0: columns = [colm.text.strip() for colm in columns] - text_data.append(' | '.join(columns)) + text_data.append('| ' + ' | '.join(columns) + ' |') else: continue return '\n'.join(text_data) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index de3307f..bf09506 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -71,7 +71,7 @@ table_missing_text = re.sub(r'\s+', '', """
Firstname Lastname Age
JillSmith 50
- + @@ -289,6 +289,6 @@ def test_div(): def test_table(): - assert md(table) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' - assert md(table_head_body) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | Smith | 50\nEve | Jackson | 94' - assert md(table_missing_text) == 'Firstname | Lastname | Age\n--- | --- | ---\nJill | | 50\nEve | Jackson | 94' + assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' + assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' + assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |' From 651d5f00e88d38d582ddfbecb28621494d653dec Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Thu, 22 Apr 2021 12:43:17 +0200 Subject: [PATCH 38/41] bump to v0.7.0 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index db71182..61d07ec 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.6.6', + '__version__': '0.7.0', } From 5f102d5223bed81f3540a3a7b23997eb10f87a50 Mon Sep 17 00:00:00 2001 From: Jiulong Wang Date: Wed, 28 Apr 2021 15:22:24 -0700 Subject: [PATCH 39/41] Add conversion for hr element --- markdownify/__init__.py | 3 +++ tests/test_conversions.py | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 0b2a620..eeaaf74 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -269,6 +269,9 @@ class MarkdownConverter(object): continue return '\n'.join(text_data) + def convert_hr(self, el, text, convert_as_inline): + return '\n\n---\n\n' + def markdownify(html, **options): return MarkdownConverter(**options).convert(html) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 6dcf9a6..fcb9bf3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -307,3 +307,7 @@ def test_strong_em_symbol(): def test_newline_style(): assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' + + +def test_hr(): + assert md('Hello
World') == 'Hello\n\n---\n\nWorld' From 55fb96e3c056ad6f43217232dd489a0bad893654 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 2 May 2021 10:45:52 +0200 Subject: [PATCH 40/41] fix hr tests --- tests/test_conversions.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index fcb9bf3..9d25940 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -229,7 +229,9 @@ def test_hn_nested_img(): def test_hr(): - assert md('
hr') == 'hr' + assert md('Hello
World') == 'Hello\n\n---\n\nWorld' + assert md('Hello
World') == 'Hello\n\n---\n\nWorld' + assert md('

Hello

\n
\n

World

') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n' def test_head(): @@ -307,7 +309,3 @@ def test_strong_em_symbol(): def test_newline_style(): assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' - - -def test_hr(): - assert md('Hello
World') == 'Hello\n\n---\n\nWorld' From f59f9f9a5482f95595654258497366fcca5792e8 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 2 May 2021 10:50:49 +0200 Subject: [PATCH 41/41] bump to v0.7.1 --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 61d07ec..bdf2b70 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.7.0', + '__version__': '0.7.1', }
Firstname Lastname Age