From 71e1471e1829eac7d44f8c2360a064dfb78edc3b Mon Sep 17 00:00:00 2001 From: chrispy Date: Sun, 29 Dec 2024 12:33:46 -0500 Subject: [PATCH 01/22] do not construct Markdown links in code spans and code blocks Signed-off-by: chrispy --- markdownify/__init__.py | 2 ++ tests/test_conversions.py | 5 +++++ 2 files changed, 7 insertions(+) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 3272ce5..0fbb7a5 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -264,6 +264,8 @@ class MarkdownConverter(object): return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' def convert_a(self, el, text, convert_as_inline): + if el.find_parent(['pre', 'code', 'kbd', 'samp']): + return text prefix, suffix, text = chomp(text) if not text: return '' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 2283c29..55702e3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -39,6 +39,11 @@ def test_a_no_autolinks(): assert md('https://google.com', autolinks=False) == '[https://google.com](https://google.com)' +def test_a_in_code(): + assert md('Google') == '`Google`' + assert md('
Google
') == '\n```\nGoogle\n```\n' + + def test_b(): assert md('Hello') == '**Hello**' From 1009087d4112ae3afe6b74fd46ba6638e3c7cdf0 Mon Sep 17 00:00:00 2001 From: chrispy Date: Sun, 29 Dec 2024 13:52:32 -0500 Subject: [PATCH 02/22] insert a blank line between table caption, table content Signed-off-by: chrispy --- markdownify/__init__.py | 2 +- tests/test_tables.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 3272ce5..cadd2bd 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -442,7 +442,7 @@ class MarkdownConverter(object): return '\n\n' + text + '\n' def convert_caption(self, el, text, convert_as_inline): - return text + '\n' + return text + '\n\n' def convert_figcaption(self, el, text, convert_as_inline): return '\n\n' + text + '\n\n' diff --git a/tests/test_tables.py b/tests/test_tables.py index fc6eee6..dcf9ad7 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -249,6 +249,6 @@ def test_table(): assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' + assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n' From 600f77d2444233b1c5e838b69390cbbf92bde1e1 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Sat, 18 Jan 2025 
19:20:22 -0500 Subject: [PATCH 03/22] allow a wrap_width value of None for unlimited line lengths (#169) allow a wrap_width value of None to reflow text to unlimited line length --- README.rst | 1 + markdownify/__init__.py | 25 +++++++++++++------------ tests/test_conversions.py | 1 + 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index 98473c6..af87f51 100644 --- a/README.rst +++ b/README.rst @@ -143,6 +143,7 @@ wrap, wrap_width If ``wrap`` is set to ``True``, all text paragraphs are wrapped at ``wrap_width`` characters. Defaults to ``False`` and ``80``. Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs. + A `wrap_width` value of `None` reflows lines to unlimited line length. Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 53d9cb1..b70a0e5 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -400,18 +400,19 @@ class MarkdownConverter(object): # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been # replaced by spaces. - lines = text.split('\n') - new_lines = [] - for line in lines: - line = line.lstrip() - line_no_trailing = line.rstrip() - trailing = line[len(line_no_trailing):] - line = fill(line, - width=self.options['wrap_width'], - break_long_words=False, - break_on_hyphens=False) - new_lines.append(line + trailing) - text = '\n'.join(new_lines) + if self.options['wrap_width'] is not None: + lines = text.split('\n') + new_lines = [] + for line in lines: + line = line.lstrip() + line_no_trailing = line.rstrip() + trailing = line[len(line_no_trailing):] + line = fill(line, + width=self.options['wrap_width'], + break_long_words=False, + break_on_hyphens=False) + new_lines.append(line + trailing) + text = '\n'.join(new_lines) return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, convert_as_inline): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 55702e3..01f8b91 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -224,6 +224,7 @@ def test_p(): assert md('

123456789 123456789

') == '\n\n123456789 123456789\n\n' assert md('

123456789\n\n\n123456789

') == '\n\n123456789\n123456789\n\n' assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' + assert md('

123456789\n\n\n123456789

', wrap=True, wrap_width=None) == '\n\n123456789 123456789\n\n' assert md('

123456789 123456789

', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n' assert md('

Some long link

', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n' assert md('

12345
67890

', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n' From 17c3678d0e59b997207bdedb73d8677aa68a8bae Mon Sep 17 00:00:00 2001 From: chrispy Date: Mon, 30 Dec 2024 08:22:33 -0500 Subject: [PATCH 04/22] optimize empty-line handling for li and blockquote content Signed-off-by: chrispy --- markdownify/__init__.py | 42 +++++++++++++++++++++++++++++---------- tests/test_conversions.py | 2 +- tests/test_lists.py | 4 ++-- 3 files changed, 34 insertions(+), 14 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index b70a0e5..fd03569 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -5,7 +5,7 @@ import six convert_heading_re = re.compile(r'convert_h(\d+)') -line_beginning_re = re.compile(r'^', re.MULTILINE) +line_with_content_re = re.compile(r'^(.*)', flags=re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') all_whitespace_re = re.compile(r'[\t \r\n]+') newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') @@ -256,9 +256,6 @@ class MarkdownConverter(object): text = text.replace('_', r'\_') return text - def indent(self, text, columns): - return line_beginning_re.sub(' ' * columns, text) if text else '' - def underline(self, text, pad_char): text = (text or '').rstrip() return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' @@ -286,11 +283,20 @@ class MarkdownConverter(object): convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol']) def convert_blockquote(self, el, text, convert_as_inline): - + # handle some early-exit scenarios + text = (text or '').strip() if convert_as_inline: - return ' ' + text.strip() + ' ' + return ' ' + text + ' ' + if not text: + return "\n" - return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else '' + # indent lines with blockquote marker + def _indent_for_blockquote(match): + line_content = match.group(1) + return '> ' + line_content if line_content else '>' + text = 
line_with_content_re.sub(_indent_for_blockquote, text) + + return '\n' + text + '\n\n' def convert_br(self, el, text, convert_as_inline): if convert_as_inline: @@ -371,6 +377,12 @@ class MarkdownConverter(object): convert_ol = convert_list def convert_li(self, el, text, convert_as_inline): + # handle some early-exit scenarios + text = (text or '').strip() + if not text: + return "\n" + + # determine list item bullet character to use parent = el.parent if parent is not None and parent.name == 'ol': if parent.get("start") and str(parent.get("start")).isnumeric(): @@ -387,10 +399,18 @@ class MarkdownConverter(object): bullets = self.options['bullets'] bullet = bullets[depth % len(bullets)] bullet = bullet + ' ' - text = (text or '').strip() - text = self.indent(text, len(bullet)) - if text: - text = bullet + text[len(bullet):] + bullet_width = len(bullet) + bullet_indent = ' ' * bullet_width + + # indent content lines by bullet width + def _indent_for_li(match): + line_content = match.group(1) + return bullet_indent + line_content if line_content else '' + text = line_with_content_re.sub(_indent_for_li, text) + + # insert bullet into first-line indent whitespace + text = bullet + text[bullet_width:] + return '%s\n' % text def convert_p(self, el, text, convert_as_inline): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 01f8b91..868db7c 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -62,7 +62,7 @@ def test_blockquote(): def test_blockquote_with_nested_paragraph(): assert md('

Hello

') == '\n> Hello\n\n' - assert md('

Hello

Hello again

') == '\n> Hello\n> \n> Hello again\n\n' + assert md('

Hello

Hello again

') == '\n> Hello\n>\n> Hello again\n\n' def test_blockquote_with_paragraph(): diff --git a/tests/test_lists.py b/tests/test_lists.py index a660778..ce54a87 100644 --- a/tests/test_lists.py +++ b/tests/test_lists.py @@ -47,7 +47,7 @@ def test_ol(): assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' assert md('
  1. a
  2. b
') == '\n\n1. a\n2. b\n' - assert md('
  1. first para

    second para

  2. third para

    fourth para

') == '\n\n1234. first para\n \n second para\n1235. third para\n \n fourth para\n' + assert md('
  1. first para

    second para

  2. third para

    fourth para

') == '\n\n1234. first para\n\n second para\n1235. third para\n\n fourth para\n' def test_nested_ols(): @@ -64,7 +64,7 @@ def test_ul():
  • c
  • """) == '\n\n* a\n* b\n* c\n' - assert md('
    • first para

      second para

    • third para

      fourth para

    ') == '\n\n* first para\n \n second para\n* third para\n \n fourth para\n' + assert md('
    • first para

      second para

    • third para

      fourth para

    ') == '\n\n* first para\n\n second para\n* third para\n\n fourth para\n' def test_inline_ul(): From 0fb855676d6a3459df9e14b3f5911da0ebf4f129 Mon Sep 17 00:00:00 2001 From: chrispy Date: Tue, 31 Dec 2024 10:28:32 -0500 Subject: [PATCH 05/22] support HTML definition lists (
<dl>, <dt>, and <dd>
    ) Signed-off-by: chrispy --- markdownify/__init__.py | 32 ++++++++++++++++++++++++++++++++ tests/test_conversions.py | 10 ++++++++++ 2 files changed, 42 insertions(+) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index fd03569..ac53077 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -319,6 +319,38 @@ class MarkdownConverter(object): convert_kbd = convert_code + def convert_dd(self, el, text, convert_as_inline): + text = (text or '').strip() + if convert_as_inline: + return ' ' + text + ' ' + if not text: + return '\n' + + # indent definition content lines by four spaces + def _indent_for_dd(match): + line_content = match.group(1) + return ' ' + line_content if line_content else '' + text = line_with_content_re.sub(_indent_for_dd, text) + + # insert definition marker into first-line indent whitespace + text = ':' + text[1:] + + return '%s\n' % text + + def convert_dt(self, el, text, convert_as_inline): + # remove newlines from term text + text = (text or '').strip() + text = all_whitespace_re.sub(' ', text) + if convert_as_inline: + return ' ' + text + ' ' + if not text: + return '\n' + + # TODO - format consecutive
    elements as directly adjacent lines): + # https://michelf.ca/projects/php-markdown/extra/#def-list + + return '\n%s\n' % text + def _convert_hn(self, n, el, text, convert_as_inline): """ Method name prefixed with _ to prevent to call this """ if convert_as_inline: diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 868db7c..cc5ebc7 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -104,6 +104,16 @@ def test_code(): assert md('foobarbaz', sub_symbol='^') == '`foobarbaz`' +def test_dl(): + assert md('
    term
    definition
    ') == '\nterm\n: definition\n' + assert md('

    te

    rm

    definition
    ') == '\nte rm\n: definition\n' + assert md('
    term

    definition-p1

    definition-p2

    ') == '\nterm\n: definition-p1\n\n definition-p2\n' + assert md('
    term

    definition 1

    definition 2

    ') == '\nterm\n: definition 1\n: definition 2\n' + assert md('
    term 1
    definition 1
    term 2
    definition 2
    ') == '\nterm 1\n: definition 1\nterm 2\n: definition 2\n' + assert md('
    term

    line 1

    line 2

    ') == '\nterm\n: > line 1\n >\n > line 2\n' + assert md('
    term
    1. 1

      • 2a
      • 2b
    2. 3

    ') == '\nterm\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n' + + def test_del(): inline_tests('del', '~~') From 3bf0b527a44c22c1b3dd8248f374fa487a9c3ede Mon Sep 17 00:00:00 2001 From: SomeBottle Date: Sun, 19 Jan 2025 21:13:24 +0800 Subject: [PATCH 06/22] Add a new configuration option to control tabler header row inference (#161) Add option to infer first table row as table header (defaults to false) --- README.rst | 5 ++++ markdownify/__init__.py | 33 ++++++++++++++++++-------- markdownify/main.py | 4 ++++ tests/test_tables.py | 51 ++++++++++++++++++++++++++++++++++++++--- 4 files changed, 81 insertions(+), 12 deletions(-) diff --git a/README.rst b/README.rst index af87f51..34ed7e0 100644 --- a/README.rst +++ b/README.rst @@ -139,6 +139,11 @@ keep_inline_images_in that should be allowed to contain inline images, for example ``['td']``. Defaults to an empty list. +table_infer_header + Controls handling of tables with no header row (as indicated by ```` + or ````). When set to ``True``, the first body row is used as the header row. + Defaults to ``False``, which leaves the header row empty. + wrap, wrap_width If ``wrap`` is set to ``True``, all text paragraphs are wrapped at ``wrap_width`` characters. Defaults to ``False`` and ``80``. 
diff --git a/markdownify/__init__.py b/markdownify/__init__.py index ac53077..2360210 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -102,6 +102,7 @@ class MarkdownConverter(object): strong_em_symbol = ASTERISK sub_symbol = '' sup_symbol = '' + table_infer_header = False wrap = False wrap_width = 80 @@ -518,13 +519,24 @@ class MarkdownConverter(object): cells = el.find_all(['td', 'th']) is_headrow = ( all([cell.name == 'th' for cell in cells]) - or (not el.previous_sibling and not el.parent.name == 'tbody') + or (el.parent.name == 'thead' + # avoid multiple tr in thead + and len(el.parent.find_all('tr')) == 1) + ) + is_head_row_missing = ( + (not el.previous_sibling and not el.parent.name == 'tbody') or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) ) overline = '' underline = '' - if is_headrow and not el.previous_sibling: - # first row and is headline: print headline underline + if ((is_headrow + or (is_head_row_missing + and self.options['table_infer_header'])) + and not el.previous_sibling): + # first row and: + # - is headline or + # - headline is missing and header inference is enabled + # print headline underline full_colspan = 0 for cell in cells: if 'colspan' in cell.attrs and cell['colspan'].isdigit(): @@ -532,13 +544,16 @@ class MarkdownConverter(object): else: full_colspan += 1 underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' - elif (not el.previous_sibling - and (el.parent.name == 'table' - or (el.parent.name == 'tbody' - and not el.parent.previous_sibling))): + elif ((is_head_row_missing + and not self.options['table_infer_header']) + or (not el.previous_sibling + and (el.parent.name == 'table' + or (el.parent.name == 'tbody' + and not el.parent.previous_sibling)))): + # headline is missing and header inference is disabled or: # first row, not headline, and: - # - the parent is table or - # - the parent is tbody at the beginning of a table. 
+ # - the parent is table or + # - the parent is tbody at the beginning of a table. # print empty headline above this row overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n' overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' diff --git a/markdownify/main.py b/markdownify/main.py index 4e1c874..432efb5 100644 --- a/markdownify/main.py +++ b/markdownify/main.py @@ -61,6 +61,10 @@ def main(argv=sys.argv[1:]): "should be converted to markdown images instead, this option can " "be set to a list of parent tags that should be allowed to " "contain inline images.") + parser.add_argument('--table-infer-header', dest='table_infer_header', + action='store_true', + help="When a table has no header row (as indicated by '' " + "or ''), use the first body row as the header row.") parser.add_argument('-w', '--wrap', action='store_true', help="Wrap all text paragraphs at --wrap-width characters.") parser.add_argument('--wrap-width', type=int, default=80) diff --git a/tests/test_tables.py b/tests/test_tables.py index dcf9ad7..da4bf53 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -141,6 +141,33 @@ table_head_body_missing_head = """
    """ +table_head_body_multiple_head = """ + + + + + + + + + + + + + + + + + + + + + + + + +
    CreatorEditorServer
    OperatorManagerEngineer
    BobOliverTom
    ThomasLucasEthan
    """ + table_missing_text = """ @@ -245,10 +272,28 @@ def test_table(): assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body_multiple_head) == '\n\n| | | |\n| --- | --- | --- |\n| Creator | Editor | Server |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' + assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_body) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n' assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_undefined_colspan) == '\n\n| Name | 
Age |\n| --- | --- |\n| Jill | Smith |\n\n' + + +def test_table_infer_header(): + assert md(table, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_html_content, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_paragraphs, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_linebreaks, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' + assert md(table_with_header_column, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body_multiple_head, table_infer_header=True) == '\n\n| Creator | Editor | Server |\n| --- | --- | --- |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n' + assert md(table_head_body_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_text, table_infer_header=True) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 
|\n\n' + assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' + assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n' From 1b3333073a7139938a25b65631beac62804c805f Mon Sep 17 00:00:00 2001 From: Fess-AKA-DeadMonk Date: Sun, 19 Jan 2025 17:48:08 +0300 Subject: [PATCH 07/22] for convert_* functions, allow for tags with special characters in their name (like "subtag-name") (#136) support custom conversion functions for tags with `:` and `-` characters in their names by mapping them to underscores in the function name --- markdownify/__init__.py | 3 ++- tests/test_custom_converter.py | 15 +++++++++++---- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 2360210..8e90a61 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -172,7 +172,8 @@ class MarkdownConverter(object): text = text_strip + newlines + next_text_strip if not children_only: - convert_fn = getattr(self, 'convert_%s' % node.name, None) + fn_name = 'convert_%s' % node.name.translate(''.maketrans(':-', '__')) + convert_fn = getattr(self, fn_name, None) if convert_fn and self.should_convert_tag(node.name): text = convert_fn(node, text, convert_as_inline) diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py index a3e33ac..adc83f7 100644 --- a/tests/test_custom_converter.py +++ b/tests/test_custom_converter.py @@ -2,22 +2,29 @@ from markdownify import MarkdownConverter from bs4 import BeautifulSoup -class ImageBlockConverter(MarkdownConverter): +class UnitTestConverter(MarkdownConverter): """ - Create a custom MarkdownConverter that adds two newlines after an image + Create a custom MarkdownConverter 
for unit tests """ def convert_img(self, el, text, convert_as_inline): + """Add two newlines after an image""" return super().convert_img(el, text, convert_as_inline) + '\n\n' + def convert_custom_tag(self, el, text, convert_as_inline): + """Ensure conversion function is found for tags with special characters in name""" + return "FUNCTION USED: %s" % text -def test_img(): + +def test_custom_conversion_functions(): # Create shorthand method for conversion def md(html, **options): - return ImageBlockConverter(**options).convert(html) + return UnitTestConverter(**options).convert(html) assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")\n\n' assert md('Alt text') == '![Alt text](/path/to/img.jpg)\n\n' + assert md("text") == "FUNCTION USED: text" + def test_soup(): html = 'test' From 7fec8a20802b81e222211377dbdda2cb5513e3d6 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Sun, 19 Jan 2025 10:23:58 -0500 Subject: [PATCH 08/22] code simplification to remove need for children_only parameter (#174) Signed-off-by: chrispy --- markdownify/__init__.py | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 8e90a61..5de983d 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -124,9 +124,9 @@ class MarkdownConverter(object): return self.convert_soup(soup) def convert_soup(self, soup): - return self.process_tag(soup, convert_as_inline=False, children_only=True) + return self.process_tag(soup, convert_as_inline=False) - def process_tag(self, node, convert_as_inline, children_only=False): + def process_tag(self, node, convert_as_inline): text = '' # markdown headings or cells can't include @@ -135,7 +135,7 @@ class MarkdownConverter(object): isCell = node.name in ['td', 'th'] convert_children_as_inline = convert_as_inline - if not children_only and (isHeading or isCell): + if isHeading or isCell: convert_children_as_inline = True # Remove 
whitespace-only textnodes just before, after or @@ -171,14 +171,18 @@ class MarkdownConverter(object): newlines = '\n' * max(newlines_left, newlines_right) text = text_strip + newlines + next_text_strip - if not children_only: - fn_name = 'convert_%s' % node.name.translate(''.maketrans(':-', '__')) - convert_fn = getattr(self, fn_name, None) - if convert_fn and self.should_convert_tag(node.name): - text = convert_fn(node, text, convert_as_inline) + # apply this tag's final conversion function + convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name) + convert_fn = getattr(self, convert_fn_name, None) + if convert_fn and self.should_convert_tag(node.name): + text = convert_fn(node, text, convert_as_inline) return text + def convert__document_(self, el, text, convert_as_inline): + # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is + return text + def process_text(self, el): text = six.text_type(el) or '' From f24ec9e83c9443309aaa493ba6614d75f15076ec Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Tue, 21 Jan 2025 11:00:51 -0500 Subject: [PATCH 09/22] add blank line before ATX-style headings to avoid ambiguity (#178) Signed-off-by: chrispy --- markdownify/__init__.py | 4 +-- tests/test_conversions.py | 51 +++++++++++++++++++++------------------ 2 files changed, 30 insertions(+), 25 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 5de983d..776e75e 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -373,8 +373,8 @@ class MarkdownConverter(object): text = all_whitespace_re.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: - return '\n%s %s %s\n\n' % (hashes, text, hashes) - return '\n%s %s\n\n' % (hashes, text) + return '\n\n%s %s %s\n\n' % (hashes, text, hashes) + return '\n\n%s %s\n\n' % (hashes, text) def convert_hr(self, el, text, convert_as_inline): return '\n\n---\n\n' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 
cc5ebc7..ba8fcd8 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -127,12 +127,12 @@ def test_em(): def test_header_with_space(): - assert md('

    \n\nHello

    ') == '\n### Hello\n\n' - assert md('

    Hello\n\n\nWorld

    ') == '\n### Hello World\n\n' - assert md('

    \n\nHello

    ') == '\n#### Hello\n\n' - assert md('
    \n\nHello
    ') == '\n##### Hello\n\n' - assert md('
    \n\nHello\n\n
    ') == '\n##### Hello\n\n' - assert md('
    \n\nHello \n\n
    ') == '\n##### Hello\n\n' + assert md('

    \n\nHello

    ') == '\n\n### Hello\n\n' + assert md('

    Hello\n\n\nWorld

    ') == '\n\n### Hello World\n\n' + assert md('

    \n\nHello

    ') == '\n\n#### Hello\n\n' + assert md('
    \n\nHello
    ') == '\n\n##### Hello\n\n' + assert md('
    \n\nHello\n\n
    ') == '\n\n##### Hello\n\n' + assert md('
    \n\nHello \n\n
    ') == '\n\n##### Hello\n\n' def test_h1(): @@ -144,24 +144,24 @@ def test_h2(): def test_hn(): - assert md('

    Hello

    ') == '\n### Hello\n\n' - assert md('

    Hello

    ') == '\n#### Hello\n\n' - assert md('
    Hello
    ') == '\n##### Hello\n\n' - assert md('
    Hello
    ') == '\n###### Hello\n\n' + assert md('

    Hello

    ') == '\n\n### Hello\n\n' + assert md('

    Hello

    ') == '\n\n#### Hello\n\n' + assert md('
    Hello
    ') == '\n\n##### Hello\n\n' + assert md('
    Hello
    ') == '\n\n###### Hello\n\n' assert md('Hello') == md('
    Hello
    ') assert md('Hello') == md('Hello') def test_hn_chained(): - assert md('

    First

    \n

    Second

    \n

    Third

    ', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n' - assert md('X

    First

    ', heading_style=ATX) == 'X\n# First\n\n' - assert md('X

    First

    ', heading_style=ATX_CLOSED) == 'X\n# First #\n\n' + assert md('

    First

    \n

    Second

    \n

    Third

    ', heading_style=ATX) == '\n\n# First\n\n## Second\n\n### Third\n\n' + assert md('X

    First

    ', heading_style=ATX) == 'X\n\n# First\n\n' + assert md('X

    First

    ', heading_style=ATX_CLOSED) == 'X\n\n# First #\n\n' assert md('X

    First

    ') == 'X\n\nFirst\n=====\n\n' def test_hn_nested_tag_heading_style(): - assert md('

    A

    P

    C

    ', heading_style=ATX_CLOSED) == '\n# A P C #\n\n' - assert md('

    A

    P

    C

    ', heading_style=ATX) == '\n# A P C\n\n' + assert md('

    A

    P

    C

    ', heading_style=ATX_CLOSED) == '\n\n# A P C #\n\n' + assert md('

    A

    P

    C

    ', heading_style=ATX) == '\n\n# A P C\n\n' def test_hn_nested_simple_tag(): @@ -177,9 +177,9 @@ def test_hn_nested_simple_tag(): ] for tag, markdown in tag_to_markdown: - assert md('

    A <' + tag + '>' + tag + ' B

    ') == '\n### A ' + markdown + ' B\n\n' + assert md('

    A <' + tag + '>' + tag + ' B

    ') == '\n\n### A ' + markdown + ' B\n\n' - assert md('

    A
    B

    ', heading_style=ATX) == '\n### A B\n\n' + assert md('

    A
    B

    ', heading_style=ATX) == '\n\n### A B\n\n' # Nested lists not supported # assert md('

    A
    • li1
    • l2

    ', heading_style=ATX) == '\n### A li1 li2 B\n\n' @@ -192,18 +192,23 @@ def test_hn_nested_img(): ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""), ] for image_attributes, markdown, title in image_attributes_to_markdown: - assert md('

    A B

    ') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' - assert md('

    A B

    ', keep_inline_images_in=['h3']) == '\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' + assert md('

    A B

    ') == '\n\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n' + assert md('

    A B

    ', keep_inline_images_in=['h3']) == '\n\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n' def test_hn_atx_headings(): - assert md('

    Hello

    ', heading_style=ATX) == '\n# Hello\n\n' - assert md('

    Hello

    ', heading_style=ATX) == '\n## Hello\n\n' + assert md('

    Hello

    ', heading_style=ATX) == '\n\n# Hello\n\n' + assert md('

    Hello

    ', heading_style=ATX) == '\n\n## Hello\n\n' def test_hn_atx_closed_headings(): - assert md('

    Hello

    ', heading_style=ATX_CLOSED) == '\n# Hello #\n\n' - assert md('

    Hello

    ', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n' + assert md('

    Hello

    ', heading_style=ATX_CLOSED) == '\n\n# Hello #\n\n' + assert md('

    Hello

    ', heading_style=ATX_CLOSED) == '\n\n## Hello ##\n\n' + + +def test_hn_newlines(): + assert md("

    H1-1

    TEXT

    H2-2

    TEXT

    H1-2

    TEXT", heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT' + assert md('

    H1-1

    \n

    TEXT

    \n

    H2-2

    \n

    TEXT

    \n

    H1-2

    \n

    TEXT

    ', heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT\n\n' def test_head(): From dbb598880285d6e3f9489f7d53f95fdebb6281ac Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Tue, 21 Jan 2025 11:01:11 -0500 Subject: [PATCH 10/22] add blank line before/after preformatted block (#179) Signed-off-by: chrispy --- markdownify/__init__.py | 2 +- tests/test_conversions.py | 52 +++++++++++++++++++++------------------ 2 files changed, 29 insertions(+), 25 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 776e75e..ef4e7ca 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -481,7 +481,7 @@ class MarkdownConverter(object): if self.options['code_language_callback']: code_language = self.options['code_language_callback'](el) or code_language - return '\n```%s\n%s\n```\n' % (code_language, text) + return '\n\n```%s\n%s\n```\n\n' % (code_language, text) def convert_script(self, el, text, convert_as_inline): return '' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index ba8fcd8..05c6cd4 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -41,7 +41,7 @@ def test_a_no_autolinks(): def test_a_in_code(): assert md('Google') == '`Google`' - assert md('
    Google
    ') == '\n```\nGoogle\n```\n' + assert md('
    Google
    ') == '\n\n```\nGoogle\n```\n\n' def test_b(): @@ -256,23 +256,27 @@ def test_p(): def test_pre(): - assert md('
    test\n    foo\nbar
    ') == '\n```\ntest\n foo\nbar\n```\n' - assert md('
    test\n    foo\nbar
    ') == '\n```\ntest\n foo\nbar\n```\n' - assert md('
    *this_should_not_escape*
    ') == '\n```\n*this_should_not_escape*\n```\n' - assert md('
    *this_should_not_escape*
    ') == '\n```\n*this_should_not_escape*\n```\n' - assert md('
    \t\tthis  should\t\tnot  normalize
    ') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' - assert md('
    \t\tthis  should\t\tnot  normalize
    ') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbaz
    ') == '\n```\nfoo\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ', sup_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n' - assert md('
    foo\nbar\nbaz
    ', sub_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n' + assert md('
    test\n    foo\nbar
    ') == '\n\n```\ntest\n foo\nbar\n```\n\n' + assert md('
    test\n    foo\nbar
    ') == '\n\n```\ntest\n foo\nbar\n```\n\n' + assert md('
    *this_should_not_escape*
    ') == '\n\n```\n*this_should_not_escape*\n```\n\n' + assert md('
    *this_should_not_escape*
    ') == '\n\n```\n*this_should_not_escape*\n```\n\n' + assert md('
    \t\tthis  should\t\tnot  normalize
    ') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' + assert md('
    \t\tthis  should\t\tnot  normalize
    ') == '\n\n```\n\t\tthis should\t\tnot normalize\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbaz
    ') == '\n\n```\nfoo\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ', sup_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + assert md('
    foo\nbar\nbaz
    ', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n' + + assert md('foo
    bar
    baz', sub_symbol='^') == 'foo\n\n```\nbar\n```\n\nbaz' + assert md("

    foo

    \n
    bar
    \n

    baz

    ", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz" def test_script(): @@ -315,17 +319,17 @@ def test_sup(): def test_lang(): - assert md('
    test\n    foo\nbar
    ', code_language='python') == '\n```python\ntest\n foo\nbar\n```\n' - assert md('
    test\n    foo\nbar
    ', code_language='javascript') == '\n```javascript\ntest\n foo\nbar\n```\n' + assert md('
    test\n    foo\nbar
    ', code_language='python') == '\n\n```python\ntest\n foo\nbar\n```\n\n' + assert md('
    test\n    foo\nbar
    ', code_language='javascript') == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' def test_lang_callback(): def callback(el): return el['class'][0] if el.has_attr('class') else None - assert md('
    test\n    foo\nbar
    ', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n' - assert md('
    test\n    foo\nbar
    ', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n' - assert md('
    test\n    foo\nbar
    ', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n' + assert md('
    test\n    foo\nbar
    ', code_language_callback=callback) == '\n\n```python\ntest\n foo\nbar\n```\n\n' + assert md('
    test\n    foo\nbar
    ', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' + assert md('
    test\n    foo\nbar
    ', code_language_callback=callback) == '\n\n```javascript\ntest\n foo\nbar\n```\n\n' def test_spaces(): @@ -335,4 +339,4 @@ def test_spaces(): assert md('test
    text
    after') == 'test\n> text\n\nafter' assert md('
    1. x
    2. y
    ') == '\n\n1. x\n2. y\n' assert md('
    • x
    • y
    • ') == '\n\n* x\n* y\n' - assert md('test
       foo 
      bar') == 'test\n```\n foo \n```\nbar' + assert md('test
       foo 
      bar') == 'test\n\n```\n foo \n```\n\nbar' From ae0597d80cb57983e876d13fdd44a7728abcbe26 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Mon, 27 Jan 2025 11:55:32 -0500 Subject: [PATCH 11/22] remove superfluous leading/trailing whitespace (#181) --- README.rst | 7 +++++++ markdownify/__init__.py | 26 ++++++++++++++++++++++---- tests/test_advanced.py | 2 +- tests/test_args.py | 11 ++++++++++- tests/test_basic.py | 2 +- tests/test_conversions.py | 14 ++++++++------ tests/test_custom_converter.py | 4 ++-- tests/test_escaping.py | 2 +- tests/test_lists.py | 2 +- tests/test_tables.py | 7 +++++-- tests/utils.py | 9 +++++++++ 11 files changed, 67 insertions(+), 19 deletions(-) create mode 100644 tests/utils.py diff --git a/README.rst b/README.rst index 34ed7e0..b37a503 100644 --- a/README.rst +++ b/README.rst @@ -150,6 +150,13 @@ wrap, wrap_width Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs. A `wrap_width` value of `None` reflows lines to unlimited line length. +strip_document + Controls whether leading and/or trailing separation newlines are removed from + the final converted document. Supported values are ``LSTRIP`` (leading), + ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines + within the document are unaffected. + Defaults to ``STRIP``. + Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. 
diff --git a/markdownify/__init__.py b/markdownify/__init__.py index ef4e7ca..7d14fe7 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -26,6 +26,11 @@ BACKSLASH = 'backslash' ASTERISK = '*' UNDERSCORE = '_' +# Document strip styles +LSTRIP = 'lstrip' +RSTRIP = 'rstrip' +STRIP = 'strip' + def chomp(text): """ @@ -99,6 +104,7 @@ class MarkdownConverter(object): keep_inline_images_in = [] newline_style = SPACES strip = None + strip_document = STRIP strong_em_symbol = ASTERISK sub_symbol = '' sup_symbol = '' @@ -180,7 +186,18 @@ class MarkdownConverter(object): return text def convert__document_(self, el, text, convert_as_inline): - # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is + """Final document-level formatting for BeautifulSoup object (node.name == "[document]")""" + if self.options['strip_document'] == LSTRIP: + text = text.lstrip('\n') # remove leading separation newlines + elif self.options['strip_document'] == RSTRIP: + text = text.rstrip('\n') # remove trailing separation newlines + elif self.options['strip_document'] == STRIP: + text = text.strip('\n') # remove leading and trailing separation newlines + elif self.options['strip_document'] is None: + pass # leave leading and trailing separation newlines as-is + else: + raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document']) + return text def process_text(self, el): @@ -454,6 +471,7 @@ class MarkdownConverter(object): def convert_p(self, el, text, convert_as_inline): if convert_as_inline: return ' ' + text.strip() + ' ' + text = text.strip() if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
      tags. Newlines in the input have already been @@ -500,13 +518,13 @@ class MarkdownConverter(object): convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) def convert_table(self, el, text, convert_as_inline): - return '\n\n' + text + '\n' + return '\n\n' + text.strip() + '\n\n' def convert_caption(self, el, text, convert_as_inline): - return text + '\n\n' + return text.strip() + '\n\n' def convert_figcaption(self, el, text, convert_as_inline): - return '\n\n' + text + '\n\n' + return '\n\n' + text.strip() + '\n\n' def convert_td(self, el, text, convert_as_inline): colspan = 1 diff --git a/tests/test_advanced.py b/tests/test_advanced.py index a3a5fda..6123d8c 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md def test_chomp(): diff --git a/tests/test_args.py b/tests/test_args.py index ebce4a8..301c19f 100644 --- a/tests/test_args.py +++ b/tests/test_args.py @@ -2,7 +2,8 @@ Test whitelisting/blacklisting of specific tags. """ -from markdownify import markdownify as md +from markdownify import markdownify, LSTRIP, RSTRIP, STRIP +from .utils import md def test_strip(): @@ -23,3 +24,11 @@ def test_convert(): def test_do_not_convert(): text = md('Some Text', convert=[]) assert text == 'Some Text' + + +def test_strip_document(): + assert markdownify("

      Hello

      ") == "Hello" # test default of STRIP + assert markdownify("

      Hello

      ", strip_document=LSTRIP) == "Hello\n\n" + assert markdownify("

      Hello

      ", strip_document=RSTRIP) == "\n\nHello" + assert markdownify("

      Hello

      ", strip_document=STRIP) == "Hello" + assert markdownify("

      Hello

      ", strip_document=None) == "\n\nHello\n\n" diff --git a/tests/test_basic.py b/tests/test_basic.py index 66f8b6c..584adb9 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md def test_single_tag(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 05c6cd4..1367006 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,5 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE +from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE +from .utils import md def inline_tests(tag, markup): @@ -79,11 +80,6 @@ def test_br(): assert md('a
      b
      c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' -def test_caption(): - assert md('TEXT
      Caption
      SPAN
      ') == 'TEXT\n\nCaption\n\nSPAN' - assert md('
      SPAN
      Caption
      TEXT') == 'SPAN\n\nCaption\n\nTEXT' - - def test_code(): inline_tests('code', '`') assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' @@ -126,6 +122,11 @@ def test_em(): inline_tests('em', '*') +def test_figcaption(): + assert (md("TEXT
      \nCaption\n
      SPAN
      ") == "TEXT\n\nCaption\n\nSPAN") + assert (md("
      SPAN
      \nCaption\n
      TEXT") == "SPAN\n\nCaption\n\nTEXT") + + def test_header_with_space(): assert md('

      \n\nHello

      ') == '\n\n### Hello\n\n' assert md('

      Hello\n\n\nWorld

      ') == '\n\n### Hello World\n\n' @@ -236,6 +237,7 @@ def test_kbd(): def test_p(): assert md('

      hello

      ') == '\n\nhello\n\n' + assert md("

      hello

      ") == "\n\nhello\n\n" assert md('

      123456789 123456789

      ') == '\n\n123456789 123456789\n\n' assert md('

      123456789\n\n\n123456789

      ') == '\n\n123456789\n123456789\n\n' assert md('

      123456789\n\n\n123456789

      ', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n' diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py index adc83f7..0d3f6af 100644 --- a/tests/test_custom_converter.py +++ b/tests/test_custom_converter.py @@ -20,8 +20,8 @@ def test_custom_conversion_functions(): def md(html, **options): return UnitTestConverter(**options).convert(html) - assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")\n\n' - assert md('Alt text') == '![Alt text](/path/to/img.jpg)\n\n' + assert md('Alt texttext') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext' + assert md('Alt texttext') == '![Alt text](/path/to/img.jpg)\n\ntext' assert md("text") == "FUNCTION USED: text" diff --git a/tests/test_escaping.py b/tests/test_escaping.py index 878760a..d213675 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -1,6 +1,6 @@ import warnings from bs4 import MarkupResemblesLocatorWarning -from markdownify import markdownify as md +from .utils import md def test_asterisks(): diff --git a/tests/test_lists.py b/tests/test_lists.py index ce54a87..6b320ca 100644 --- a/tests/test_lists.py +++ b/tests/test_lists.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md nested_uls = """ diff --git a/tests/test_tables.py b/tests/test_tables.py index da4bf53..e41b389 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -1,4 +1,4 @@ -from markdownify import markdownify as md +from .utils import md table = """
    @@ -228,7 +228,10 @@ table_body = """
    """ -table_with_caption = """TEXT +table_with_caption = """TEXT
    Caption
    + diff --git a/tests/utils.py b/tests/utils.py new file mode 100644 index 0000000..0dac580 --- /dev/null +++ b/tests/utils.py @@ -0,0 +1,9 @@ +from markdownify import MarkdownConverter + + +# for unit testing, disable document-level stripping by default so that +# separation newlines are included in testing +def md(html, **options): + options = {"strip_document": None, **options} + + return MarkdownConverter(**options).convert(html) From d0c4b85fd513efc5efa3128c565a5a49ac6ef867 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Tue, 4 Feb 2025 15:20:42 -0500 Subject: [PATCH 12/22] simplify computation of convert_children_as_inline variable (#182) Signed-off-by: chrispy --- markdownify/__init__.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 7d14fe7..82e8ab5 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -135,14 +135,13 @@ class MarkdownConverter(object): def process_tag(self, node, convert_as_inline): text = '' - # markdown headings or cells can't include - # block elements (elements w/newlines) - isHeading = html_heading_re.match(node.name) is not None - isCell = node.name in ['td', 'th'] - convert_children_as_inline = convert_as_inline - - if isHeading or isCell: - convert_children_as_inline = True + # For Markdown headings and table cells, convert children as inline + # (so that block element children do not produce newlines). + convert_children_as_inline = ( + convert_as_inline # propagated from parent + or html_heading_re.match(node.name) is not None # headings + or node.name in ['td', 'th'] # table cells + ) # Remove whitespace-only textnodes just before, after or # inside block-level elements. From c52a50e66a916791f825262f60f96a206287bf30 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Tue, 4 Feb 2025 15:39:32 -0500 Subject: [PATCH 13/22] when computing
    1. numbering, ignore non-
    2. previous siblings (#183) Signed-off-by: chrispy --- markdownify/__init__.py | 2 +- tests/test_lists.py | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 82e8ab5..93a889f 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -443,7 +443,7 @@ class MarkdownConverter(object): start = int(parent.get("start")) else: start = 1 - bullet = '%s.' % (start + parent.index(el)) + bullet = '%s.' % (start + len(el.find_previous_siblings('li'))) else: depth = -1 while el: diff --git a/tests/test_lists.py b/tests/test_lists.py index 6b320ca..e9480ab 100644 --- a/tests/test_lists.py +++ b/tests/test_lists.py @@ -42,6 +42,7 @@ nested_ols = """ def test_ol(): assert md('
      1. a
      2. b
      ') == '\n\n1. a\n2. b\n' + assert md('
      1. a
      2. b
      ') == '\n\n1. a\n2. b\n' assert md('
      1. a
      2. b
      ') == '\n\n3. a\n4. b\n' assert md('foo
      1. a
      2. b
      bar') == 'foo\n\n3. a\n4. b\n\nbar' assert md('
      1. a
      2. b
      ') == '\n\n1. a\n2. b\n' From 3026602686f9a77ba0b2e0f6e0cbd42daea978f5 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Tue, 4 Feb 2025 18:09:24 -0500 Subject: [PATCH 14/22] make conversion non-destructive to soup; improve div/article/section handling (#184) Signed-off-by: chrispy --- markdownify/__init__.py | 111 ++++++++++++++++++++++++++++---------- tests/test_basic.py | 2 +- tests/test_conversions.py | 15 +++++- 3 files changed, 98 insertions(+), 30 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 93a889f..9e4c99f 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -1,4 +1,4 @@ -from bs4 import BeautifulSoup, NavigableString, Comment, Doctype +from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag from textwrap import fill import re import six @@ -79,6 +79,7 @@ def should_remove_whitespace_inside(el): if html_heading_re.match(el.name) is not None: return True return el.name in ('p', 'blockquote', + 'article', 'div', 'section', 'ol', 'ul', 'li', 'table', 'thead', 'tbody', 'tfoot', 'tr', 'td', 'th') @@ -89,6 +90,41 @@ def should_remove_whitespace_outside(el): return should_remove_whitespace_inside(el) or (el and el.name == 'pre') +def _is_block_content_element(el): + """ + In a block context, returns: + + - True for content elements (tags and non-whitespace text) + - False for non-content elements (whitespace text, comments, doctypes) + """ + if isinstance(el, Tag): + return True + elif isinstance(el, (Comment, Doctype)): + return False # (subclasses of NavigableString, must test first) + elif isinstance(el, NavigableString): + return el.strip() != '' + else: + return False + + +def _prev_block_content_sibling(el): + """Returns the first previous sibling that is a content element, else None.""" + while el is not None: + el = el.previous_sibling + if _is_block_content_element(el): + return el + return None + + +def _next_block_content_sibling(el): + """Returns the first next sibling 
that is a content element, else None.""" + while el is not None: + el = el.next_sibling + if _is_block_content_element(el): + return el + return None + + class MarkdownConverter(object): class DefaultOptions: autolinks = True @@ -143,29 +179,38 @@ class MarkdownConverter(object): or node.name in ['td', 'th'] # table cells ) - # Remove whitespace-only textnodes just before, after or - # inside block-level elements. + # Collect child elements to process, ignoring whitespace-only text elements + # adjacent to the inner/outer boundaries of block elements. should_remove_inside = should_remove_whitespace_inside(node) - for el in node.children: - # Only extract (remove) whitespace-only text node if any of the - # conditions is true: - # - el is the first element in its parent (block-level) - # - el is the last element in its parent (block-level) - # - el is adjacent to a block-level node - can_extract = (should_remove_inside and (not el.previous_sibling - or not el.next_sibling) - or should_remove_whitespace_outside(el.previous_sibling) - or should_remove_whitespace_outside(el.next_sibling)) - if (isinstance(el, NavigableString) - and six.text_type(el).strip() == '' - and can_extract): - el.extract() + + def _can_ignore(el): + if isinstance(el, Tag): + # Tags are always processed. + return False + elif isinstance(el, (Comment, Doctype)): + # Comment and Doctype elements are always ignored. + # (subclasses of NavigableString, must test first) + return True + elif isinstance(el, NavigableString): + if six.text_type(el).strip() != '': + # Non-whitespace text nodes are always processed. + return False + elif should_remove_inside and (not el.previous_sibling or not el.next_sibling): + # Inside block elements (excluding
      <pre>), ignore adjacent whitespace elements.
      +                    return True
      +                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
      +                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
      +                    return True
      +                else:
      +                    return False
      +            else:
      +                raise ValueError('Unexpected element type: %s' % type(el))
      +
      +        children_to_convert = [child for child in node.children if not _can_ignore(child)]
       
               # Convert the children first
      -        for el in node.children:
      -            if isinstance(el, Comment) or isinstance(el, Doctype):
      -                continue
      -            elif isinstance(el, NavigableString):
      +        for el in children_to_convert:
      +            if isinstance(el, NavigableString):
                       text += self.process_text(el)
                   else:
                       text_strip = text.rstrip('\n')
      @@ -337,6 +382,16 @@ class MarkdownConverter(object):
       
           convert_del = abstract_inline_conversion(lambda self: '~~')
       
      +    def convert_div(self, el, text, convert_as_inline):
      +        if convert_as_inline:
      +            return ' ' + text.strip() + ' '
      +        text = text.strip()
      +        return '\n\n%s\n\n' % text if text else ''
      +
      +    convert_article = convert_div
      +
      +    convert_section = convert_div
      +
           convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
       
           convert_kbd = convert_code
      @@ -415,7 +470,8 @@ class MarkdownConverter(object):
       
               nested = False
               before_paragraph = False
      -        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
      +        next_sibling = _next_block_content_sibling(el)
      +        if next_sibling and next_sibling.name not in ['ul', 'ol']:
                   before_paragraph = True
               while el:
                   if el.name == 'li':
      @@ -539,6 +595,7 @@ class MarkdownConverter(object):
       
           def convert_tr(self, el, text, convert_as_inline):
               cells = el.find_all(['td', 'th'])
      +        is_first_row = el.find_previous_sibling() is None
               is_headrow = (
                   all([cell.name == 'th' for cell in cells])
                   or (el.parent.name == 'thead'
      @@ -546,15 +603,15 @@ class MarkdownConverter(object):
                       and len(el.parent.find_all('tr')) == 1)
               )
               is_head_row_missing = (
      -            (not el.previous_sibling and not el.parent.name == 'tbody')
      -            or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
      +            (is_first_row and not el.parent.name == 'tbody')
      +            or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
               )
               overline = ''
               underline = ''
               if ((is_headrow
                    or (is_head_row_missing
                        and self.options['table_infer_header']))
      -                and not el.previous_sibling):
      +                and is_first_row):
                   # first row and:
                   # - is headline or
                   # - headline is missing and header inference is enabled
      @@ -568,10 +625,10 @@ class MarkdownConverter(object):
                   underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
               elif ((is_head_row_missing
                      and not self.options['table_infer_header'])
      -              or (not el.previous_sibling
      +              or (is_first_row
                         and (el.parent.name == 'table'
                              or (el.parent.name == 'tbody'
      -                           and not el.parent.previous_sibling)))):
      +                           and not el.parent.find_previous_sibling())))):
                   # headline is missing and header inference is disabled or:
                   # first row, not headline, and:
                   #  - the parent is table or
      diff --git a/tests/test_basic.py b/tests/test_basic.py
      index 584adb9..9be524e 100644
      --- a/tests/test_basic.py
      +++ b/tests/test_basic.py
      @@ -6,7 +6,7 @@ def test_single_tag():
       
       
       def test_soup():
      -    assert md('
      Hello
      ') == 'Hello' + assert md('
      Hello
      ') == '\n\nHello\n\n' def test_whitespace(): diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 1367006..1739cb9 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -114,8 +114,19 @@ def test_del(): inline_tests('del', '~~') -def test_div(): - assert md('Hello World') == 'Hello World' +def test_div_section_article(): + for tag in ['div', 'section', 'article']: + assert md(f'<{tag}>456') == '\n\n456\n\n' + assert md(f'123<{tag}>456789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>\n 456 \n789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>

      456

      789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>\n

      456

      \n789') == '123\n\n456\n\n789' + assert md(f'123<{tag}>
      4 5 6
      789') == '123\n\n```\n4 5 6\n```\n\n789' + assert md(f'123<{tag}>\n
      4 5 6
      \n789') == '123\n\n```\n4 5 6\n```\n\n789' + assert md(f'123<{tag}>4\n5\n6789') == '123\n\n4\n5\n6\n\n789' + assert md(f'123<{tag}>\n4\n5\n6\n789') == '123\n\n4\n5\n6\n\n789' + assert md(f'123<{tag}>\n

      \n4\n5\n6\n

      \n789') == '123\n\n4\n5\n6\n\n789' + assert md(f'<{tag}>

      title

      body', heading_style=ATX) == '\n\n# title\n\nbody\n\n' def test_em(): From c52ba47166acb1225dccc58ed814d89197f073c3 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Mon, 17 Feb 2025 08:44:01 -0500 Subject: [PATCH 15/22] use list-based processing (inspired by AlextheYounga) (#186) --- markdownify/__init__.py | 57 +++++++++++++++++++++++++++++++---------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 9e4c99f..e2cacd9 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -11,6 +11,10 @@ all_whitespace_re = re.compile(r'[\t \r\n]+') newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') html_heading_re = re.compile(r'h[1-6]') +# extract (leading_nl, content, trailing_nl) from a string +# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) +extract_newlines_re = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) + # Heading styles ATX = 'atx' @@ -168,6 +172,12 @@ class MarkdownConverter(object): def convert_soup(self, soup): return self.process_tag(soup, convert_as_inline=False) + def process_element(self, node, convert_as_inline): + if isinstance(node, NavigableString): + return self.process_text(node) + else: + return self.process_tag(node, convert_as_inline) + def process_tag(self, node, convert_as_inline): text = '' @@ -203,23 +213,44 @@ class MarkdownConverter(object): return True else: return False + elif el is None: + return True else: raise ValueError('Unexpected element type: %s' % type(el)) - children_to_convert = [child for child in node.children if not _can_ignore(child)] + children_to_convert = [el for el in node.children if not _can_ignore(el)] - # Convert the children first - for el in children_to_convert: - if isinstance(el, NavigableString): - text += self.process_text(el) - else: - text_strip = text.rstrip('\n') - newlines_left = len(text) - len(text_strip) - next_text = self.process_tag(el, 
convert_children_as_inline) - next_text_strip = next_text.lstrip('\n') - newlines_right = len(next_text) - len(next_text_strip) - newlines = '\n' * max(newlines_left, newlines_right) - text = text_strip + newlines + next_text_strip + # Convert the children elements into a list of result strings. + child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert] + + # Remove empty string values. + child_strings = [s for s in child_strings if s] + + # Collapse newlines at child element boundaries, if needed. + if node.name == 'pre' or node.find_parent('pre'): + # Inside
       <pre> blocks, do not collapse newlines.
      +            pass
      +        else:
      +            # Collapse newlines at child element boundaries.
      +            updated_child_strings = ['']  # so the first lookback works
      +            for child_string in child_strings:
      +                # Separate the leading/trailing newlines from the content.
      +                leading_nl, content, trailing_nl = extract_newlines_re.match(child_string).groups()
      +
      +                # If the last child had trailing newlines and this child has leading newlines,
      +                # use the larger newline count, limited to 2.
      +                if updated_child_strings[-1] and leading_nl:
      +                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
      +                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
      +                    leading_nl = '\n' * num_newlines
      +
      +                # Add the results to the updated child string list.
      +                updated_child_strings.extend([leading_nl, content, trailing_nl])
      +
      +            child_strings = updated_child_strings
      +
      +        # Join all child text strings into a single string.
      +        text = ''.join(child_strings)
       
               # apply this tag's final conversion function
               convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
      
      From 5655f27208d283b82327c3d3a855e91d012ad222 Mon Sep 17 00:00:00 2001
      From: Chris Papademetrious 
      Date: Tue, 18 Feb 2025 16:35:36 -0500
      Subject: [PATCH 16/22] propagate parent tag context downward to improve
       runtime (#191)
      
      ---
       README.rst                     |   8 +-
       markdownify/__init__.py        | 143 ++++++++++++++++++---------------
       tests/test_custom_converter.py |   6 +-
       3 files changed, 84 insertions(+), 73 deletions(-)
      
      diff --git a/README.rst b/README.rst
      index b37a503..c6c6d84 100644
      --- a/README.rst
      +++ b/README.rst
      @@ -180,7 +180,7 @@ If you have a special usecase that calls for a special conversion, you can
       always inherit from ``MarkdownConverter`` and override the method you want to
       change.
       The function that handles a HTML tag named ``abc`` is called
      -``convert_abc(self, el, text, convert_as_inline)`` and returns a string
      +``convert_abc(self, el, text, parent_tags)`` and returns a string
       containing the converted HTML tag.
       The ``MarkdownConverter`` object will handle the conversion based on the
       function names:
      @@ -193,8 +193,8 @@ function names:
               """
               Create a custom MarkdownConverter that adds two newlines after an image
               """
      -        def convert_img(self, el, text, convert_as_inline):
      -            return super().convert_img(el, text, convert_as_inline) + '\n\n'
      +        def convert_img(self, el, text, parent_tags):
      +            return super().convert_img(el, text, parent_tags) + '\n\n'
       
           # Create shorthand method for conversion
           def md(html, **options):
      @@ -208,7 +208,7 @@ function names:
               """
               Create a custom MarkdownConverter that ignores paragraphs
               """
      -        def convert_p(self, el, text, convert_as_inline):
      +        def convert_p(self, el, text, parent_tags):
                   return ''
       
           # Create shorthand method for conversion
      diff --git a/markdownify/__init__.py b/markdownify/__init__.py
      index e2cacd9..79ba8e7 100644
      --- a/markdownify/__init__.py
      +++ b/markdownify/__init__.py
      @@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
           the text if it looks like an HTML tag. markup_fn is necessary to allow for
           references to self.strong_em_symbol etc.
           """
      -    def implementation(self, el, text, convert_as_inline):
      +    def implementation(self, el, text, parent_tags):
               markup_prefix = markup_fn(self)
               if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
                   markup_suffix = ' to call this """
      -        if convert_as_inline:
      +        if '_inline' in parent_tags:
                   return text
       
               # prevent MemoryErrors in case of very large n
      @@ -478,46 +495,40 @@ class MarkdownConverter(object):
                   return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
               return '\n\n%s %s\n\n' % (hashes, text)
       
      -    def convert_hr(self, el, text, convert_as_inline):
      +    def convert_hr(self, el, text, parent_tags):
               return '\n\n---\n\n'
       
           convert_i = convert_em
       
      -    def convert_img(self, el, text, convert_as_inline):
      +    def convert_img(self, el, text, parent_tags):
               alt = el.attrs.get('alt', None) or ''
               src = el.attrs.get('src', None) or ''
               title = el.attrs.get('title', None) or ''
               title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
      -        if (convert_as_inline
      +        if ('_inline' in parent_tags
                       and el.parent.name not in self.options['keep_inline_images_in']):
                   return alt
       
               return '![%s](%s%s)' % (alt, src, title_part)
       
      -    def convert_list(self, el, text, convert_as_inline):
      +    def convert_list(self, el, text, parent_tags):
       
               # Converting a list to inline is undefined.
      -        # Ignoring convert_to_inline for list.
      +        # Ignoring inline conversion parents for list.
       
      -        nested = False
               before_paragraph = False
               next_sibling = _next_block_content_sibling(el)
               if next_sibling and next_sibling.name not in ['ul', 'ol']:
                   before_paragraph = True
      -        while el:
      -            if el.name == 'li':
      -                nested = True
      -                break
      -            el = el.parent
      -        if nested:
      -            # remove trailing newline if nested
      +        if 'li' in parent_tags:
      +            # remove trailing newline if we're in a nested list
                   return '\n' + text.rstrip()
               return '\n\n' + text + ('\n' if before_paragraph else '')
       
           convert_ul = convert_list
           convert_ol = convert_list
       
      -    def convert_li(self, el, text, convert_as_inline):
      +    def convert_li(self, el, text, parent_tags):
               # handle some early-exit scenarios
               text = (text or '').strip()
               if not text:
      @@ -554,8 +565,8 @@ class MarkdownConverter(object):
       
               return '%s\n' % text
       
      -    def convert_p(self, el, text, convert_as_inline):
      -        if convert_as_inline:
      +    def convert_p(self, el, text, parent_tags):
      +        if '_inline' in parent_tags:
                   return ' ' + text.strip() + ' '
               text = text.strip()
               if self.options['wrap']:
      @@ -577,7 +588,7 @@ class MarkdownConverter(object):
                       text = '\n'.join(new_lines)
               return '\n\n%s\n\n' % text if text else ''
       
      -    def convert_pre(self, el, text, convert_as_inline):
      +    def convert_pre(self, el, text, parent_tags):
               if not text:
                   return ''
               code_language = self.options['code_language']
      @@ -587,10 +598,10 @@ class MarkdownConverter(object):
       
               return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
       
      -    def convert_script(self, el, text, convert_as_inline):
      +    def convert_script(self, el, text, parent_tags):
               return ''
       
      -    def convert_style(self, el, text, convert_as_inline):
      +    def convert_style(self, el, text, parent_tags):
               return ''
       
           convert_s = convert_del
      @@ -603,28 +614,28 @@ class MarkdownConverter(object):
       
           convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
       
      -    def convert_table(self, el, text, convert_as_inline):
      +    def convert_table(self, el, text, parent_tags):
               return '\n\n' + text.strip() + '\n\n'
       
      -    def convert_caption(self, el, text, convert_as_inline):
      +    def convert_caption(self, el, text, parent_tags):
               return text.strip() + '\n\n'
       
      -    def convert_figcaption(self, el, text, convert_as_inline):
      +    def convert_figcaption(self, el, text, parent_tags):
               return '\n\n' + text.strip() + '\n\n'
       
      -    def convert_td(self, el, text, convert_as_inline):
      +    def convert_td(self, el, text, parent_tags):
               colspan = 1
               if 'colspan' in el.attrs and el['colspan'].isdigit():
                   colspan = int(el['colspan'])
               return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
       
      -    def convert_th(self, el, text, convert_as_inline):
      +    def convert_th(self, el, text, parent_tags):
               colspan = 1
               if 'colspan' in el.attrs and el['colspan'].isdigit():
                   colspan = int(el['colspan'])
               return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
       
      -    def convert_tr(self, el, text, convert_as_inline):
      +    def convert_tr(self, el, text, parent_tags):
               cells = el.find_all(['td', 'th'])
               is_first_row = el.find_previous_sibling() is None
               is_headrow = (
      diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py
      index 0d3f6af..f4734c9 100644
      --- a/tests/test_custom_converter.py
      +++ b/tests/test_custom_converter.py
      @@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
           """
           Create a custom MarkdownConverter for unit tests
           """
      -    def convert_img(self, el, text, convert_as_inline):
      +    def convert_img(self, el, text, parent_tags):
               """Add two newlines after an image"""
      -        return super().convert_img(el, text, convert_as_inline) + '\n\n'
      +        return super().convert_img(el, text, parent_tags) + '\n\n'
       
      -    def convert_custom_tag(self, el, text, convert_as_inline):
      +    def convert_custom_tag(self, el, text, parent_tags):
               """Ensure conversion function is found for tags with special characters in name"""
               return "FUNCTION USED: %s" % text
       
      
      From 3311f4d8963edf5fece6c06010e21705f9810a3a Mon Sep 17 00:00:00 2001
      From: Joseph Myers 
      Date: Wed, 19 Feb 2025 12:40:53 +0000
      Subject: [PATCH 17/22] Avoid stripping nonbreaking spaces (#188)
      
      ---
       markdownify/__init__.py   | 10 +++++-----
       tests/test_conversions.py |  2 ++
       2 files changed, 7 insertions(+), 5 deletions(-)
      
      diff --git a/markdownify/__init__.py b/markdownify/__init__.py
      index 79ba8e7..32a3cf6 100644
      --- a/markdownify/__init__.py
      +++ b/markdownify/__init__.py
      @@ -313,7 +313,7 @@ class MarkdownConverter(object):
               if (should_remove_whitespace_outside(el.previous_sibling)
                       or (should_remove_whitespace_inside(el.parent)
                           and not el.previous_sibling)):
      -            text = text.lstrip()
      +            text = text.lstrip(' \t\r\n')
               if (should_remove_whitespace_outside(el.next_sibling)
                       or (should_remove_whitespace_inside(el.parent)
                           and not el.next_sibling)):
      @@ -399,7 +399,7 @@ class MarkdownConverter(object):
       
           def convert_blockquote(self, el, text, parent_tags):
               # handle some early-exit scenarios
      -        text = (text or '').strip()
      +        text = (text or '').strip(' \t\r\n')
               if '_inline' in parent_tags:
                   return ' ' + text + ' '
               if not text:
      @@ -567,8 +567,8 @@ class MarkdownConverter(object):
       
           def convert_p(self, el, text, parent_tags):
               if '_inline' in parent_tags:
      -            return ' ' + text.strip() + ' '
      -        text = text.strip()
      +            return ' ' + text.strip(' \t\r\n') + ' '
      +        text = text.strip(' \t\r\n')
               if self.options['wrap']:
                   # Preserve newlines (and preceding whitespace) resulting
                   # from <br> tags. Newlines in the input have already been
      @@ -577,7 +577,7 @@ class MarkdownConverter(object): lines = text.split('\n') new_lines = [] for line in lines: - line = line.lstrip() + line = line.lstrip(' \t\r\n') line_no_trailing = line.rstrip() trailing = line[len(line_no_trailing):] line = fill(line, diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 1739cb9..e851ac2 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -59,6 +59,7 @@ def test_b_spaces(): def test_blockquote(): assert md('
      Hello
      ') == '\n> Hello\n\n' assert md('
      \nHello\n
      ') == '\n> Hello\n\n' + assert md('
       Hello
      ') == '\n> \u00a0Hello\n\n' def test_blockquote_with_nested_paragraph(): @@ -266,6 +267,7 @@ def test_p(): assert md('

      1234 5678 9012
      67890

      ', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n' assert md('

      1234 5678 9012
      67890

      ', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n' assert md('First

      Second

      Third

      Fourth') == 'First\n\nSecond\n\nThird\n\nFourth' + assert md('

       x y

      ', wrap=True, wrap_width=80) == '\n\n\u00a0x y\n\n' def test_pre(): From c7329ac1ef002d7ff35fa6395eefcf4c77e5e81f Mon Sep 17 00:00:00 2001 From: Joseph Myers Date: Wed, 19 Feb 2025 15:04:29 +0000 Subject: [PATCH 18/22] Escape right square brackets (#187) --- markdownify/__init__.py | 6 +++--- tests/test_escaping.py | 4 +++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 32a3cf6..3ff0380 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -305,7 +305,7 @@ class MarkdownConverter(object): # escape special characters if we're not inside a preformatted or code element if '_noformat' not in parent_tags: - text = self.escape(text) + text = self.escape(text, parent_tags) # remove leading whitespace at the start or just after a # block-level element; remove traliing whitespace at the end @@ -347,11 +347,11 @@ class MarkdownConverter(object): else: return True - def escape(self, text): + def escape(self, text, parent_tags): if not text: return '' if self.options['escape_misc']: - text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text) + text = re.sub(r'([]\\&<`[>~=+|])', r'\\\1', text) # A sequence of one or more consecutive '-', preceded and # followed by whitespace or start/end of fragment, might # be confused with an underline of a header, or with a diff --git a/tests/test_escaping.py b/tests/test_escaping.py index d213675..bab4d11 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -51,7 +51,9 @@ def test_misc(): assert md('-y', escape_misc=True) == '-y' assert md('+ x\n+ y\n', escape_misc=True) == '\\+ x\n\\+ y\n' assert md('`x`', escape_misc=True) == r'\`x\`' - assert md('[text](link)', escape_misc=True) == r'\[text](link)' + assert md('[text](notalink)', escape_misc=True) == r'\[text\](notalink)' + assert md('text]', escape_misc=True) == r'[text\]](link)' + assert md('[text]', escape_misc=True) == r'[\[text\]](link)' assert md('1. x', escape_misc=True) == r'1\. 
x' # assert md('1. x', escape_misc=True) == r'1\. x' assert md('1. x', escape_misc=True) == r'1\. x' From 24977fd1929e4b5d8287a04367837beb4e069f5a Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Wed, 19 Feb 2025 20:01:12 -0500 Subject: [PATCH 19/22] rename regex pattern variables (#195) Signed-off-by: chrispy --- markdownify/__init__.py | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 3ff0380..a1c6d9a 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -4,16 +4,16 @@ import re import six -convert_heading_re = re.compile(r'convert_h(\d+)') -line_with_content_re = re.compile(r'^(.*)', flags=re.MULTILINE) -whitespace_re = re.compile(r'[\t ]+') -all_whitespace_re = re.compile(r'[\t \r\n]+') -newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') -html_heading_re = re.compile(r'h[1-6]') +re_convert_heading = re.compile(r'convert_h(\d+)') +re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE) +re_whitespace = re.compile(r'[\t ]+') +re_all_whitespace = re.compile(r'[\t \r\n]+') +re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') +re_html_heading = re.compile(r'h[1-6]') # extract (leading_nl, content, trailing_nl) from a string # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) -extract_newlines_re = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) +re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) # Heading styles @@ -80,7 +80,7 @@ def should_remove_whitespace_inside(el): """Return to remove whitespace immediately inside a block-level element.""" if not el or not el.name: return False - if html_heading_re.match(el.name) is not None: + if re_html_heading.match(el.name) is not None: return True return el.name in ('p', 'blockquote', 'article', 'div', 'section', @@ -221,7 +221,7 @@ class MarkdownConverter(object): # if 
this tag is a heading or table cell, add an '_inline' parent pseudo-tag if ( - html_heading_re.match(node.name) is not None # headings + re_html_heading.match(node.name) is not None # headings or node.name in {'td', 'th'} # table cells ): parent_tags_for_children.add('_inline') @@ -248,7 +248,7 @@ class MarkdownConverter(object): updated_child_strings = [''] # so the first lookback works for child_string in child_strings: # Separate the leading/trailing newlines from the content. - leading_nl, content, trailing_nl = extract_newlines_re.match(child_string).groups() + leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups() # If the last child had trailing newlines and this child has leading newlines, # use the larger newline count, limited to 2. @@ -298,10 +298,10 @@ class MarkdownConverter(object): # normalize whitespace if we're not inside a preformatted element if 'pre' not in parent_tags: if self.options['wrap']: - text = all_whitespace_re.sub(' ', text) + text = re_all_whitespace.sub(' ', text) else: - text = newline_whitespace_re.sub('\n', text) - text = whitespace_re.sub(' ', text) + text = re_newline_whitespace.sub('\n', text) + text = re_whitespace.sub(' ', text) # escape special characters if we're not inside a preformatted or code element if '_noformat' not in parent_tags: @@ -323,7 +323,7 @@ class MarkdownConverter(object): def __getattr__(self, attr): # Handle headings - m = convert_heading_re.match(attr) + m = re_convert_heading.match(attr) if m: n = int(m.group(1)) @@ -409,7 +409,7 @@ class MarkdownConverter(object): def _indent_for_blockquote(match): line_content = match.group(1) return '> ' + line_content if line_content else '>' - text = line_with_content_re.sub(_indent_for_blockquote, text) + text = re_line_with_content.sub(_indent_for_blockquote, text) return '\n' + text + '\n\n' @@ -455,7 +455,7 @@ class MarkdownConverter(object): def _indent_for_dd(match): line_content = match.group(1) return ' ' + line_content if 
line_content else '' - text = line_with_content_re.sub(_indent_for_dd, text) + text = re_line_with_content.sub(_indent_for_dd, text) # insert definition marker into first-line indent whitespace text = ':' + text[1:] @@ -465,7 +465,7 @@ class MarkdownConverter(object): def convert_dt(self, el, text, parent_tags): # remove newlines from term text text = (text or '').strip() - text = all_whitespace_re.sub(' ', text) + text = re_all_whitespace.sub(' ', text) if '_inline' in parent_tags: return ' ' + text + ' ' if not text: @@ -489,7 +489,7 @@ class MarkdownConverter(object): if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) - text = all_whitespace_re.sub(' ', text) + text = re_all_whitespace.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: return '\n\n%s %s %s\n\n' % (hashes, text, hashes) @@ -558,7 +558,7 @@ class MarkdownConverter(object): def _indent_for_li(match): line_content = match.group(1) return bullet_indent + line_content if line_content else '' - text = line_with_content_re.sub(_indent_for_li, text) + text = re_line_with_content.sub(_indent_for_li, text) # insert bullet into first-line indent whitespace text = bullet + text[bullet_width:] From 6984dca7ab65c3d206ac2f31e8f4352558f8e807 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Mon, 24 Feb 2025 11:48:40 -0500 Subject: [PATCH 20/22] use a conversion function cache to improve runtime (#196) Signed-off-by: chrispy --- markdownify/__init__.py | 48 ++++++++++++++++++++++++++--------------- 1 file changed, 31 insertions(+), 17 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index a1c6d9a..31001de 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -4,12 +4,11 @@ import re import six -re_convert_heading = re.compile(r'convert_h(\d+)') re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE) re_whitespace = re.compile(r'[\t ]+') re_all_whitespace = re.compile(r'[\t \r\n]+') re_newline_whitespace 
= re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') -re_html_heading = re.compile(r'h[1-6]') +re_html_heading = re.compile(r'h(\d+)') # extract (leading_nl, content, trailing_nl) from a string # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) @@ -165,6 +164,9 @@ class MarkdownConverter(object): raise ValueError('You may specify either tags to strip or tags to' ' convert, but not both.') + # Initialize the conversion function cache + self.convert_fn_cache = {} + def convert(self, html): soup = BeautifulSoup(html, 'html.parser') return self.convert_soup(soup) @@ -266,9 +268,8 @@ class MarkdownConverter(object): text = ''.join(child_strings) # apply this tag's final conversion function - convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name) - convert_fn = getattr(self, convert_fn_name, None) - if convert_fn and self.should_convert_tag(node.name): + convert_fn = self.get_conv_fn_cached(node.name) + if convert_fn is not None: text = convert_fn(node, text, parent_tags=parent_tags) return text @@ -321,23 +322,36 @@ class MarkdownConverter(object): return text - def __getattr__(self, attr): - # Handle headings - m = re_convert_heading.match(attr) - if m: - n = int(m.group(1)) + def get_conv_fn_cached(self, tag_name): + """Given a tag name, return the conversion function using the cache.""" + # If conversion function is not in cache, add it + if tag_name not in self.convert_fn_cache: + self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name) - def convert_tag(el, text, parent_tags): - return self._convert_hn(n, el, text, parent_tags) + # Return the cached entry + return self.convert_fn_cache[tag_name] - convert_tag.__name__ = 'convert_h%s' % n - setattr(self, convert_tag.__name__, convert_tag) - return convert_tag + def get_conv_fn(self, tag_name): + """Given a tag name, find and return the conversion function.""" + tag_name = tag_name.lower() - raise AttributeError(attr) + # Handle strip/convert exclusion options + if 
not self.should_convert_tag(tag_name): + return None + + # Handle headings with _convert_hn() function + match = re_html_heading.match(tag_name) + if match: + n = int(match.group(1)) + return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags) + + # For other tags, look up their conversion function by tag name + convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", tag_name) + convert_fn = getattr(self, convert_fn_name, None) + return convert_fn def should_convert_tag(self, tag): - tag = tag.lower() + """Given a tag name, return whether to convert based on strip/convert options.""" strip = self.options['strip'] convert = self.options['convert'] if strip is not None: From ba5e222b45cd0097b07f3c4c8ec3bfca6c59d2d3 Mon Sep 17 00:00:00 2001 From: Chris Papademetrious Date: Mon, 24 Feb 2025 12:29:09 -0500 Subject: [PATCH 21/22] use compiled regex for escaping patterns (#194) Signed-off-by: chrispy --- markdownify/__init__.py | 45 ++++++++++++++++++++++++++--------------- 1 file changed, 29 insertions(+), 16 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 31001de..5d21506 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -4,16 +4,38 @@ import re import six +# General-purpose regex patterns +re_convert_heading = re.compile(r'convert_h(\d+)') re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE) re_whitespace = re.compile(r'[\t ]+') re_all_whitespace = re.compile(r'[\t \r\n]+') re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') re_html_heading = re.compile(r'h(\d+)') -# extract (leading_nl, content, trailing_nl) from a string +# Pattern for creating convert_ function names from tag names +re_make_convert_fn_name = re.compile(r'[\[\]:-]') + +# Extract (leading_nl, content, trailing_nl) from a string # (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here) re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL) 
+# Escape miscellaneous special Markdown characters +re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])') + +# Escape sequence of one or more consecutive '-', preceded +# and followed by whitespace or start/end of fragment, as it +# might be confused with an underline of a header, or with a +# list marker +re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))') + +# Escape sequence of up to six consecutive '#', preceded +# and followed by whitespace or start/end of fragment, as +# it might be confused with an ATX heading +re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))') + +# Escape '.' or ')' preceded by up to nine digits, as it might be +# confused with a list item +re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))') # Heading styles ATX = 'atx' @@ -346,7 +368,7 @@ class MarkdownConverter(object): return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags) # For other tags, look up their conversion function by tag name - convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", tag_name) + convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', tag_name) convert_fn = getattr(self, convert_fn_name, None) return convert_fn @@ -365,20 +387,11 @@ class MarkdownConverter(object): if not text: return '' if self.options['escape_misc']: - text = re.sub(r'([]\\&<`[>~=+|])', r'\\\1', text) - # A sequence of one or more consecutive '-', preceded and - # followed by whitespace or start/end of fragment, might - # be confused with an underline of a header, or with a - # list marker. - text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text) - # A sequence of up to six consecutive '#', preceded and - # followed by whitespace or start/end of fragment, might - # be confused with an ATX heading. - text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text) - # '.' or ')' preceded by up to nine digits might be - # confused with a list item. 
- text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2', - text) + text = re_escape_misc_chars.sub(r'\\\1', text) + text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text) + text = re_escape_misc_hashes.sub(r'\1\\\2', text) + text = re_escape_misc_list_items.sub(r'\1\\\2', text) + if self.options['escape_asterisks']: text = text.replace('*', r'\*') if self.options['escape_underscores']: From daa9e28287220f6f1a2d9ef423679f14c6175bd5 Mon Sep 17 00:00:00 2001 From: chrispy Date: Mon, 24 Feb 2025 16:18:23 -0500 Subject: [PATCH 22/22] bump to version v1.0.0 Signed-off-by: chrispy --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index e5ce8cb..a6ae3e5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "markdownify" -version = "0.14.1" +version = "1.0.0" authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}] description = "Convert HTML to markdown." readme = "README.rst"
    + Caption +
    Firstname Lastname Age