Merge branch 'develop'

bump to v0.13.0
fix pytest version to 8
2024-07-14 21:20:04 +02:00 · 2024-07-14 21:19:35 +02:00 · 2024-07-14 21:02:49 +02:00 · 2024-06-23 14:30:07 +02:00 · 2024-06-23 14:28:53 +02:00 · 2024-06-23 13:30:08 +02:00
9 changed files with 93 additions and 15 deletions
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -1 +1,2 @@
 include README.rst
+prune tests
--- a/README.rst
+++ b/README.rst
@@ -1,8 +1,8 @@
 |build| |version| |license| |downloads|

-.. |build| image:: https://img.shields.io/github/workflow/status/matthewwithanm/python-markdownify/Python%20application/develop
+.. |build| image:: https://img.shields.io/github/actions/workflow/status/matthewwithanm/python-markdownify/python-app.yml?branch=develop
    :alt: GitHub Workflow Status
-    :target: https://github.com/matthewwithanm/python-markdownify/actions?query=workflow%3A%22Python+application%22
+    :target: https://github.com/matthewwithanm/python-markdownify/actions/workflows/python-app.yml?query=workflow%3A%22Python+application%22

 .. |version| image:: https://img.shields.io/pypi/v/markdownify
    :alt: Pypi version
@@ -87,7 +87,11 @@ strong_em_symbol
 sub_symbol, sup_symbol
  Define the chars that surround ``<sub>`` and ``<sup>`` text. Defaults to an
  empty string, because this is non-standard behavior. Could be something like
-  ``~`` and ``^`` to result in ``~sub~`` and ``^sup^``.
+  ``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. If the value starts
+  with ``<`` and ends with ``>``, it is treated as an HTML tag: the string
+  placed after the text is the same tag with a ``/`` inserted after the
+  ``<``. For example, specifying ``<sub>`` emits raw HTML
+  (``<sub>text</sub>``) for subscripts.

 newline_style
  Defines the style of marking linebreaks (``<br>``) in markdown. The default
@@ -123,6 +127,11 @@ escape_underscores
  If set to ``False``, do not escape ``_`` to ``\_`` in text.
  Defaults to ``True``.

+escape_misc
+  If set to ``False``, do not escape miscellaneous punctuation characters
+  that sometimes have Markdown significance in text.
+  Defaults to ``True``.
+
 keep_inline_images_in
  Images are converted to their alt-text when the images are located inside
  headlines or table cells. If some inline images should be converted to
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -43,15 +43,22 @@ def abstract_inline_conversion(markup_fn):
    """
    This abstracts all simple inline tags like b, em, del, ...
    Returns a function that wraps the chomped text in a pair of the string
-    that is returned by markup_fn. markup_fn is necessary to allow for
+    that is returned by markup_fn, with '/' inserted in the string used after
+    the text if it looks like an HTML tag. markup_fn is necessary to allow for
    references to self.strong_em_symbol etc.
    """
    def implementation(self, el, text, convert_as_inline):
-        markup = markup_fn(self)
+        markup_prefix = markup_fn(self)
+        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
+            markup_suffix = '</' + markup_prefix[1:]
+        else:
+            markup_suffix = markup_prefix
+        if el.find_parent(['pre', 'code', 'kbd', 'samp']):
+            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
-        return '%s%s%s%s%s' % (prefix, markup, text, markup, suffix)
+        return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix)
    return implementation


@@ -69,6 +76,7 @@ class MarkdownConverter(object):
        default_title = False
        escape_asterisks = True
        escape_underscores = True
+        escape_misc = True
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
@@ -199,6 +207,9 @@ class MarkdownConverter(object):
    def escape(self, text):
        if not text:
            return ''
+        if self.options['escape_misc']:
+            text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
+            text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
@@ -315,7 +326,7 @@ class MarkdownConverter(object):
    def convert_li(self, el, text, convert_as_inline):
        parent = el.parent
        if parent is not None and parent.name == 'ol':
-            if parent.get("start"):
+            if parent.get("start") and str(parent.get("start")).isnumeric():
                start = int(parent.get("start"))
            else:
                start = 1
@@ -377,13 +388,13 @@ class MarkdownConverter(object):

    def convert_td(self, el, text, convert_as_inline):
        colspan = 1
-        if 'colspan' in el.attrs:
+        if 'colspan' in el.attrs and el['colspan'].isdigit():
            colspan = int(el['colspan'])
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_th(self, el, text, convert_as_inline):
        colspan = 1
-        if 'colspan' in el.attrs:
+        if 'colspan' in el.attrs and el['colspan'].isdigit():
            colspan = int(el['colspan'])
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

@@ -400,7 +411,7 @@ class MarkdownConverter(object):
            # first row and is headline: print headline underline
            full_colspan = 0
            for cell in cells:
-                if "colspan" in cell.attrs:
+                if 'colspan' in cell.attrs and cell['colspan'].isdigit():
                    full_colspan += int(cell["colspan"])
                else:
                    full_colspan += 1
--- a/setup.py
+++ b/setup.py
@@ -9,7 +9,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
 pkgmeta = {
    '__title__': 'markdownify',
    '__author__': 'Matthew Tretter',
-    '__version__': '0.12.1',
+    '__version__': '0.13.0',
 }

 read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -87,6 +87,16 @@ def test_code():
    assert md('<code><span>*this_should_not_escape*</span></code>') == '`*this_should_not_escape*`'
    assert md('<code>this  should\t\tnormalize</code>') == '`this should normalize`'
    assert md('<code><span>this  should\t\tnormalize</span></code>') == '`this should normalize`'
+    assert md('<code>foo<b>bar</b>baz</code>') == '`foobarbaz`'
+    assert md('<kbd>foo<i>bar</i>baz</kbd>') == '`foobarbaz`'
+    assert md('<samp>foo<del> bar </del>baz</samp>') == '`foo bar baz`'
+    assert md('<samp>foo <del>bar</del> baz</samp>') == '`foo bar baz`'
+    assert md('<code>foo<em> bar </em>baz</code>') == '`foo bar baz`'
+    assert md('<code>foo<code> bar </code>baz</code>') == '`foo bar baz`'
+    assert md('<code>foo<strong> bar </strong>baz</code>') == '`foo bar baz`'
+    assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
+    assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
+    assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'


 def test_del():
@@ -215,6 +225,17 @@ def test_pre():
    assert md('<pre><span>*this_should_not_escape*</span></pre>') == '\n```\n*this_should_not_escape*\n```\n'
    assert md('<pre>\t\tthis  should\t\tnot  normalize</pre>') == '\n```\n\t\tthis  should\t\tnot  normalize\n```\n'
    assert md('<pre><span>\t\tthis  should\t\tnot  normalize</span></pre>') == '\n```\n\t\tthis  should\t\tnot  normalize\n```\n'
+    assert md('<pre>foo<b>\nbar\n</b>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<i>\nbar\n</i>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo\n<i>bar</i>\nbaz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<i>\n</i>baz</pre>') == '\n```\nfoo\nbaz\n```\n'
+    assert md('<pre>foo<del>\nbar\n</del>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<em>\nbar\n</em>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<code>\nbar\n</code>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<strong>\nbar\n</strong>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<s>\nbar\n</s>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<sup>\nbar\n</sup>baz</pre>', sup_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>foo<sub>\nbar\n</sub>baz</pre>', sub_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'


 def test_script():
@@ -247,11 +268,13 @@ def test_strong_em_symbol():
 def test_sub():
    assert md('<sub>foo</sub>') == 'foo'
    assert md('<sub>foo</sub>', sub_symbol='~') == '~foo~'
+    assert md('<sub>foo</sub>', sub_symbol='<sub>') == '<sub>foo</sub>'


 def test_sup():
    assert md('<sup>foo</sup>') == 'foo'
    assert md('<sup>foo</sup>', sup_symbol='^') == '^foo^'
+    assert md('<sup>foo</sup>', sup_symbol='<sup>') == '<sup>foo</sup>'


 def test_lang():
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -12,7 +12,7 @@ def test_underscore():


 def test_xml_entities():
-    assert md('&amp;') == '&'
+    assert md('&amp;') == r'\&'


 def test_named_entities():
@@ -25,4 +25,23 @@ def test_hexadecimal_entities():


 def test_single_escaping_entities():
-    assert md('&amp;amp;') == '&amp;'
+    assert md('&amp;amp;') == r'\&amp;'
+
+
+def text_misc():
+    assert md('\\*') == r'\\\*'
+    assert md('<foo>') == r'\<foo\>'
+    assert md('# foo') == r'\# foo'
+    assert md('> foo') == r'\> foo'
+    assert md('~~foo~~') == r'\~\~foo\~\~'
+    assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
+    assert md('---\n') == '\\-\\-\\-\n'
+    assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
+    assert md('`x`') == r'\`x\`'
+    assert md('[text](link)') == r'\[text](link)'
+    assert md('1. x') == r'1\. x'
+    assert md('not a number. x') == r'not a number. x'
+    assert md('1) x') == r'1\) x'
+    assert md('not a number) x') == r'not a number) x'
+    assert md('|not table|') == r'\|not table\|'
+    assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -43,6 +43,9 @@ nested_ols = """
 def test_ol():
    assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
+    assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
+    assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
+    assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'


 def test_nested_ols():
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -215,7 +215,7 @@ table_with_colspan = """<table>
        <th>Age</th>
    </tr>
    <tr>
-        <td>Jill</td>
+        <td colspan="1">Jill</td>
        <td>Smith</td>
        <td>50</td>
    </tr>
@@ -226,6 +226,17 @@ table_with_colspan = """<table>
    </tr>
 </table>"""

+table_with_undefined_colspan = """<table>
+    <tr>
+        <th colspan="undefined">Name</th>
+        <th>Age</th>
+    </tr>
+    <tr>
+        <td colspan="-1">Jill</td>
+        <td>Smith</td>
+    </tr>
+</table>"""
+

 def test_table():
    assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
@@ -240,3 +251,4 @@ def test_table():
    assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
    assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
--- a/tox.ini
+++ b/tox.ini
@@ -4,7 +4,7 @@ envlist = py38
 [testenv]
 passenv = PYTHONPATH
 deps =
-	pytest
+	pytest==8
 	flake8
 	restructuredtext_lint
 	Pygments
Author	SHA1	Message	Date
AlexVonB	8c810eb8a8	Merge branch 'develop'	2024-07-14 21:20:04 +02:00
AlexVonB	f6c8daf8a5	bump to v0.13.0	2024-07-14 21:19:35 +02:00
AlexVonB	75a678dab9	fix pytest version to 8	2024-07-14 21:02:49 +02:00
AlexVonB	0a5c89aa49	added test for ol start check	2024-06-23 14:30:07 +02:00
microdnd	51390d7389	handle ol start value that is not a number (#127) Co-authored-by: Mico <mico_wu@trendmicro.com>	2024-06-23 14:28:53 +02:00
AlexVonB	50b4640db2	better naming for markup variables	2024-06-23 13:30:08 +02:00
Joseph Myers	7861b330cd	Special-case use of HTML tags for converting `<sub>` / `<sup>` (#119). Allow different strings before / after `<sub>` / `<sup>` content. In particular, this allows setting `sub_symbol='<sub>'`, `sup_symbol='<sup>'`, to use raw HTML in the output when converting subscripts and superscripts.	2024-06-23 13:28:05 +02:00
AlexVonB	2ec33384de	handle un-parsable colspan values fixes #126	2024-06-23 13:17:20 +02:00
samypr100	c1672aee44	Update MANIFEST.in to exclude tests during packaging (#125)	2024-06-23 12:59:14 +02:00
AlexVonB	43dbe20aaf	fixed github action badges see https://github.com/badges/shields/issues/8671	2024-04-04 21:50:02 +02:00
Joseph Myers	46af45bb3c	Escape all characters with Markdown significance (#118) * Escape all characters with Markdown significance There are many punctuation characters that sometimes have significance in Markdown; more systematically escape them all (based on a new escape_misc configuration option). A limited attempt is made to limit the escaping of '.' and ')' to the context where they might have Markdown significance (after a number, where they can indicate an ordered list item); no such attempt is made for the other characters (and even that limiting of '.' and ')' may not be entirely safe in all cases, as it's possible the HTML could have the number outside the block being escaped in one go, e.g. `<span>1</span>.`). --------- Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>	2024-04-04 21:42:58 +02:00
Joseph Myers	2bd0772685	Avoid inline styles inside `<code>` / `<pre>` conversion (#117) * Avoid inline styles inside `<code>` / `<pre>` conversion The check used for this is analogous to that used to avoid escaping potential markup characters inside such tags. Fixes #103 --------- Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>	2024-04-04 20:55:54 +02:00