Escape all characters with Markdown significance (#118)

* Escape all characters with Markdown significance There are many punctuation characters that sometimes have significance in Markdown; more systematically escape them all (based on a new escape_misc configuration option). A limited attempt is made to limit the escaping of '.' and ')' to the context where they might have Markdown significance (after a number, where they can indicate an ordered list item); no such attempt is made for the other characters (and even that limiting of '.' and ')' may not be entirely safe in all cases, as it's possible the HTML could have the number outside the block being escaped in one go, e.g. `<span>1</span>.`. --------- Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
2024-04-04 19:42:58 +00:00
parent 2bd0772685
commit 46af45bb3c
3 changed files with 30 additions and 2 deletions
--- a/README.rst
+++ b/README.rst
@@ -123,6 +123,11 @@ escape_underscores
  If set to ``False``, do not escape ``_`` to ``\_`` in text.
  Defaults to ``True``.

+escape_misc
+  If set to ``False``, do not escape miscellaneous punctuation characters
+  that sometimes have Markdown significance in text.
+  Defaults to ``True``.
+
 keep_inline_images_in
  Images are converted to their alt-text when the images are located inside
  headlines or table cells. If some inline images should be converted to
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -71,6 +71,7 @@ class MarkdownConverter(object):
        default_title = False
        escape_asterisks = True
        escape_underscores = True
+        escape_misc = True
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
@@ -201,6 +202,9 @@ class MarkdownConverter(object):
    def escape(self, text):
        if not text:
            return ''
+        if self.options['escape_misc']:
+            text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
+            text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -12,7 +12,7 @@ def test_underscore():


 def test_xml_entities():
-    assert md('&amp;') == '&'
+    assert md('&amp;') == r'\&'


 def test_named_entities():
@@ -25,4 +25,23 @@ def test_hexadecimal_entities():


 def test_single_escaping_entities():
-    assert md('&amp;amp;') == '&amp;'
+    assert md('&amp;amp;') == r'\&amp;'
+
+
+def text_misc():
+    assert md('\\*') == r'\\\*'
+    assert md('<foo>') == r'\<foo\>'
+    assert md('# foo') == r'\# foo'
+    assert md('> foo') == r'\> foo'
+    assert md('~~foo~~') == r'\~\~foo\~\~'
+    assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
+    assert md('---\n') == '\\-\\-\\-\n'
+    assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
+    assert md('`x`') == r'\`x\`'
+    assert md('[text](link)') == r'\[text](link)'
+    assert md('1. x') == r'1\. x'
+    assert md('not a number. x') == r'not a number. x'
+    assert md('1) x') == r'1\) x'
+    assert md('not a number) x') == r'not a number) x'
+    assert md('|not table|') == r'\|not table\|'
+    assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'