From a369e072116ec48e956c75fccb430bebc35a033f Mon Sep 17 00:00:00 2001
From: Joseph Myers <josmyers@redhat.com>
Date: Wed, 2 Oct 2024 21:59:39 +0000
Subject: [PATCH] More selective escaping of `-#.)` (alternative approach)

This is a partial alternative to #122 (open since April) for more
selective escaping of some special characters.

Here, we rename `text_misc` to `test_misc` (as noted in that PR) so the
tests are actually run (and fix some incorrect test assertions so they
pass).  We also make escaping of `-#.)` (the most common cases of
unnecessary escaping in my use case) more selective, while still being
conservatively safe in escaping all cases of those characters that
might have Markdown significance (including in the presence of
wrapping, unlike in #122).  (Being conservatively safe doesn't cover
the cases where `.` or `)` starts a fragment, with the preceding
digits in a different fragment, e.g. `1<span>.</span> x`; the existing
code already was not conservatively safe there.)

There are certainly more cases where the code could also be made more
selective while remaining conservatively safe (including in the
presence of wrapping), so this is not a complete replacement for #122,
but by fixing some of the most common cases in a safe way, and getting
the tests actually running, I hope this allows progress to be made
where the previous attempt appears to have stalled, while still
allowing further incremental progress with appropriately safe logic
for other characters where useful.
---
 markdownify/__init__.py | 16 ++++++++++++++--
 tests/test_escaping.py  | 31 +++++++++++++++++++++++++++----
 2 files changed, 41 insertions(+), 6 deletions(-)
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index cd66a39..ceeb793 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -208,8 +208,20 @@ class MarkdownConverter(object):
         if not text:
             return ''
         if self.options['escape_misc']:
-            text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
-            text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
+            text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
+            # A sequence of one or more consecutive '-', preceded and
+            # followed by whitespace or start/end of fragment, might
+            # be confused with an underline of a header, or with a
+            # list marker.
+            text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
+            # A sequence of up to six consecutive '#', preceded and
+            # followed by whitespace or start/end of fragment, might
+            # be confused with an ATX heading.
+            text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
+            # '.' or ')' preceded by up to nine digits might be
+            # confused with a list item.
+            text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
+                          text)
         if self.options['escape_asterisks']:
             text = text.replace('*', r'\*')
         if self.options['escape_underscores']:
diff --git a/tests/test_escaping.py b/tests/test_escaping.py
index eaef77d..6eb8e07 100644
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -28,20 +28,43 @@ def test_single_escaping_entities():
     assert md('&amp;amp;') == r'\&amp;'
 
 
-def text_misc():
+def test_misc():
     assert md('\\*') == r'\\\*'
-    assert md('<foo>') == r'\<foo\>'
+    assert md('&lt;foo>') == r'\<foo\>'
     assert md('# foo') == r'\# foo'
+    assert md('#5') == r'#5'
+    assert md('5#') == '5#'
+    assert md('####### foo') == r'####### foo'
     assert md('> foo') == r'\> foo'
     assert md('~~foo~~') == r'\~\~foo\~\~'
     assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
-    assert md('---\n') == '\\-\\-\\-\n'
+    assert md('---\n') == '\\---\n'
+    assert md('- test') == r'\- test'
+    assert md('x - y') == r'x \- y'
+    assert md('test-case') == 'test-case'
+    assert md('x-') == 'x-'
+    assert md('-y') == '-y'
     assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
     assert md('`x`') == r'\`x\`'
     assert md('[text](link)') == r'\[text](link)'
     assert md('1. x') == r'1\. x'
+    # assert md('1<span>.</span> x') == r'1\. x'
+    assert md('<span>1.</span> x') == r'1\. x'
+    assert md(' 1. x') == r' 1\. x'
+    assert md('123456789. x') == r'123456789\. x'
+    assert md('1234567890. x') == r'1234567890. x'
+    assert md('A1. x') == r'A1. x'
+    assert md('1.2') == r'1.2'
     assert md('not a number. x') == r'not a number. x'
     assert md('1) x') == r'1\) x'
+    # assert md('1<span>)</span> x') == r'1\) x'
+    assert md('<span>1)</span> x') == r'1\) x'
+    assert md(' 1) x') == r' 1\) x'
+    assert md('123456789) x') == r'123456789\) x'
+    assert md('1234567890) x') == r'1234567890) x'
+    assert md('(1) x') == r'(1) x'
+    assert md('A1) x') == r'A1) x'
+    assert md('1)x') == r'1)x'
     assert md('not a number) x') == r'not a number) x'
     assert md('|not table|') == r'\|not table\|'
-    assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
+    assert md(r'\ &lt;foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'