From ae0597d80cb57983e876d13fdd44a7728abcbe26 Mon Sep 17 00:00:00 2001
From: Chris Papademetrious <chrispy@synopsys.com>
Date: Mon, 27 Jan 2025 11:55:32 -0500
Subject: [PATCH] remove superfluous leading/trailing whitespace (#181)

---
 README.rst                     |  7 +++++++
 markdownify/__init__.py        | 26 ++++++++++++++++++++++----
 tests/test_advanced.py         |  2 +-
 tests/test_args.py             | 11 ++++++++++-
 tests/test_basic.py            |  2 +-
 tests/test_conversions.py      | 14 ++++++++------
 tests/test_custom_converter.py |  4 ++--
 tests/test_escaping.py         |  2 +-
 tests/test_lists.py            |  2 +-
 tests/test_tables.py           |  7 +++++--
 tests/utils.py                 |  9 +++++++++
 11 files changed, 67 insertions(+), 19 deletions(-)
 create mode 100644 tests/utils.py
diff --git a/README.rst b/README.rst
index 34ed7e0..b37a503 100644
--- a/README.rst
+++ b/README.rst
@@ -150,6 +150,13 @@ wrap, wrap_width
   Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
   A `wrap_width` value of `None` reflows lines to unlimited line length.
 
+strip_document
+  Controls whether leading and/or trailing separation newlines are removed from
+  the final converted document. Supported values are ``LSTRIP`` (leading),
+  ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
+  within the document are unaffected.
+  Defaults to ``STRIP``.
+
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
 
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index ef4e7ca..7d14fe7 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -26,6 +26,11 @@ BACKSLASH = 'backslash'
 ASTERISK = '*'
 UNDERSCORE = '_'
 
+# Document strip styles
+LSTRIP = 'lstrip'
+RSTRIP = 'rstrip'
+STRIP = 'strip'
+
 
 def chomp(text):
     """
@@ -99,6 +104,7 @@ class MarkdownConverter(object):
         keep_inline_images_in = []
         newline_style = SPACES
         strip = None
+        strip_document = STRIP
         strong_em_symbol = ASTERISK
         sub_symbol = ''
         sup_symbol = ''
@@ -180,7 +186,18 @@ class MarkdownConverter(object):
         return text
 
     def convert__document_(self, el, text, convert_as_inline):
-        # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is
+        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
+        if self.options['strip_document'] == LSTRIP:
+            text = text.lstrip('\n')  # remove leading separation newlines
+        elif self.options['strip_document'] == RSTRIP:
+            text = text.rstrip('\n')  # remove trailing separation newlines
+        elif self.options['strip_document'] == STRIP:
+            text = text.strip('\n')  # remove leading and trailing separation newlines
+        elif self.options['strip_document'] is None:
+            pass  # leave leading and trailing separation newlines as-is
+        else:
+            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+
         return text
 
     def process_text(self, el):
@@ -454,6 +471,7 @@ class MarkdownConverter(object):
     def convert_p(self, el, text, convert_as_inline):
         if convert_as_inline:
             return ' ' + text.strip() + ' '
+        text = text.strip()
         if self.options['wrap']:
             # Preserve newlines (and preceding whitespace) resulting
             # from <br> tags.  Newlines in the input have already been
@@ -500,13 +518,13 @@ class MarkdownConverter(object):
     convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
 
     def convert_table(self, el, text, convert_as_inline):
-        return '\n\n' + text + '\n'
+        return '\n\n' + text.strip() + '\n\n'
 
     def convert_caption(self, el, text, convert_as_inline):
-        return text + '\n\n'
+        return text.strip() + '\n\n'
 
     def convert_figcaption(self, el, text, convert_as_inline):
-        return '\n\n' + text + '\n\n'
+        return '\n\n' + text.strip() + '\n\n'
 
     def convert_td(self, el, text, convert_as_inline):
         colspan = 1
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index a3a5fda..6123d8c 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
 
 
 def test_chomp():
diff --git a/tests/test_args.py b/tests/test_args.py
index ebce4a8..301c19f 100644
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -2,7 +2,8 @@
 Test whitelisting/blacklisting of specific tags.
 
 """
-from markdownify import markdownify as md
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
+from .utils import md
 
 
 def test_strip():
@@ -23,3 +24,11 @@ def test_convert():
 def test_do_not_convert():
     text = md('<a href="https://github.com/matthewwithanm">Some Text</a>', convert=[])
     assert text == 'Some Text'
+
+
+def test_strip_document():
+    assert markdownify("<p>Hello</p>") == "Hello"  # test default of STRIP
+    assert markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
+    assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
+    assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
+    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 66f8b6c..584adb9 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
 
 
 def test_single_tag():
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 05c6cd4..1367006 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -1,4 +1,5 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+from .utils import md
 
 
 def inline_tests(tag, markup):
@@ -79,11 +80,6 @@ def test_br():
     assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
 
 
-def test_caption():
-    assert md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>') == 'TEXT\n\nCaption\n\nSPAN'
-    assert md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT') == 'SPAN\n\nCaption\n\nTEXT'
-
-
 def test_code():
     inline_tests('code', '`')
     assert md('<code>*this_should_not_escape*</code>') == '`*this_should_not_escape*`'
@@ -126,6 +122,11 @@ def test_em():
     inline_tests('em', '*')
 
 
+def test_figcaption():
+    assert (md("TEXT<figure><figcaption>\nCaption\n</figcaption><span>SPAN</span></figure>") == "TEXT\n\nCaption\n\nSPAN")
+    assert (md("<figure><span>SPAN</span><figcaption>\nCaption\n</figcaption></figure>TEXT") == "SPAN\n\nCaption\n\nTEXT")
+
+
 def test_header_with_space():
     assert md('<h3>\n\nHello</h3>') == '\n\n### Hello\n\n'
     assert md('<h3>Hello\n\n\nWorld</h3>') == '\n\n### Hello World\n\n'
@@ -236,6 +237,7 @@ def test_kbd():
 
 def test_p():
     assert md('<p>hello</p>') == '\n\nhello\n\n'
+    assert md("<p><p>hello</p></p>") == "\n\nhello\n\n"
     assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
     assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
     assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py
index adc83f7..0d3f6af 100644
--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -20,8 +20,8 @@ def test_custom_conversion_functions():
     def md(html, **options):
         return UnitTestConverter(**options).convert(html)
 
-    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />') == '![Alt text](/path/to/img.jpg "Optional title")\n\n'
-    assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)\n\n'
+    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
+    assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'
 
     assert md("<custom-tag>text</custom-tag>") == "FUNCTION USED: text"
 
diff --git a/tests/test_escaping.py b/tests/test_escaping.py
index 878760a..d213675 100644
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,6 +1,6 @@
 import warnings
 from bs4 import MarkupResemblesLocatorWarning
-from markdownify import markdownify as md
+from .utils import md
 
 
 def test_asterisks():
diff --git a/tests/test_lists.py b/tests/test_lists.py
index ce54a87..6b320ca 100644
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
 
 
 nested_uls = """
diff --git a/tests/test_tables.py b/tests/test_tables.py
index da4bf53..e41b389 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
 
 
 table = """<table>
@@ -228,7 +228,10 @@ table_body = """<table>
     </tbody>
 </table>"""
 
-table_with_caption = """TEXT<table><caption>Caption</caption>
+table_with_caption = """TEXT<table>
+    <caption>
+        Caption
+    </caption>
     <tbody><tr><td>Firstname</td>
             <td>Lastname</td>
             <td>Age</td>
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..0dac580
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,9 @@
+from markdownify import MarkdownConverter
+
+
+# for unit testing, disable document-level stripping by default so that
+# separation newlines are included in testing
+def md(html, **options):
+    options = {"strip_document": None, **options}
+
+    return MarkdownConverter(**options).convert(html)