From ae0597d80cb57983e876d13fdd44a7728abcbe26 Mon Sep 17 00:00:00 2001
From: Chris Papademetrious
Date: Mon, 27 Jan 2025 11:55:32 -0500
Subject: [PATCH] remove superfluous leading/trailing whitespace (#181)
---
README.rst | 7 +++++++
markdownify/__init__.py | 26 ++++++++++++++++++++++----
tests/test_advanced.py | 2 +-
tests/test_args.py | 11 ++++++++++-
tests/test_basic.py | 2 +-
tests/test_conversions.py | 14 ++++++++------
tests/test_custom_converter.py | 4 ++--
tests/test_escaping.py | 2 +-
tests/test_lists.py | 2 +-
tests/test_tables.py | 7 +++++--
tests/utils.py | 9 +++++++++
11 files changed, 67 insertions(+), 19 deletions(-)
create mode 100644 tests/utils.py
diff --git a/README.rst b/README.rst
index 34ed7e0..b37a503 100644
--- a/README.rst
+++ b/README.rst
@@ -150,6 +150,13 @@ wrap, wrap_width
Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
A `wrap_width` value of `None` reflows lines to unlimited line length.
+strip_document
+ Controls whether leading and/or trailing separation newlines are removed from
+ the final converted document. Supported values are ``LSTRIP`` (leading),
+ ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
+ within the document are unaffected.
+ Defaults to ``STRIP``.
+
Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index ef4e7ca..7d14fe7 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -26,6 +26,11 @@ BACKSLASH = 'backslash'
ASTERISK = '*'
UNDERSCORE = '_'
+# Document strip styles
+LSTRIP = 'lstrip'
+RSTRIP = 'rstrip'
+STRIP = 'strip'
+
def chomp(text):
"""
@@ -99,6 +104,7 @@ class MarkdownConverter(object):
keep_inline_images_in = []
newline_style = SPACES
strip = None
+ strip_document = STRIP
strong_em_symbol = ASTERISK
sub_symbol = ''
sup_symbol = ''
@@ -180,7 +186,18 @@ class MarkdownConverter(object):
return text
def convert__document_(self, el, text, convert_as_inline):
- # for BeautifulSoup objects (where node.name == "[document]"), return content results as-is
+ """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
+ if self.options['strip_document'] == LSTRIP:
+ text = text.lstrip('\n') # remove leading separation newlines
+ elif self.options['strip_document'] == RSTRIP:
+ text = text.rstrip('\n') # remove trailing separation newlines
+ elif self.options['strip_document'] == STRIP:
+ text = text.strip('\n') # remove leading and trailing separation newlines
+ elif self.options['strip_document'] is None:
+ pass # leave leading and trailing separation newlines as-is
+ else:
+ raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+
return text
def process_text(self, el):
@@ -454,6 +471,7 @@ class MarkdownConverter(object):
def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
return ' ' + text.strip() + ' '
+ text = text.strip()
if self.options['wrap']:
# Preserve newlines (and preceding whitespace) resulting
# from <br> tags. Newlines in the input have already been
@@ -500,13 +518,13 @@ class MarkdownConverter(object):
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
def convert_table(self, el, text, convert_as_inline):
- return '\n\n' + text + '\n'
+ return '\n\n' + text.strip() + '\n\n'
def convert_caption(self, el, text, convert_as_inline):
- return text + '\n\n'
+ return text.strip() + '\n\n'
def convert_figcaption(self, el, text, convert_as_inline):
- return '\n\n' + text + '\n\n'
+ return '\n\n' + text.strip() + '\n\n'
def convert_td(self, el, text, convert_as_inline):
colspan = 1
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index a3a5fda..6123d8c 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
def test_chomp():
diff --git a/tests/test_args.py b/tests/test_args.py
index ebce4a8..301c19f 100644
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -2,7 +2,8 @@
Test whitelisting/blacklisting of specific tags.
"""
-from markdownify import markdownify as md
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
+from .utils import md
def test_strip():
@@ -23,3 +24,11 @@ def test_convert():
def test_do_not_convert():
text = md('Some Text', convert=[])
assert text == 'Some Text'
+
+
+def test_strip_document():
+ assert markdownify("<p>Hello</p>") == "Hello" # test default of STRIP
+ assert markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
+ assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
+ assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
+ assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 66f8b6c..584adb9 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
def test_single_tag():
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 05c6cd4..1367006 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -1,4 +1,5 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+from .utils import md
def inline_tests(tag, markup):
@@ -79,11 +80,6 @@ def test_br():
assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
-def test_caption():
- assert md('TEXTCaptionSPAN') == 'TEXT\n\nCaption\n\nSPAN'
- assert md('SPANCaptionTEXT') == 'SPAN\n\nCaption\n\nTEXT'
-
-
def test_code():
inline_tests('code', '`')
assert md('*this_should_not_escape*') == '`*this_should_not_escape*`'
@@ -126,6 +122,11 @@ def test_em():
inline_tests('em', '*')
+def test_figcaption():
+ assert (md("TEXT\nCaption\nSPAN") == "TEXT\n\nCaption\n\nSPAN")
+ assert (md("SPAN\nCaption\nTEXT") == "SPAN\n\nCaption\n\nTEXT")
+
+
def test_header_with_space():
assert md('\n\nHello
') == '\n\n### Hello\n\n'
assert md('Hello\n\n\nWorld
') == '\n\n### Hello World\n\n'
@@ -236,6 +237,7 @@ def test_kbd():
def test_p():
assert md('hello
') == '\n\nhello\n\n'
+ assert md("hello
") == "\n\nhello\n\n"
assert md('123456789 123456789
') == '\n\n123456789 123456789\n\n'
assert md('123456789\n\n\n123456789
') == '\n\n123456789\n123456789\n\n'
assert md('123456789\n\n\n123456789
', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
diff --git a/tests/test_custom_converter.py b/tests/test_custom_converter.py
index adc83f7..0d3f6af 100644
--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -20,8 +20,8 @@ def test_custom_conversion_functions():
def md(html, **options):
return UnitTestConverter(**options).convert(html)
- assert md('
') == '\n\n'
- assert md('
') == '\n\n'
+ assert md('
text') == '\n\ntext'
+ assert md('
text') == '\n\ntext'
assert md("text") == "FUNCTION USED: text"
diff --git a/tests/test_escaping.py b/tests/test_escaping.py
index 878760a..d213675 100644
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,6 +1,6 @@
import warnings
from bs4 import MarkupResemblesLocatorWarning
-from markdownify import markdownify as md
+from .utils import md
def test_asterisks():
diff --git a/tests/test_lists.py b/tests/test_lists.py
index ce54a87..6b320ca 100644
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
nested_uls = """
diff --git a/tests/test_tables.py b/tests/test_tables.py
index da4bf53..e41b389 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md
table = """
@@ -228,7 +228,10 @@ table_body = """"""
-table_with_caption = """TEXTCaption
+table_with_caption = """TEXT
+
+ Caption
+
| Firstname |
Lastname |
Age |
diff --git a/tests/utils.py b/tests/utils.py
new file mode 100644
index 0000000..0dac580
--- /dev/null
+++ b/tests/utils.py
@@ -0,0 +1,9 @@
+from markdownify import MarkdownConverter
+
+
+# for unit testing, disable document-level stripping by default so that
+# separation newlines are included in testing
+def md(html, **options):
+ options = {"strip_document": None, **options}
+
+ return MarkdownConverter(**options).convert(html)