implement a strip_pre configuration option (#218) (#222)

Signed-off-by: chrispy <chrispy@synopsys.com>
This commit is contained in:
Chris Papademetrious
2025-06-14 16:37:47 -04:00
committed by GitHub
parent 75ab3064dd
commit 9b1412aa5b
4 changed files with 45 additions and 3 deletions

View File

@@ -157,6 +157,12 @@ strip_document
within the document are unaffected. within the document are unaffected.
Defaults to ``STRIP``. Defaults to ``STRIP``.
strip_pre
Controls whether leading/trailing blank lines are removed from the content
of ``<pre>`` tags. Supported values are ``STRIP`` (remove all leading/trailing
blank lines), ``STRIP_ONE`` (remove one leading and one trailing blank line),
and ``None`` (remove nothing). Defaults to ``STRIP``.
bs4_options bs4_options
Specify additional configuration options for the ``BeautifulSoup`` object Specify additional configuration options for the ``BeautifulSoup`` object
used to interpret the HTML markup. String and list values (such as ``lxml`` used to interpret the HTML markup. String and list values (such as ``lxml``

View File

@@ -11,6 +11,10 @@ re_whitespace = re.compile(r'[\t ]+')
re_all_whitespace = re.compile(r'[\t \r\n]+') re_all_whitespace = re.compile(r'[\t \r\n]+')
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
re_html_heading = re.compile(r'h(\d+)') re_html_heading = re.compile(r'h(\d+)')
# Patterns for trimming blank lines at the edges of <pre> content.
# "Blank" here means a line holding nothing but spaces.

# Exactly one leading blank line: optional spaces, then a newline, at the
# very start of the string.
re_pre_lstrip1 = re.compile(r'^ *\n')
# Exactly one trailing blank line: a newline, then optional spaces, at the
# very end of the string. This uses \Z rather than $ because, without
# re.MULTILINE, $ also matches just before a final newline; with $ a single
# .sub() call could match twice (e.g. 'x\n\n' -> 'x') and remove two lines.
re_pre_rstrip1 = re.compile(r'\n *\Z')
# All leading blank lines: any run of spaces/newlines that ends in a newline,
# so indentation on the first non-blank line is kept.
re_pre_lstrip = re.compile(r'^[ \n]*\n')
# All trailing spaces and newlines. The greedy character class consumes
# everything up to the end of the string, so $ is unambiguous here.
re_pre_rstrip = re.compile(r'[ \n]*$')
# Pattern for creating convert_<tag> function names from tag names # Pattern for creating convert_<tag> function names from tag names
re_make_convert_fn_name = re.compile(r'[\[\]:-]') re_make_convert_fn_name = re.compile(r'[\[\]:-]')
@@ -51,10 +55,25 @@ BACKSLASH = 'backslash'
ASTERISK = '*' ASTERISK = '*'
UNDERSCORE = '_' UNDERSCORE = '_'
# Document strip styles # Document/pre strip styles
LSTRIP = 'lstrip' LSTRIP = 'lstrip'
RSTRIP = 'rstrip' RSTRIP = 'rstrip'
STRIP = 'strip' STRIP = 'strip'
STRIP_ONE = 'strip_one'
def strip1_pre(text):
    """Trim one blank line from the start and the end of <pre> content.

    The leading edge is trimmed with ``re_pre_lstrip1`` and the trailing
    edge with ``re_pre_rstrip1``; text that matches neither pattern is
    returned unchanged.
    """
    without_leading = re_pre_lstrip1.sub('', text)
    return re_pre_rstrip1.sub('', without_leading)
def strip_pre(text):
    """Trim every leading blank line and all trailing whitespace from <pre> content.

    ``re_pre_lstrip`` removes the leading run of spaces/newlines up to and
    including its last newline, so indentation on the first remaining line
    is preserved. ``re_pre_rstrip`` removes all trailing spaces and newlines.
    """
    without_leading = re_pre_lstrip.sub('', text)
    return re_pre_rstrip.sub('', without_leading)
def chomp(text): def chomp(text):
@@ -168,6 +187,7 @@ class MarkdownConverter(object):
newline_style = SPACES newline_style = SPACES
strip = None strip = None
strip_document = STRIP strip_document = STRIP
strip_pre = STRIP
strong_em_symbol = ASTERISK strong_em_symbol = ASTERISK
sub_symbol = '' sub_symbol = ''
sup_symbol = '' sup_symbol = ''
@@ -656,6 +676,15 @@ class MarkdownConverter(object):
if self.options['code_language_callback']: if self.options['code_language_callback']:
code_language = self.options['code_language_callback'](el) or code_language code_language = self.options['code_language_callback'](el) or code_language
if self.options['strip_pre'] == STRIP:
text = strip_pre(text) # remove all leading/trailing newlines
elif self.options['strip_pre'] == STRIP_ONE:
text = strip1_pre(text) # remove one leading/trailing newline
elif self.options['strip_pre'] is None:
pass # leave leading and trailing newlines as-is
else:
raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
return '\n\n```%s\n%s\n```\n\n' % (code_language, text) return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
def convert_q(self, el, text, parent_tags): def convert_q(self, el, text, parent_tags):

View File

@@ -2,7 +2,7 @@
Test whitelisting/blacklisting of specific tags. Test whitelisting/blacklisting of specific tags.
""" """
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
from .utils import md from .utils import md
@@ -34,6 +34,13 @@ def test_strip_document():
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n" assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
def test_strip_pre():
    """Check the default and each explicit strip_pre mode on the same <pre> input."""
    html = "<pre> \n \n Hello \n \n </pre>"

    # With no strip_pre argument the converter's default applies.
    assert markdownify(html) == "```\n Hello\n```"

    # Explicit modes, from most to least aggressive.
    expected_by_mode = [
        (STRIP, "```\n Hello\n```"),
        (STRIP_ONE, "```\n \n Hello \n \n```"),
        (None, "```\n \n \n Hello \n \n \n```"),
    ]
    for mode, expected in expected_by_mode:
        assert markdownify(html, strip_pre=mode) == expected
def bs4_options(): def bs4_options():
assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello" assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello" assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"

View File

@@ -370,4 +370,4 @@ def test_spaces():
assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter' assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n' assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n' assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo \n```\n\nbar' assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'