Compare commits

...

1 Commits

Author SHA1 Message Date
chrispy
2d0a14a2a7 allow BeautifulSoup configuration kwargs to be specified
Signed-off-by: chrispy <chrispy@synopsys.com>
2025-06-14 07:24:19 -04:00
3 changed files with 21 additions and 7 deletions

View File

@@ -157,12 +157,16 @@ strip_document
within the document are unaffected. within the document are unaffected.
Defaults to ``STRIP``. Defaults to ``STRIP``.
beautiful_soup_parser bs4_options
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such Specify additional configuration options for the ``BeautifulSoup`` object
as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution used to interpret the HTML markup. String and list values (such as ``lxml``)
environment. Defaults to ``html.parser``. are treated as ``features`` parameter arguments to control parser
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
are treated as full kwargs to be used for the BeautifulSoup constructor,
allowing specification of any parameter. For parameter details, see the
`BeautifulSoup`_ documentation.
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/ .. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
Options may be specified as kwargs to the ``markdownify`` function, or as a Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses. nested ``Options`` class in ``MarkdownConverter`` subclasses.

View File

@@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
class MarkdownConverter(object): class MarkdownConverter(object):
class DefaultOptions: class DefaultOptions:
autolinks = True autolinks = True
beautiful_soup_parser = 'html.parser' bs4_options = 'html.parser'
bullets = '*+-' # An iterable of bullet types. bullets = '*+-' # An iterable of bullet types.
code_language = '' code_language = ''
code_language_callback = None code_language_callback = None
@@ -188,11 +188,15 @@ class MarkdownConverter(object):
raise ValueError('You may specify either tags to strip or tags to' raise ValueError('You may specify either tags to strip or tags to'
' convert, but not both.') ' convert, but not both.')
# If a string or list is passed to bs4_options, assume it is a 'features' specification
if not isinstance(self.options['bs4_options'], dict):
self.options['bs4_options'] = {'features': self.options['bs4_options']}
# Initialize the conversion function cache # Initialize the conversion function cache
self.convert_fn_cache = {} self.convert_fn_cache = {}
def convert(self, html): def convert(self, html):
soup = BeautifulSoup(html, self.options['beautiful_soup_parser']) soup = BeautifulSoup(html, **self.options['bs4_options'])
return self.convert_soup(soup) return self.convert_soup(soup)
def convert_soup(self, soup): def convert_soup(self, soup):

View File

@@ -32,3 +32,9 @@ def test_strip_document():
assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello" assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello" assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n" assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
def test_bs4_options():
    """Check that ``bs4_options`` accepts each supported value shape.

    A string, a list, and a dict with a ``features`` key are each passed as
    ``bs4_options`` and must all produce the same converted output.
    """
    # String value: passed as the ``bs4_options`` keyword directly.
    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
    # List value: same parser name wrapped in a list.
    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
    # Dict value: explicit ``features`` key.
    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"


# Backward-compatible alias for the original name. The function was renamed
# because pytest's default discovery only collects functions prefixed with
# ``test``, so these assertions were never run under the old name.
bs4_options = test_bs4_options