allow BeautifulSoup configuration kwargs to be specified

Signed-off-by: chrispy <chrispy@synopsys.com>
2025-06-14 07:24:19 -04:00
3 changed files with 21 additions and 7 deletions
--- a/README.rst
+++ b/README.rst
@@ -157,12 +157,16 @@ strip_document
  within the document are unaffected.
  Defaults to ``STRIP``.
-beautiful_soup_parser
+bs4_options
-  Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
+  Specify additional configuration options for the ``BeautifulSoup`` object
-  as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
+  used to interpret the HTML markup. String and list values (such as ``lxml``)
-  environment. Defaults to ``html.parser``.
+  are treated as ``features`` parameter arguments to control parser
  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
  are treated as full kwargs to be used for the BeautifulSoup constructor,
  allowing specification of any parameter. For parameter details, see the
  `BeautifulSoup`_ documentation.
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
-        beautiful_soup_parser = 'html.parser'
+        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
@@ -188,11 +188,15 @@ class MarkdownConverter(object):
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')
        # If a string or list is passed to bs4_options, assume it is a 'features' specification
        if not isinstance(self.options['bs4_options'], dict):
            self.options['bs4_options'] = {'features': self.options['bs4_options']}
        # Initialize the conversion function cache
        self.convert_fn_cache = {}
    def convert(self, html):
-        soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
+        soup = BeautifulSoup(html, **self.options['bs4_options'])
        return self.convert_soup(soup)
    def convert_soup(self, soup):
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -32,3 +32,9 @@ def test_strip_document():
    assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
    assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
 def test_bs4_options():
    """Check that ``bs4_options`` is accepted as a string, a list, or a dict.

    Each form selects ``html.parser`` and must convert the paragraph to
    ``"Hello"``. The function carries the ``test_`` prefix so that pytest's
    default discovery collects it; without the prefix these assertions are
    never executed.
    """
    # A bare string is passed as the parser ``features`` value.
    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
    # A list is likewise passed as the parser ``features`` value.
    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
    # A dict is passed as keyword arguments for the BeautifulSoup constructor.
    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
 # Keep the previous (uncollected) name available for backward compatibility.
 bs4_options = test_bs4_options