allow BeautifulSoup configuration kwargs to be specified

Signed-off-by: chrispy <chrispy@synopsys.com>
ensure that explicitly provided heading conversion functions are used (#212) (#214)
2025-06-14 07:24:19 -04:00 · 2025-05-03 10:57:09 -04:00 · 2025-04-28 06:37:33 -04:00 · 2025-04-20 06:20:01 -04:00 · 2025-03-29 11:29:29 +01:00
6 changed files with 70 additions and 15 deletions
--- a/README.rst
+++ b/README.rst
@@ -157,6 +157,17 @@ strip_document
  within the document are unaffected.
  Defaults to ``STRIP``.

+bs4_options
+  Specify additional configuration options for the ``BeautifulSoup`` object
+  used to interpret the HTML markup. String and list values (such as ``lxml``)
+  are treated as ``features`` parameter arguments to control parser
+  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
+  are treated as full kwargs to be used for the BeautifulSoup constructor,
+  allowing specification of any parameter. For parameter details, see the
+  `BeautifulSoup`_ documentation.
+
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
+
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.

--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -154,6 +154,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
+        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
@@ -187,11 +188,15 @@ class MarkdownConverter(object):
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

+        # If a string or list is passed to bs4_options, assume it is a 'features' specification
+        if not isinstance(self.options['bs4_options'], dict):
+            self.options['bs4_options'] = {'features': self.options['bs4_options']}
+
        # Initialize the conversion function cache
        self.convert_fn_cache = {}

    def convert(self, html):
-        soup = BeautifulSoup(html, 'html.parser')
+        soup = BeautifulSoup(html, **self.options['bs4_options'])
        return self.convert_soup(soup)

    def convert_soup(self, soup):
@@ -362,16 +367,20 @@ class MarkdownConverter(object):
        if not self.should_convert_tag(tag_name):
            return None

-        # Handle headings with _convert_hn() function
+        # Look for an explicitly defined conversion function by tag name first
+        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
+        convert_fn = getattr(self, convert_fn_name, None)
+        if convert_fn:
+            return convert_fn
+
+        # If tag is any heading, handle with convert_hN() function
        match = re_html_heading.match(tag_name)
        if match:
-            n = int(match.group(1))
-            return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags)
+            n = int(match.group(1))  # get value of N from <hN>
+            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)

-        # For other tags, look up their conversion function by tag name
-        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', tag_name)
-        convert_fn = getattr(self, convert_fn_name, None)
-        return convert_fn
+        # No conversion function was found
+        return None

    def should_convert_tag(self, tag):
        """Given a tag name, return whether to convert based on strip/convert options."""
@@ -509,12 +518,12 @@ class MarkdownConverter(object):

        return '\n\n%s\n' % text

-    def _convert_hn(self, n, el, text, parent_tags):
-        """ Method name prefixed with _ to prevent <hn> to call this """
+    def convert_hN(self, n, el, text, parent_tags):
+        # convert_hN() converts <hN> tags, where N is any integer
        if '_inline' in parent_tags:
            return text

-        # prevent MemoryErrors in case of very large n
+        # Markdown does not support heading depths of n > 6
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
@@ -649,6 +658,9 @@ class MarkdownConverter(object):

        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

+    def convert_q(self, el, text, parent_tags):
+        return '"' + text + '"'
+
    def convert_script(self, el, text, parent_tags):
        return ''

--- a/markdownify/main.py
+++ b/markdownify/main.py
@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
                        action='store_false',
                        help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
+    parser.add_argument('-i', '--keep-inline-images-in',
+                        default=[],
+                        nargs='*',
                        help="Images are converted to their alt-text when the images are "
                        "located inside headlines or table cells. If some inline images "
                        "should be converted to markdown images instead, this option can "
@@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
+    parser.add_argument('-p', '--beautiful-soup-parser',
+                        dest='beautiful_soup_parser',
+                        default='html.parser',
+                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
+                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
+                             "environment.")

    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -32,3 +32,9 @@ def test_strip_document():
    assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
    assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
+
+
+def bs4_options():
+    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -164,7 +164,8 @@ def test_hn():
    assert md('<h5>Hello</h5>') == '\n\n##### Hello\n\n'
    assert md('<h6>Hello</h6>') == '\n\n###### Hello\n\n'
    assert md('<h10>Hello</h10>') == md('<h6>Hello</h6>')
-    assert md('<hn>Hello</hn>') == md('Hello')
+    assert md('<h0>Hello</h0>') == md('<h1>Hello</h1>')
+    assert md('<hx>Hello</hx>') == md('Hello')


 def test_hn_chained():
@@ -304,6 +305,11 @@ def test_pre():
    assert md("<p>foo</p>\n<pre>bar</pre>\n</p>baz</p>", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz"


+def test_q():
+    assert md('foo <q>quote</q> bar') == 'foo "quote" bar'
+    assert md('foo <q cite="https://example.com">quote</q> bar') == 'foo "quote" bar'
+
+
 def test_script():
    assert md('foo <script>var foo=42;</script> bar') == 'foo  bar'

--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -12,7 +12,15 @@ class UnitTestConverter(MarkdownConverter):

    def convert_custom_tag(self, el, text, parent_tags):
        """Ensure conversion function is found for tags with special characters in name"""
-        return "FUNCTION USED: %s" % text
+        return "convert_custom_tag(): %s" % text
+
+    def convert_h1(self, el, text, parent_tags):
+        """Ensure explicit heading conversion function is used"""
+        return "convert_h1: %s" % (text)
+
+    def convert_hN(self, n, el, text, parent_tags):
+        """Ensure general heading conversion function is used"""
+        return "convert_hN(%d): %s" % (n, text)


 def test_custom_conversion_functions():
@@ -23,7 +31,11 @@ def test_custom_conversion_functions():
    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
    assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'

-    assert md("<custom-tag>text</custom-tag>") == "FUNCTION USED: text"
+    assert md("<custom-tag>text</custom-tag>") == "convert_custom_tag(): text"
+
+    assert md("<h1>text</h1>") == "convert_h1: text"
+
+    assert md("<h3>text</h3>") == "convert_hN(3): text"


 def test_soup():
Author	SHA1	Message	Date
chrispy	2d0a14a2a7	allow BeautifulSoup configuration kwargs to be specified Signed-off-by: chrispy <chrispy@synopsys.com>	2025-06-14 07:24:19 -04:00
Chris Papademetrious	016251e915	ensure that explicitly provided heading conversion functions are used (#212) (#214) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-05-03 10:57:09 -04:00
Colin	0e1a849346	Add conversion support for <q> tags (#217)	2025-04-28 06:37:33 -04:00
Chris Papademetrious	e29de4e753	make convert_hn() public instead of internal (#213) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-04-20 06:20:01 -04:00
Vincent Kelleher	2d654a6b7e	Add beautiful_soup_parser option (#206) * add beautiful_soup_parser option * add Beautiful Soup parser argument to command line --------- Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr> Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>	2025-03-29 11:29:29 +01:00