Add beautiful_soup_parser option (#206)

* add beautiful_soup_parser option
* add Beautiful Soup parser argument to command line

---------

Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr>
Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
2025-03-29 11:29:29 +01:00
parent 13183f9925
commit 2d654a6b7e
3 changed files with 18 additions and 2 deletions
--- a/README.rst
+++ b/README.rst
@@ -157,6 +157,13 @@ strip_document
  within the document are unaffected.
  Defaults to ``STRIP``.

+beautiful_soup_parser
+  Specify the Beautiful Soup parser to be used for interpreting HTML markup. Any parser
+  such as ``html5lib`` or ``lxml``, or even a custom parser, may be used as long as it is
+  installed in the execution environment. Defaults to ``html.parser``.
+
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
+
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.

--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -154,6 +154,7 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
+        beautiful_soup_parser = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
@@ -191,7 +192,7 @@ class MarkdownConverter(object):
        self.convert_fn_cache = {}

    def convert(self, html):
-        soup = BeautifulSoup(html, 'html.parser')
+        soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
        return self.convert_soup(soup)

    def convert_soup(self, soup):
--- a/markdownify/__main__.py
+++ b/markdownify/__main__.py
@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
                        action='store_false',
                        help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
+    parser.add_argument('-i', '--keep-inline-images-in',
+                        default=[],
+                        nargs='*',
                        help="Images are converted to their alt-text when the images are "
                        "located inside headlines or table cells. If some inline images "
                        "should be converted to markdown images instead, this option can "
@@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
+    parser.add_argument('-p', '--beautiful-soup-parser',
+                        dest='beautiful_soup_parser',
+                        default='html.parser',
+                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
+                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
+                             "environment.")

    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))