Add beautiful_soup_parser option (#206)

* add beautiful_soup_parser option
* add Beautiful Soup parser argument to command line

---------

Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr>
Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
This commit is contained in:
Vincent Kelleher
2025-03-29 11:29:29 +01:00
committed by GitHub
parent 13183f9925
commit 2d654a6b7e
3 changed files with 18 additions and 2 deletions

View File

@@ -157,6 +157,13 @@ strip_document
within the document are unaffected.
Defaults to ``STRIP``.
beautiful_soup_parser
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
as ``html5lib``, ``lxml`` or even a custom parser may be used, as long as they are installed
in the execution environment. Defaults to ``html.parser``.
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.

View File

@@ -154,6 +154,7 @@ def _next_block_content_sibling(el):
class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
beautiful_soup_parser = 'html.parser'
bullets = '*+-' # An iterable of bullet types.
code_language = ''
code_language_callback = None
@@ -191,7 +192,7 @@ class MarkdownConverter(object):
self.convert_fn_cache = {}
def convert(self, html):
soup = BeautifulSoup(html, 'html.parser')
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
return self.convert_soup(soup)
def convert_soup(self, soup):

View File

@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
parser.add_argument('--no-escape-underscores', dest='escape_underscores',
action='store_false',
help="Do not escape '_' to '\\_' in text.")
parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
parser.add_argument('-i', '--keep-inline-images-in',
default=[],
nargs='*',
help="Images are converted to their alt-text when the images are "
"located inside headlines or table cells. If some inline images "
"should be converted to markdown images instead, this option can "
@@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]):
parser.add_argument('-w', '--wrap', action='store_true',
help="Wrap all text paragraphs at --wrap-width characters.")
parser.add_argument('--wrap-width', type=int, default=80)
parser.add_argument('-p', '--beautiful-soup-parser',
dest='beautiful_soup_parser',
default='html.parser',
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
"environment.")
args = parser.parse_args(argv)
print(markdownify(**vars(args)))