Add beautiful_soup_parser option (#206)
* add beautiful_soup_parser option * add Beautiful Soup parser argument to command line --------- Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr> Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
This commit is contained in:
@@ -157,6 +157,13 @@ strip_document
|
||||
within the document are unaffected.
|
||||
Defaults to ``STRIP``.
|
||||
|
||||
beautiful_soup_parser
|
||||
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
|
||||
as ``html5lib`` or ``lxml``, or even a custom parser, can be used as long as they are installed in the execution
|
||||
environment. Defaults to ``html.parser``.
|
||||
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
||||
|
||||
Options may be specified as kwargs to the ``markdownify`` function, or as a
|
||||
nested ``Options`` class in ``MarkdownConverter`` subclasses.
|
||||
|
||||
|
||||
@@ -154,6 +154,7 @@ def _next_block_content_sibling(el):
|
||||
class MarkdownConverter(object):
|
||||
class DefaultOptions:
|
||||
autolinks = True
|
||||
beautiful_soup_parser = 'html.parser'
|
||||
bullets = '*+-' # An iterable of bullet types.
|
||||
code_language = ''
|
||||
code_language_callback = None
|
||||
@@ -191,7 +192,7 @@ class MarkdownConverter(object):
|
||||
self.convert_fn_cache = {}
|
||||
|
||||
def convert(self, html):
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
|
||||
return self.convert_soup(soup)
|
||||
|
||||
def convert_soup(self, soup):
|
||||
|
||||
@@ -55,7 +55,9 @@ def main(argv=sys.argv[1:]):
|
||||
parser.add_argument('--no-escape-underscores', dest='escape_underscores',
|
||||
action='store_false',
|
||||
help="Do not escape '_' to '\\_' in text.")
|
||||
parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
|
||||
parser.add_argument('-i', '--keep-inline-images-in',
|
||||
default=[],
|
||||
nargs='*',
|
||||
help="Images are converted to their alt-text when the images are "
|
||||
"located inside headlines or table cells. If some inline images "
|
||||
"should be converted to markdown images instead, this option can "
|
||||
@@ -68,6 +70,12 @@ def main(argv=sys.argv[1:]):
|
||||
parser.add_argument('-w', '--wrap', action='store_true',
|
||||
help="Wrap all text paragraphs at --wrap-width characters.")
|
||||
parser.add_argument('--wrap-width', type=int, default=80)
|
||||
parser.add_argument('-p', '--beautiful-soup-parser',
|
||||
dest='beautiful_soup_parser',
|
||||
default='html.parser',
|
||||
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
|
||||
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
|
||||
"environment.")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
print(markdownify(**vars(args)))
|
||||
|
||||
Reference in New Issue
Block a user