allow BeautifulSoup configuration kwargs to be specified (#224)
Signed-off-by: chrispy <chrispy@synopsys.com>
This commit is contained in:
committed by
GitHub
parent
016251e915
commit
75ab3064dd
14
README.rst
14
README.rst
@@ -157,12 +157,16 @@ strip_document
|
||||
within the document are unaffected.
|
||||
Defaults to ``STRIP``.
|
||||
|
||||
beautiful_soup_parser
|
||||
Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such
|
||||
as `html5lib`, `lxml` or even a custom parser as long as it is installed on the execution
|
||||
environment. Defaults to ``html.parser``.
|
||||
bs4_options
|
||||
Specify additional configuration options for the ``BeautifulSoup`` object
|
||||
used to interpret the HTML markup. String and list values (such as ``lxml``
|
||||
or ``html5lib``) are treated as ``features`` arguments to control parser
|
||||
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
|
||||
are treated as full kwargs to be used for the BeautifulSoup constructor,
|
||||
allowing specification of any parameter. For parameter details, see the
|
||||
Beautiful Soup documentation at https://www.crummy.com/software/BeautifulSoup/bs4/doc/.
|
||||
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
|
||||
Options may be specified as kwargs to the ``markdownify`` function, or as a
|
||||
nested ``Options`` class in ``MarkdownConverter`` subclasses.
|
||||
|
||||
@@ -154,7 +154,7 @@ def _next_block_content_sibling(el):
|
||||
class MarkdownConverter(object):
|
||||
class DefaultOptions:
|
||||
autolinks = True
|
||||
beautiful_soup_parser = 'html.parser'
|
||||
bs4_options = 'html.parser'
|
||||
bullets = '*+-' # An iterable of bullet types.
|
||||
code_language = ''
|
||||
code_language_callback = None
|
||||
@@ -188,11 +188,15 @@ class MarkdownConverter(object):
|
||||
raise ValueError('You may specify either tags to strip or tags to'
|
||||
' convert, but not both.')
|
||||
|
||||
# If a string or list is passed to bs4_options, assume it is a 'features' specification
|
||||
if not isinstance(self.options['bs4_options'], dict):
|
||||
self.options['bs4_options'] = {'features': self.options['bs4_options']}
|
||||
|
||||
# Initialize the conversion function cache
|
||||
self.convert_fn_cache = {}
|
||||
|
||||
def convert(self, html):
|
||||
soup = BeautifulSoup(html, self.options['beautiful_soup_parser'])
|
||||
soup = BeautifulSoup(html, **self.options['bs4_options'])
|
||||
return self.convert_soup(soup)
|
||||
|
||||
def convert_soup(self, soup):
|
||||
|
||||
9
markdownify/main.py
Normal file → Executable file
9
markdownify/main.py
Normal file → Executable file
@@ -70,12 +70,11 @@ def main(argv=sys.argv[1:]):
|
||||
parser.add_argument('-w', '--wrap', action='store_true',
|
||||
help="Wrap all text paragraphs at --wrap-width characters.")
|
||||
parser.add_argument('--wrap-width', type=int, default=80)
|
||||
parser.add_argument('-p', '--beautiful-soup-parser',
|
||||
dest='beautiful_soup_parser',
|
||||
parser.add_argument('--bs4-options',
|
||||
default='html.parser',
|
||||
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
|
||||
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
|
||||
"environment.")
|
||||
help="Specifies the parser that BeautifulSoup should use to parse "
|
||||
"the HTML markup. Examples include 'html.parser', 'lxml', and "
|
||||
"'html5lib'.")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
print(markdownify(**vars(args)))
|
||||
|
||||
@@ -32,3 +32,9 @@ def test_strip_document():
|
||||
assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
|
||||
assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
|
||||
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
|
||||
|
||||
|
||||
def test_bs4_options():
|
||||
assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
|
||||
assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
|
||||
assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
|
||||
|
||||
Reference in New Issue
Block a user