allow BeautifulSoup configuration kwargs to be specified

Signed-off-by: chrispy <chrispy@synopsys.com>
2025-06-14 07:24:19 -04:00
10 changed files with 20 additions and 247 deletions
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v2
    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
@@ -30,22 +30,3 @@ jobs:
    - name: Build
      run: |
        python -m build -nwsx .
  types:
    runs-on: ubuntu-latest
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
        python-version: 3.8
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
        pip install --upgrade setuptools setuptools_scm wheel build tox mypy types-beautifulsoup4
    - name: Check types
      run: |
        mypy .
        mypy --strict tests/types.py
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest
    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
--- a/README.rst
+++ b/README.rst
@@ -110,7 +110,7 @@ code_language_callback
  When the HTML code contains ``pre`` tags that in some way provide the code
  language, for example as class, this callback can be used to extract the
  language from the tag and prefix it to the converted ``pre`` tag.
-  The callback gets one single argument, a BeautifulSoup object, and returns
+  The callback gets one single argument, a BeautifulSoup object, and returns
  a string containing the code language, or ``None``.
  An example to use the class name as code language could be::
@@ -157,16 +157,10 @@ strip_document
  within the document are unaffected.
  Defaults to ``STRIP``.
 strip_pre
  Controls whether leading/trailing blank lines are removed from ``<pre>``
  tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
  ``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
  Defaults to ``STRIP``.
 bs4_options
  Specify additional configuration options for the ``BeautifulSoup`` object
-  used to interpret the HTML markup. String and list values (such as ``lxml``
+  used to interpret the HTML markup. String and list values (such as ``lxml``)
-  or ``html5lib``) are treated as ``features`` arguments to control parser
+  are treated as ``features`` arguments to control parser
  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
  are treated as full kwargs to be used for the BeautifulSoup constructor,
  allowing specification of any parameter. For parameter details, see the
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -11,10 +11,6 @@ re_whitespace = re.compile(r'[\t ]+')
 re_all_whitespace = re.compile(r'[\t \r\n]+')
 re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 re_html_heading = re.compile(r'h(\d+)')
 re_pre_lstrip1 = re.compile(r'^ *\n')
 re_pre_rstrip1 = re.compile(r'\n *$')
 re_pre_lstrip = re.compile(r'^[ \n]*\n')
 re_pre_rstrip = re.compile(r'[ \n]*$')
 # Pattern for creating convert_<tag> function names from tag names
 re_make_convert_fn_name = re.compile(r'[\[\]:-]')
@@ -41,9 +37,6 @@ re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
 # confused with a list item
 re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
 # Find consecutive backtick sequences in a string
 re_backtick_runs = re.compile(r'`+')
 # Heading styles
 ATX = 'atx'
 ATX_CLOSED = 'atx_closed'
@@ -58,25 +51,10 @@ BACKSLASH = 'backslash'
 ASTERISK = '*'
 UNDERSCORE = '_'
-# Document/pre strip styles
+# Document strip styles
 LSTRIP = 'lstrip'
 RSTRIP = 'rstrip'
 STRIP = 'strip'
 STRIP_ONE = 'strip_one'
 def strip1_pre(text):
    """Return text with the re_pre_lstrip1 and re_pre_rstrip1 matches removed.

    Intended to drop one leading and one trailing blank line of a <pre> string.
    """
    # Leading pattern first, then the trailing pattern on that result.
    return re_pre_rstrip1.sub('', re_pre_lstrip1.sub('', text))
 def strip_pre(text):
    """Return text with the re_pre_lstrip and re_pre_rstrip matches removed.

    Intended to drop all leading and trailing blank lines of a <pre> string.
    """
    # Leading pattern first, then the trailing pattern on that result.
    return re_pre_rstrip.sub('', re_pre_lstrip.sub('', text))
 def chomp(text):
@@ -190,7 +168,6 @@ class MarkdownConverter(object):
        newline_style = SPACES
        strip = None
        strip_document = STRIP
        strip_pre = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
@@ -483,24 +460,10 @@ class MarkdownConverter(object):
            return '  \n'
    def convert_code(self, el, text, parent_tags):
-        if '_noformat' in parent_tags:
+        if 'pre' in parent_tags:
            return text
-
+        converter = abstract_inline_conversion(lambda self: '`')
-        prefix, suffix, text = chomp(text)
+        return converter(self, el, text, parent_tags)
        if not text:
            return ''
        # Find the maximum number of consecutive backticks in the text, then
        # delimit the code span with one more backtick than that
        max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
        markup_delimiter = '`' * (max_backticks + 1)
        # If the maximum number of backticks is greater than zero, add a space
        # to avoid interpretation of inside backticks as literals
        if max_backticks > 0:
            text = " " + text + " "
        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
    convert_del = abstract_inline_conversion(lambda self: '~~')
@@ -693,15 +656,6 @@ class MarkdownConverter(object):
        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language
        if self.options['strip_pre'] == STRIP:
            text = strip_pre(text)  # remove all leading/trailing newlines
        elif self.options['strip_pre'] == STRIP_ONE:
            text = strip1_pre(text)  # remove one leading/trailing newline
        elif self.options['strip_pre'] is None:
            pass  # leave leading and trailing newlines as-is
        else:
            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
    def convert_q(self, el, text, parent_tags):
@@ -735,13 +689,13 @@ class MarkdownConverter(object):
    def convert_td(self, el, text, parent_tags):
        """Render a ``<td>`` element as one Markdown table cell.

        The cell text is stripped, its newlines are replaced by spaces, and
        it is followed by one `` |`` separator per spanned column.

        :param el: cell element; its ``colspan`` attribute is used when it is
            present and consists only of digits.
        :param text: text of the cell contents.
        :param parent_tags: not used by this method.
        :returns: the cell text followed by its column separators.
        """
        colspan = 1
        if 'colspan' in el.attrs and el['colspan'].isdigit():
            # colspan is taken from the input HTML, so clamp it to 1..1000:
            # "0" would otherwise emit no separator at all, and a very large
            # value would make the ' |' repetition below grow without bound.
            colspan = max(1, min(1000, int(el['colspan'])))
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
    def convert_th(self, el, text, parent_tags):
        """Render a ``<th>`` element as one Markdown table header cell.

        The cell text is stripped, its newlines are replaced by spaces, and
        it is followed by one `` |`` separator per spanned column.

        :param el: cell element; its ``colspan`` attribute is used when it is
            present and consists only of digits.
        :param text: text of the cell contents.
        :param parent_tags: not used by this method.
        :returns: the cell text followed by its column separators.
        """
        colspan = 1
        if 'colspan' in el.attrs and el['colspan'].isdigit():
            # colspan is taken from the input HTML, so clamp it to 1..1000:
            # "0" would otherwise emit no separator at all, and a very large
            # value would make the ' |' repetition below grow without bound.
            colspan = max(1, min(1000, int(el['colspan'])))
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
    def convert_tr(self, el, text, parent_tags):
@@ -762,7 +716,7 @@ class MarkdownConverter(object):
        full_colspan = 0
        for cell in cells:
            if 'colspan' in cell.attrs and cell['colspan'].isdigit():
-                full_colspan += max(1, min(1000, int(cell['colspan'])))
+                full_colspan += int(cell["colspan"])
            else:
                full_colspan += 1
        if ((is_headrow
--- a/markdownify/init.pyi
+++ b/markdownify/init.pyi
@@ -1,77 +0,0 @@
 from _typeshed import Incomplete
 from typing import Callable, Union
 ATX: str
 ATX_CLOSED: str
 UNDERLINED: str
 SETEXT = UNDERLINED
 SPACES: str
 BACKSLASH: str
 ASTERISK: str
 UNDERSCORE: str
 LSTRIP: str
 RSTRIP: str
 STRIP: str
 STRIP_ONE: str
 # Stub for the markdownify() function: takes the HTML string plus the
 # converter options as keyword arguments and returns a string.
 def markdownify(
    html: str,
    autolinks: bool = ...,
    # A parser/feature name, a list of them, or a dict of keyword arguments
    # for the BeautifulSoup constructor.
    bs4_options: Union[str, list[str], dict[str, Incomplete]] = ...,
    bullets: str = ...,
    code_language: str = ...,
    code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
    convert: Union[list[str], None] = ...,
    default_title: bool = ...,
    escape_asterisks: bool = ...,
    escape_underscores: bool = ...,
    escape_misc: bool = ...,
    heading_style: str = ...,
    keep_inline_images_in: list[str] = ...,
    newline_style: str = ...,
    strip: Union[list[str], None] = ...,
    strip_document: Union[str, None] = ...,
    # STRIP, STRIP_ONE, or None to leave <pre> blank lines untouched.
    strip_pre: Union[str, None] = ...,
    strong_em_symbol: str = ...,
    sub_symbol: str = ...,
    sup_symbol: str = ...,
    table_infer_header: bool = ...,
    wrap: bool = ...,
    wrap_width: int = ...,
 ) -> str: ...
 # Stub for the converter class: options are given to __init__ as keyword
 # arguments; convert() takes an HTML string and convert_soup() takes a soup
 # object, and both return a string.
 class MarkdownConverter:
    def __init__(
        self,
        autolinks: bool = ...,
        # A parser/feature name, a list of them, or a dict of keyword
        # arguments for the BeautifulSoup constructor.
        bs4_options: Union[str, list[str], dict[str, Incomplete]] = ...,
        bullets: str = ...,
        code_language: str = ...,
        code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
        convert: Union[list[str], None] = ...,
        default_title: bool = ...,
        escape_asterisks: bool = ...,
        escape_underscores: bool = ...,
        escape_misc: bool = ...,
        heading_style: str = ...,
        keep_inline_images_in: list[str] = ...,
        newline_style: str = ...,
        strip: Union[list[str], None] = ...,
        strip_document: Union[str, None] = ...,
        # STRIP, STRIP_ONE, or None to leave <pre> blank lines untouched.
        strip_pre: Union[str, None] = ...,
        strong_em_symbol: str = ...,
        sub_symbol: str = ...,
        sup_symbol: str = ...,
        table_infer_header: bool = ...,
        wrap: bool = ...,
        wrap_width: int = ...,
    ) -> None:
        ...
    def convert(self, html: str) -> str:
        ...
    def convert_soup(self, soup: Incomplete) -> str:
        ...
--- a/markdownify/main.py
+++ b/markdownify/main.py
@@ -70,11 +70,12 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('--bs4-options',
+    parser.add_argument('-p', '--beautiful-soup-parser',
                        dest='beautiful_soup_parser',
                        default='html.parser',
-                        help="Specifies the parser that BeautifulSoup should use to parse "
+                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
-                             "the HTML markup. Examples include 'html5.parser', 'lxml', and "
+                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
-                             "'html5lib'.")
+                             "environment.")
    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
 [project]
 name = "markdownify"
-version = "1.2.0"
+version = "1.1.0"
 authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 description = "Convert HTML to markdown."
 readme = "README.rst"
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -2,7 +2,7 @@
 Test whitelisting/blacklisting of specific tags.
 """
-from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
 from .utils import md
@@ -34,13 +34,6 @@ def test_strip_document():
    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
 def test_strip_pre():
    """Check the <pre> output for the default, STRIP, STRIP_ONE and None values of strip_pre."""
    html = "<pre>  \n  \n  Hello  \n  \n  </pre>"
    # (keyword options, expected Markdown) pairs, checked in order.
    cases = [
        ({}, "```\n  Hello\n```"),
        ({'strip_pre': STRIP}, "```\n  Hello\n```"),
        ({'strip_pre': STRIP_ONE}, "```\n  \n  Hello  \n  \n```"),
        ({'strip_pre': None}, "```\n  \n  \n  Hello  \n  \n  \n```"),
    ]
    for options, expected in cases:
        assert markdownify(html, **options) == expected
 def bs4_options():
    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -101,9 +101,6 @@ def test_code():
    assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
    assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
    assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
    assert md('foo<code>`bar`</code>baz') == 'foo`` `bar` ``baz'
    assert md('foo<code>``bar``</code>baz') == 'foo``` ``bar`` ```baz'
    assert md('foo<code> `bar` </code>baz') == 'foo `` `bar` `` baz'
 def test_dl():
--- a/tests/types.py
+++ b/tests/types.py
@@ -1,70 +0,0 @@
 from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
 from bs4 import BeautifulSoup
 from typing import Union
 # The statements below only exercise the public call signatures: each
 # comparison result is discarded, so nothing is asserted at runtime.
 # NOTE(review): the workflow runs `mypy --strict tests/types.py`, so this
 # file appears to exist for type checking only - confirm.

 # strip_document accepts the LSTRIP / RSTRIP / STRIP constants and None.
 markdownify("<p>Hello</p>") == "Hello"  # test default of STRIP
 markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
 markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
 markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
 markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
 # default options
 # Every keyword option is passed explicitly, then an empty document is converted.
 MarkdownConverter(
    autolinks=True,
    bs4_options='html.parser',
    bullets='*+-',
    code_language='',
    code_language_callback=None,
    convert=None,
    default_title=False,
    escape_asterisks=True,
    escape_underscores=True,
    escape_misc=False,
    heading_style=UNDERLINED,
    keep_inline_images_in=[],
    newline_style=SPACES,
    strip=None,
    strip_document=STRIP,
    strip_pre=STRIP,
    strong_em_symbol=ASTERISK,
    sub_symbol='',
    sup_symbol='',
    table_infer_header=False,
    wrap=False,
    wrap_width=80,
 ).convert("")
 # custom options
 # A subset of the options, given in a different order and with other values.
 MarkdownConverter(
    strip_document=None,
    bullets="-",
    escape_asterisks=True,
    escape_underscores=True,
    escape_misc=True,
    autolinks=True,
    default_title=True,
    newline_style=BACKSLASH,
    sup_symbol='^',
    sub_symbol='^',
    keep_inline_images_in=['h3'],
    wrap=True,
    wrap_width=80,
    strong_em_symbol=UNDERSCORE,
    code_language='python',
    code_language_callback=None
 ).convert("")
 # convert_soup() is called with an already-parsed BeautifulSoup object
 # instead of an HTML string.
 html = '<b>test</b>'
 soup = BeautifulSoup(html, 'html.parser')
 MarkdownConverter().convert_soup(soup) == '**test**'
 def callback(el: BeautifulSoup) -> Union[str, None]:
    """Return the first entry of el's ``class`` attribute, or None when el has no such attribute."""
    if not el.has_attr('class'):
        return None
    return el['class'][0]
 # code_language_callback is given both a named function and a lambda, once
 # through the MarkdownConverter constructor and once through markdownify().
 # The return values are discarded.
 MarkdownConverter(code_language_callback=callback).convert("")
 MarkdownConverter(code_language_callback=lambda el: None).convert("")
 markdownify('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback)
 markdownify('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=lambda el: None)