allow BeautifulSoup configuration kwargs to be specified

Signed-off-by: chrispy <chrispy@synopsys.com>
2025-06-14 07:24:19 -04:00
11 changed files with 20 additions and 248 deletions
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -15,7 +15,7 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v2
    - name: Set up Python 3.8
      uses: actions/setup-python@v2
      with:
@@ -30,22 +30,3 @@ jobs:
    - name: Build
      run: |
        python -m build -nwsx .
-
-  types:
-
-    runs-on: ubuntu-latest
-
-    steps:
-    - uses: actions/checkout@v2
-    - name: Set up Python 3.8
-      uses: actions/setup-python@v2
-      with:
-        python-version: 3.8
-    - name: Install dependencies
-      run: |
-        python -m pip install --upgrade pip
-        pip install --upgrade setuptools setuptools_scm wheel build tox mypy types-beautifulsoup4
-    - name: Check types
-      run: |
-        mypy .
-        mypy --strict tests/types.py
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v4
+    - uses: actions/checkout@v2
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
--- a/README.rst
+++ b/README.rst
@@ -110,7 +110,7 @@ code_language_callback
  When the HTML code contains ``pre`` tags that in some way provide the code
  language, for example as class, this callback can be used to extract the
  language from the tag and prefix it to the converted ``pre`` tag.
-  The callback gets one single argument, a BeautifulSoup object, and returns
+  The callback gets one single argument, a BeautifulSoup object, and returns
  a string containing the code language, or ``None``.
  An example to use the class name as code language could be::

@@ -157,16 +157,10 @@ strip_document
  within the document are unaffected.
  Defaults to ``STRIP``.

-strip_pre
-  Controls whether leading/trailing blank lines are removed from ``<pre>``
-  tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
-  ``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
-  Defaults to ``STRIP``.
-
 bs4_options
  Specify additional configuration options for the ``BeautifulSoup`` object
-  used to interpret the HTML markup. String and list values (such as ``lxml``
-  or ``html5lib``) are treated as ``features`` arguments to control parser
+  used to interpret the HTML markup. String and list values (such as ``lxml``)
+  are treated as ``features`` arguments to control parser
  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
  are treated as full kwargs to be used for the BeautifulSoup constructor,
  allowing specification of any parameter. For parameter details, see the
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -11,10 +11,6 @@ re_whitespace = re.compile(r'[\t ]+')
 re_all_whitespace = re.compile(r'[\t \r\n]+')
 re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
 re_html_heading = re.compile(r'h(\d+)')
-re_pre_lstrip1 = re.compile(r'^ *\n')
-re_pre_rstrip1 = re.compile(r'\n *$')
-re_pre_lstrip = re.compile(r'^[ \n]*\n')
-re_pre_rstrip = re.compile(r'[ \n]*$')

 # Pattern for creating convert_<tag> function names from tag names
 re_make_convert_fn_name = re.compile(r'[\[\]:-]')
@@ -41,9 +37,6 @@ re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
 # confused with a list item
 re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')

-# Find consecutive backtick sequences in a string
-re_backtick_runs = re.compile(r'`+')
-
 # Heading styles
 ATX = 'atx'
 ATX_CLOSED = 'atx_closed'
@@ -58,25 +51,10 @@ BACKSLASH = 'backslash'
 ASTERISK = '*'
 UNDERSCORE = '_'

-# Document/pre strip styles
+# Document strip styles
 LSTRIP = 'lstrip'
 RSTRIP = 'rstrip'
 STRIP = 'strip'
-STRIP_ONE = 'strip_one'
-
-
-def strip1_pre(text):
-    """Strip one leading and trailing newline from a <pre> string."""
-    text = re_pre_lstrip1.sub('', text)
-    text = re_pre_rstrip1.sub('', text)
-    return text
-
-
-def strip_pre(text):
-    """Strip all leading and trailing newlines from a <pre> string."""
-    text = re_pre_lstrip.sub('', text)
-    text = re_pre_rstrip.sub('', text)
-    return text


 def chomp(text):
@@ -190,7 +168,6 @@ class MarkdownConverter(object):
        newline_style = SPACES
        strip = None
        strip_document = STRIP
-        strip_pre = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
@@ -483,24 +460,10 @@ class MarkdownConverter(object):
            return '  \n'

    def convert_code(self, el, text, parent_tags):
-        if '_noformat' in parent_tags:
+        if 'pre' in parent_tags:
            return text
-
-        prefix, suffix, text = chomp(text)
-        if not text:
-            return ''
-
-        # Find the maximum number of consecutive backticks in the text, then
-        # delimit the code span with one more backtick than that
-        max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
-        markup_delimiter = '`' * (max_backticks + 1)
-
-        # If the maximum number of backticks is greater than zero, add a space
-        # to avoid interpretation of inside backticks as literals
-        if max_backticks > 0:
-            text = " " + text + " "
-
-        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
+        converter = abstract_inline_conversion(lambda self: '`')
+        return converter(self, el, text, parent_tags)

    convert_del = abstract_inline_conversion(lambda self: '~~')

@@ -693,15 +656,6 @@ class MarkdownConverter(object):
        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

-        if self.options['strip_pre'] == STRIP:
-            text = strip_pre(text)  # remove all leading/trailing newlines
-        elif self.options['strip_pre'] == STRIP_ONE:
-            text = strip1_pre(text)  # remove one leading/trailing newline
-        elif self.options['strip_pre'] is None:
-            pass  # leave leading and trailing newlines as-is
-        else:
-            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
-
        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

    def convert_q(self, el, text, parent_tags):
@@ -735,13 +689,13 @@ class MarkdownConverter(object):
    def convert_td(self, el, text, parent_tags):
        colspan = 1
        if 'colspan' in el.attrs and el['colspan'].isdigit():
-            colspan = max(1, min(1000, int(el['colspan'])))
+            colspan = int(el['colspan'])
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_th(self, el, text, parent_tags):
        colspan = 1
        if 'colspan' in el.attrs and el['colspan'].isdigit():
-            colspan = max(1, min(1000, int(el['colspan'])))
+            colspan = int(el['colspan'])
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

    def convert_tr(self, el, text, parent_tags):
@@ -762,7 +716,7 @@ class MarkdownConverter(object):
        full_colspan = 0
        for cell in cells:
            if 'colspan' in cell.attrs and cell['colspan'].isdigit():
-                full_colspan += max(1, min(1000, int(cell['colspan'])))
+                full_colspan += int(cell["colspan"])
            else:
                full_colspan += 1
        if ((is_headrow
--- a/markdownify/init.pyi
+++ b/markdownify/init.pyi
@@ -1,77 +0,0 @@
-from _typeshed import Incomplete
-from typing import Callable, Union
-
-ATX: str
-ATX_CLOSED: str
-UNDERLINED: str
-SETEXT = UNDERLINED
-SPACES: str
-BACKSLASH: str
-ASTERISK: str
-UNDERSCORE: str
-LSTRIP: str
-RSTRIP: str
-STRIP: str
-STRIP_ONE: str
-
-
-def markdownify(
-    html: str,
-    autolinks: bool = ...,
-    bs4_options: str = ...,
-    bullets: str = ...,
-    code_language: str = ...,
-    code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
-    convert: Union[list[str], None] = ...,
-    default_title: bool = ...,
-    escape_asterisks: bool = ...,
-    escape_underscores: bool = ...,
-    escape_misc: bool = ...,
-    heading_style: str = ...,
-    keep_inline_images_in: list[str] = ...,
-    newline_style: str = ...,
-    strip: Union[list[str], None] = ...,
-    strip_document: Union[str, None] = ...,
-    strip_pre: str = ...,
-    strong_em_symbol: str = ...,
-    sub_symbol: str = ...,
-    sup_symbol: str = ...,
-    table_infer_header: bool = ...,
-    wrap: bool = ...,
-    wrap_width: int = ...,
-) -> str: ...
-
-
-class MarkdownConverter:
-    def __init__(
-        self,
-        autolinks: bool = ...,
-        bs4_options: str = ...,
-        bullets: str = ...,
-        code_language: str = ...,
-        code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
-        convert: Union[list[str], None] = ...,
-        default_title: bool = ...,
-        escape_asterisks: bool = ...,
-        escape_underscores: bool = ...,
-        escape_misc: bool = ...,
-        heading_style: str = ...,
-        keep_inline_images_in: list[str] = ...,
-        newline_style: str = ...,
-        strip: Union[list[str], None] = ...,
-        strip_document: Union[str, None] = ...,
-        strip_pre: str = ...,
-        strong_em_symbol: str = ...,
-        sub_symbol: str = ...,
-        sup_symbol: str = ...,
-        table_infer_header: bool = ...,
-        wrap: bool = ...,
-        wrap_width: int = ...,
-    ) -> None:
-        ...
-  
-    def convert(self, html: str) -> str:
-        ...
-
-    def convert_soup(self, soup: Incomplete) -> str:
-        ...
--- a/markdownify/main.py
+++ b/markdownify/main.py
@@ -70,11 +70,12 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('--bs4-options',
+    parser.add_argument('-p', '--beautiful-soup-parser',
+                        dest='beautiful_soup_parser',
                        default='html.parser',
-                        help="Specifies the parser that BeautifulSoup should use to parse "
-                             "the HTML markup. Examples include 'html5.parser', 'lxml', and "
-                             "'html5lib'.")
+                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
+                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
+                             "environment.")

    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))
--- a/markdownify/py.typed
+++ b/markdownify/py.typed
@@ -1 +0,0 @@
-
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "markdownify"
-version = "1.2.2"
+version = "1.1.0"
 authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 description = "Convert HTML to markdown."
 readme = "README.rst"
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -2,7 +2,7 @@
 Test whitelisting/blacklisting of specific tags.

 """
-from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
 from .utils import md


@@ -34,13 +34,6 @@ def test_strip_document():
    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"


-def test_strip_pre():
-    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>") == "```\n  Hello\n```"
-    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=STRIP) == "```\n  Hello\n```"
-    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=STRIP_ONE) == "```\n  \n  Hello  \n  \n```"
-    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=None) == "```\n  \n  \n  Hello  \n  \n  \n```"
-
-
 def bs4_options():
    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -101,9 +101,6 @@ def test_code():
    assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
    assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
    assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
-    assert md('foo<code>`bar`</code>baz') == 'foo`` `bar` ``baz'
-    assert md('foo<code>``bar``</code>baz') == 'foo``` ``bar`` ```baz'
-    assert md('foo<code> `bar` </code>baz') == 'foo `` `bar` `` baz'


 def test_dl():
@@ -373,4 +370,4 @@ def test_spaces():
    assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
    assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
    assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
-    assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'
+    assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo \n```\n\nbar'
--- a/tests/types.py
+++ b/tests/types.py
@@ -1,70 +0,0 @@
-from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
-from bs4 import BeautifulSoup
-from typing import Union
-
-markdownify("<p>Hello</p>") == "Hello"  # test default of STRIP
-markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
-markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
-markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
-markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
-
-# default options
-MarkdownConverter(
-    autolinks=True,
-    bs4_options='html.parser',
-    bullets='*+-',
-    code_language='',
-    code_language_callback=None,
-    convert=None,
-    default_title=False,
-    escape_asterisks=True,
-    escape_underscores=True,
-    escape_misc=False,
-    heading_style=UNDERLINED,
-    keep_inline_images_in=[],
-    newline_style=SPACES,
-    strip=None,
-    strip_document=STRIP,
-    strip_pre=STRIP,
-    strong_em_symbol=ASTERISK,
-    sub_symbol='',
-    sup_symbol='',
-    table_infer_header=False,
-    wrap=False,
-    wrap_width=80,
-).convert("")
-
-# custom options
-MarkdownConverter(
-    strip_document=None,
-    bullets="-",
-    escape_asterisks=True,
-    escape_underscores=True,
-    escape_misc=True,
-    autolinks=True,
-    default_title=True,
-    newline_style=BACKSLASH,
-    sup_symbol='^',
-    sub_symbol='^',
-    keep_inline_images_in=['h3'],
-    wrap=True,
-    wrap_width=80,
-    strong_em_symbol=UNDERSCORE,
-    code_language='python',
-    code_language_callback=None
-).convert("")
-
-html = '<b>test</b>'
-soup = BeautifulSoup(html, 'html.parser')
-MarkdownConverter().convert_soup(soup) == '**test**'
-
-
-def callback(el: BeautifulSoup) -> Union[str, None]:
-    return el['class'][0] if el.has_attr('class') else None
-
-
-MarkdownConverter(code_language_callback=callback).convert("")
-MarkdownConverter(code_language_callback=lambda el: None).convert("")
-
-markdownify('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback)
-markdownify('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=lambda el: None)