Merge branch 'develop'

2025-02-24 16:20:32 -05:00 · 2024-11-24 23:05:17 +01:00 · 2024-11-24 12:26:53 +01:00 · 2024-07-14 22:40:15 +02:00 · 2024-07-14 21:20:04 +02:00 · 2024-03-26 21:56:09 +01:00
8 changed files with 33 additions and 141 deletions
--- a/README.rst
+++ b/README.rst
@@ -157,17 +157,6 @@ strip_document
  within the document are unaffected.
  Defaults to ``STRIP``.

-bs4_options
-  Specify additional configuration options for the ``BeautifulSoup`` object
-  used to interpret the HTML markup. String and list values (such as ``lxml``)
-  are treated as ``features`` parameter arguments to control parser
-  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
-  are treated as full kwargs to be used for the BeautifulSoup constructor,
-  allowing specification of any parameter. For parameter details, see the
-  Beautiful Soup documentation at:
-
-.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
-
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.

--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -106,7 +106,6 @@ def should_remove_whitespace_inside(el):
    return el.name in ('p', 'blockquote',
                       'article', 'div', 'section',
                       'ol', 'ul', 'li',
-                       'dl', 'dt', 'dd',
                       'table', 'thead', 'tbody', 'tfoot',
                       'tr', 'td', 'th')

@@ -154,7 +153,6 @@ def _next_block_content_sibling(el):
 class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
-        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
@@ -188,15 +186,11 @@ class MarkdownConverter(object):
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

-        # If a string or list is passed to bs4_options, assume it is a 'features' specification
-        if not isinstance(self.options['bs4_options'], dict):
-            self.options['bs4_options'] = {'features': self.options['bs4_options']}
-
        # Initialize the conversion function cache
        self.convert_fn_cache = {}

    def convert(self, html):
-        soup = BeautifulSoup(html, **self.options['bs4_options'])
+        soup = BeautifulSoup(html, 'html.parser')
        return self.convert_soup(soup)

    def convert_soup(self, soup):
@@ -367,20 +361,16 @@ class MarkdownConverter(object):
        if not self.should_convert_tag(tag_name):
            return None

-        # Look for an explicitly defined conversion function by tag name first
-        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
-        convert_fn = getattr(self, convert_fn_name, None)
-        if convert_fn:
-            return convert_fn
-
-        # If tag is any heading, handle with convert_hN() function
+        # Handle headings with _convert_hn() function
        match = re_html_heading.match(tag_name)
        if match:
-            n = int(match.group(1))  # get value of N from <hN>
-            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
+            n = int(match.group(1))
+            return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags)

-        # No conversion function was found
-        return None
+        # For other tags, look up their conversion function by tag name
+        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', tag_name)
+        convert_fn = getattr(self, convert_fn_name, None)
+        return convert_fn

    def should_convert_tag(self, tag):
        """Given a tag name, return whether to convert based on strip/convert options."""
@@ -452,7 +442,7 @@ class MarkdownConverter(object):

    def convert_br(self, el, text, parent_tags):
        if '_inline' in parent_tags:
-            return ' '
+            return ""

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
@@ -499,11 +489,6 @@ class MarkdownConverter(object):

        return '%s\n' % text

-    # definition lists are formatted as follows:
-    #   https://pandoc.org/MANUAL.html#definition-lists
-    #   https://michelf.ca/projects/php-markdown/extra/#def-list
-    convert_dl = convert_div
-
    def convert_dt(self, el, text, parent_tags):
        # remove newlines from term text
        text = (text or '').strip()
@@ -516,14 +501,14 @@ class MarkdownConverter(object):
        # TODO - format consecutive <dt> elements as directly adjacent lines):
        #   https://michelf.ca/projects/php-markdown/extra/#def-list

-        return '\n\n%s\n' % text
+        return '\n%s\n' % text

-    def convert_hN(self, n, el, text, parent_tags):
-        # convert_hN() converts <hN> tags, where N is any integer
+    def _convert_hn(self, n, el, text, parent_tags):
+        """Method name is prefixed with _ so that an <hn> tag cannot dispatch to it."""
        if '_inline' in parent_tags:
            return text

-        # Markdown does not support heading depths of n > 6
+        # prevent MemoryErrors in case of very large n
        n = max(1, min(6, n))

        style = self.options['heading_style'].lower()
@@ -553,24 +538,6 @@ class MarkdownConverter(object):

        return '![%s](%s%s)' % (alt, src, title_part)

-    def convert_video(self, el, text, parent_tags):
-        if ('_inline' in parent_tags
-                and el.parent.name not in self.options['keep_inline_images_in']):
-            return text
-        src = el.attrs.get('src', None) or ''
-        if not src:
-            sources = el.find_all('source', attrs={'src': True})
-            if sources:
-                src = sources[0].attrs.get('src', None) or ''
-        poster = el.attrs.get('poster', None) or ''
-        if src and poster:
-            return '[![%s](%s)](%s)' % (text, poster, src)
-        if src:
-            return '[%s](%s)' % (text, src)
-        if poster:
-            return '![%s](%s)' % (text, poster)
-        return text
-
    def convert_list(self, el, text, parent_tags):

        # Converting a list to inline is undefined.
@@ -658,9 +625,6 @@ class MarkdownConverter(object):

        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)

-    def convert_q(self, el, text, parent_tags):
-        return '"' + text + '"'
-
    def convert_script(self, el, text, parent_tags):
        return ''

@@ -713,12 +677,6 @@ class MarkdownConverter(object):
        )
        overline = ''
        underline = ''
-        full_colspan = 0
-        for cell in cells:
-            if 'colspan' in cell.attrs and cell['colspan'].isdigit():
-                full_colspan += int(cell["colspan"])
-            else:
-                full_colspan += 1
        if ((is_headrow
             or (is_head_row_missing
                 and self.options['table_infer_header']))
@@ -727,6 +685,12 @@ class MarkdownConverter(object):
            # - is headline or
            # - headline is missing and header inference is enabled
            # print headline underline
+            full_colspan = 0
+            for cell in cells:
+                if 'colspan' in cell.attrs and cell['colspan'].isdigit():
+                    full_colspan += int(cell["colspan"])
+                else:
+                    full_colspan += 1
            underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        elif ((is_head_row_missing
               and not self.options['table_infer_header'])
@@ -739,8 +703,8 @@ class MarkdownConverter(object):
            #  - the parent is table or
            #  - the parent is tbody at the beginning of a table.
            # print empty headline above this row
-            overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'
-            overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
+            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
+            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline


--- a/markdownify/main.py
+++ b/markdownify/main.py
@@ -55,9 +55,7 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
                        action='store_false',
                        help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in',
-                        default=[],
-                        nargs='*',
+    parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
                        help="Images are converted to their alt-text when the images are "
                        "located inside headlines or table cells. If some inline images "
                        "should be converted to markdown images instead, this option can "
@@ -70,12 +68,6 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
-    parser.add_argument('-p', '--beautiful-soup-parser',
-                        dest='beautiful_soup_parser',
-                        default='html.parser',
-                        help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
-                             "as html5lib, lxml or even a custom parser as long as it is installed on the execution "
-                             "environment.")

    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

 [project]
 name = "markdownify"
-version = "1.1.0"
+version = "1.0.0"
 authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
 description = "Convert HTML to markdown."
 readme = "README.rst"
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -32,9 +32,3 @@ def test_strip_document():
    assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
    assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
-
-
-def bs4_options():
-    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
-    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
-    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -79,8 +79,6 @@ def test_blockquote_nested():
 def test_br():
    assert md('a<br />b<br />c') == 'a  \nb  \nc'
    assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
-    assert md('<h1>foo<br />bar</h1>', heading_style=ATX) == '\n\n# foo bar\n\n'
-    assert md('<td>foo<br />bar</td>', heading_style=ATX) == ' foo bar |'


 def test_code():
@@ -104,13 +102,13 @@ def test_code():


 def test_dl():
-    assert md('<dl><dt>term</dt><dd>definition</dd></dl>') == '\n\nterm\n:   definition\n\n'
-    assert md('<dl><dt><p>te</p><p>rm</p></dt><dd>definition</dd></dl>') == '\n\nte rm\n:   definition\n\n'
-    assert md('<dl><dt>term</dt><dd><p>definition-p1</p><p>definition-p2</p></dd></dl>') == '\n\nterm\n:   definition-p1\n\n    definition-p2\n\n'
-    assert md('<dl><dt>term</dt><dd><p>definition 1</p></dd><dd><p>definition 2</p></dd></dl>') == '\n\nterm\n:   definition 1\n:   definition 2\n\n'
-    assert md('<dl><dt>term 1</dt><dd>definition 1</dd><dt>term 2</dt><dd>definition 2</dd></dl>') == '\n\nterm 1\n:   definition 1\n\nterm 2\n:   definition 2\n\n'
-    assert md('<dl><dt>term</dt><dd><blockquote><p>line 1</p><p>line 2</p></blockquote></dd></dl>') == '\n\nterm\n:   > line 1\n    >\n    > line 2\n\n'
-    assert md('<dl><dt>term</dt><dd><ol><li><p>1</p><ul><li>2a</li><li>2b</li></ul></li><li><p>3</p></li></ol></dd></dl>') == '\n\nterm\n:   1. 1\n\n       * 2a\n       * 2b\n    2. 3\n\n'
+    assert md('<dl><dt>term</dt><dd>definition</dd></dl>') == '\nterm\n:   definition\n'
+    assert md('<dl><dt><p>te</p><p>rm</p></dt><dd>definition</dd></dl>') == '\nte rm\n:   definition\n'
+    assert md('<dl><dt>term</dt><dd><p>definition-p1</p><p>definition-p2</p></dd></dl>') == '\nterm\n:   definition-p1\n\n    definition-p2\n'
+    assert md('<dl><dt>term</dt><dd><p>definition 1</p></dd><dd><p>definition 2</p></dd></dl>') == '\nterm\n:   definition 1\n:   definition 2\n'
+    assert md('<dl><dt>term 1</dt><dd>definition 1</dd><dt>term 2</dt><dd>definition 2</dd></dl>') == '\nterm 1\n:   definition 1\nterm 2\n:   definition 2\n'
+    assert md('<dl><dt>term</dt><dd><blockquote><p>line 1</p><p>line 2</p></blockquote></dd></dl>') == '\nterm\n:   > line 1\n    >\n    > line 2\n'
+    assert md('<dl><dt>term</dt><dd><ol><li><p>1</p><ul><li>2a</li><li>2b</li></ul></li><li><p>3</p></li></ol></dd></dl>') == '\nterm\n:   1. 1\n\n       * 2a\n       * 2b\n    2. 3\n'


 def test_del():
@@ -164,8 +162,7 @@ def test_hn():
    assert md('<h5>Hello</h5>') == '\n\n##### Hello\n\n'
    assert md('<h6>Hello</h6>') == '\n\n###### Hello\n\n'
    assert md('<h10>Hello</h10>') == md('<h6>Hello</h6>')
-    assert md('<h0>Hello</h0>') == md('<h1>Hello</h1>')
-    assert md('<hx>Hello</hx>') == md('Hello')
+    assert md('<hn>Hello</hn>') == md('Hello')


 def test_hn_chained():
@@ -246,14 +243,6 @@ def test_img():
    assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)'


-def test_video():
-    assert md('<video src="/path/to/video.mp4" poster="/path/to/img.jpg">text</video>') == '[![text](/path/to/img.jpg)](/path/to/video.mp4)'
-    assert md('<video src="/path/to/video.mp4">text</video>') == '[text](/path/to/video.mp4)'
-    assert md('<video><source src="/path/to/video.mp4"/>text</video>') == '[text](/path/to/video.mp4)'
-    assert md('<video poster="/path/to/img.jpg">text</video>') == '![text](/path/to/img.jpg)'
-    assert md('<video>text</video>') == 'text'
-
-
 def test_kbd():
    inline_tests('kbd', '`')

@@ -305,11 +294,6 @@ def test_pre():
    assert md("<p>foo</p>\n<pre>bar</pre>\n</p>baz</p>", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz"


-def test_q():
-    assert md('foo <q>quote</q> bar') == 'foo "quote" bar'
-    assert md('foo <q cite="https://example.com">quote</q> bar') == 'foo "quote" bar'
-
-
 def test_script():
    assert md('foo <script>var foo=42;</script> bar') == 'foo  bar'

--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -12,15 +12,7 @@ class UnitTestConverter(MarkdownConverter):

    def convert_custom_tag(self, el, text, parent_tags):
        """Ensure conversion function is found for tags with special characters in name"""
-        return "convert_custom_tag(): %s" % text
-
-    def convert_h1(self, el, text, parent_tags):
-        """Ensure explicit heading conversion function is used"""
-        return "convert_h1: %s" % (text)
-
-    def convert_hN(self, n, el, text, parent_tags):
-        """Ensure general heading conversion function is used"""
-        return "convert_hN(%d): %s" % (n, text)
+        return "FUNCTION USED: %s" % text


 def test_custom_conversion_functions():
@@ -31,11 +23,7 @@ def test_custom_conversion_functions():
    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
    assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'

-    assert md("<custom-tag>text</custom-tag>") == "convert_custom_tag(): text"
-
-    assert md("<h1>text</h1>") == "convert_h1: text"
-
-    assert md("<h3>text</h3>") == "convert_hN(3): text"
+    assert md("<custom-tag>text</custom-tag>") == "FUNCTION USED: text"


 def test_soup():
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -267,23 +267,6 @@ table_with_undefined_colspan = """<table>
    </tr>
 </table>"""

-table_with_colspan_missing_head = """<table>
-    <tr>
-        <td colspan="2">Name</td>
-        <td>Age</td>
-    </tr>
-    <tr>
-        <td>Jill</td>
-        <td>Smith</td>
-        <td>50</td>
-    </tr>
-    <tr>
-        <td>Eve</td>
-        <td>Jackson</td>
-        <td>94</td>
-    </tr>
-</table>"""
-

 def test_table():
    assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
@@ -300,7 +283,6 @@ def test_table():
    assert md(table_with_caption) == 'TEXT\n\nCaption\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n'
    assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
-    assert md(table_with_colspan_missing_head) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Name | | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'


 def test_table_infer_header():
@@ -318,4 +300,3 @@ def test_table_infer_header():
    assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
    assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
-    assert md(table_with_colspan_missing_head, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
Author	SHA1	Message	Date
chrispy	47856cd429	Merge branch 'develop'	2025-02-24 16:20:32 -05:00
AlexVonB	8f70e3952f	Merge branch 'develop'	2024-11-24 23:05:17 +01:00
AlexVonB	e935ce819e	Merge branch 'develop'	2024-11-24 12:26:53 +01:00
AlexVonB	b5c724ab33	Merge branch 'develop'	2024-07-14 22:40:15 +02:00
AlexVonB	8c810eb8a8	Merge branch 'develop'	2024-07-14 21:20:04 +02:00
AlexVonB	383847ee86	Merge branch 'develop'	2024-03-26 21:56:09 +01:00
AlexVonB	be3a7f4672	Merge branch 'develop'	2024-03-26 21:52:16 +01:00
AlexVonB	8219d2a673	Merge branch 'develop'	2022-09-02 10:11:08 +02:00
AlexVonB	0c8ac578c9	Merge branch 'develop'	2022-08-31 21:45:38 +02:00
AlexVonB	8f047753ae	Merge branch 'develop'	2022-08-28 22:03:22 +02:00
AlexVonB	194c646a20	Merge branch 'develop'	2022-08-28 21:43:12 +02:00
AlexVonB	2c533339cf	Merge branch 'develop'	2022-04-24 11:01:54 +02:00
AlexVonB	2b8cf444f1	Merge branch 'develop'	2022-04-14 10:25:35 +02:00
AlexVonB	d375116807	Merge branch 'develop'	2022-04-13 20:47:52 +02:00
AlexVonB	eb0330bfc6	Merge branch 'develop'	2022-01-23 11:01:45 +01:00
AlexVonB	28793ac0b3	Merge branch 'develop'	2022-01-18 08:56:33 +01:00
AlexVonB	9231704988	Merge branch 'develop'	2021-12-11 14:44:58 +01:00
AlexVonB	1613c302bc	Merge branch 'develop'	2021-11-17 17:11:01 +01:00
AlexVonB	55c9e84f38	Merge branch 'develop'	2021-09-04 21:50:34 +02:00
AlexVonB	99875683ac	Merge branch 'develop'	2021-08-25 08:53:38 +02:00
AlexVonB	eaeb0603eb	Merge branch 'develop'	2021-07-11 13:21:20 +02:00
AlexVonB	cb73590623	Merge branch 'develop'	2021-07-11 13:14:29 +02:00
AlexVonB	59417ab115	Merge branch 'develop'	2021-05-30 19:10:49 +02:00
AlexVonB	917b01e548	Merge branch 'develop'	2021-05-30 11:20:32 +02:00
AlexVonB	652714859d	Merge branch 'develop'	2021-05-21 14:18:14 +02:00
AlexVonB	ea5b22824b	Merge branch 'develop'	2021-05-18 10:42:27 +02:00
AlexVonB	ec5858e42f	Merge branch 'develop'	2021-05-16 18:41:24 +02:00
AlexVonB	02bb914ef3	Merge branch 'develop'	2021-05-02 13:49:30 +02:00
AlexVonB	21c0d034d0	Merge branch 'develop'	2021-05-02 10:51:00 +02:00
AlexVonB	e3ddc789a2	Merge branch 'develop'	2021-04-22 12:43:27 +02:00
AlexVonB	2d0cd97323	Merge branch 'develop'	2021-04-22 12:13:03 +02:00
AlexVonB	ec185e2e9c	Merge branch 'develop'	2021-02-21 23:09:55 +01:00
AlexVonB	079d1721aa	Merge branch 'develop'	2021-02-21 20:58:34 +01:00
AlexVonB	bf24df3e2e	bump to v0.6.3	2021-01-12 22:43:18 +01:00
AlexVonB	15329588b1	Merge branch 'develop'	2021-01-12 22:42:58 +01:00
AlexVonB	34ad8485fa	bump to v0.6.2	2021-01-12 22:40:03 +01:00
AlexVonB	f0ce934bf8	Merge branch 'develop'	2021-01-12 22:39:47 +01:00
AlexVonB	99cd237f27	Merge branch 'develop'	2021-01-04 10:22:02 +01:00
AlexVonB	2bde8d3e8e	Merge branch 'develop'	2021-01-02 16:49:28 +01:00
AlexVonB	8c9b029756	Merge branch 'develop'	2020-09-01 18:10:07 +02:00
AlexVonB	ae50065872	Merge branch 'develop'	2020-08-18 18:53:10 +02:00