Merge branch 'develop'

bump to v0.11.2
added wrap option
2022-04-24 11:01:54 +02:00 · 2022-04-24 11:01:29 +02:00 · 2022-04-24 11:00:04 +02:00 · 2022-04-24 10:59:22 +02:00 · 2022-04-14 10:25:35 +02:00 · 2022-04-14 10:25:25 +02:00
7 changed files with 146 additions and 23 deletions
--- a/README.rst
+++ b/README.rst
@@ -32,14 +32,14 @@ Convert some HTML to Markdown:
    from markdownify import markdownify as md
    md('<b>Yay</b> <a href="http://github.com">GitHub</a>')  # > '**Yay** [GitHub](http://github.com)'
-Specify tags to exclude (blacklist):
+Specify tags to exclude:
 .. code:: python
    from markdownify import markdownify as md
    md('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a'])  # > '**Yay** GitHub'
-\...or specify the tags you want to include (whitelist):
+\...or specify the tags you want to include:
 .. code:: python
@@ -53,11 +53,11 @@ Options
 Markdownify supports the following options:
 strip
-  A list of tags to strip (blacklist). This option can't be used with the
+  A list of tags to strip. This option can't be used with the
  ``convert`` option.
 convert
-  A list of tags to convert (whitelist). This option can't be used with the
+  A list of tags to convert. This option can't be used with the
  ``strip`` option.
 autolinks
@@ -92,7 +92,7 @@ sub_symbol, sup_symbol
 newline_style
  Defines the style of marking linebreaks (``<br>``) in markdown. The default
  value ``SPACES`` of this option will adopt the usual two spaces and a newline,
-  while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash an a
+  while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash and a
  newline). While the latter convention is non-standard, it is commonly
  preferred and supported by a lot of interpreters.
@@ -102,10 +102,55 @@ code_language
  should be annotated with `````python`` or similar.
  Defaults to ``''`` (empty string) and can be any string.
 code_language_callback
  When the HTML code contains ``pre`` tags that in some way provide the code
  language, for example as class, this callback can be used to extract the
  language from the tag and prefix it to the converted ``pre`` tag.
  The callback gets one single argument, a BeautifulSoup object, and returns
  a string containing the code language, or ``None``.
  An example to use the class name as code language could be::
    def callback(el):
        return el['class'][0] if el.has_attr('class') else None
  Defaults to ``None``.
 escape_asterisks
  If set to ``False``, do not escape ``*`` to ``\*`` in text.
  Defaults to ``True``.
 escape_underscores
  If set to ``False``, do not escape ``_`` to ``\_`` in text.
  Defaults to ``True``.
 keep_inline_images_in
  Images are converted to their alt-text when the images are located inside
  headlines or table cells. If some inline images should be converted to
  markdown images instead, this option can be set to a list of parent tags
  that should be allowed to contain inline images, for example ``['td']``.
  Defaults to an empty list.
 wrap, wrap_width
  If ``wrap`` is set to ``True``, all text paragraphs are wrapped at
  ``wrap_width`` characters. ``wrap`` defaults to ``False`` and
  ``wrap_width`` defaults to ``80``.
  Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
 Converting BeautifulSoup objects
 ================================
 .. code:: python
    from markdownify import MarkdownConverter
    # Create shorthand method for conversion
    def md(soup, **options):
        return MarkdownConverter(**options).convert_soup(soup)
 Creating Custom Converters
 ==========================
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -1,4 +1,5 @@
 from bs4 import BeautifulSoup, NavigableString, Comment, Doctype
 from textwrap import fill
 import re
 import six
@@ -25,12 +26,6 @@ ASTERISK = '*'
 UNDERSCORE = '_'
 def escape(text):
    if not text:
        return ''
    return text.replace('_', r'\_')
 def chomp(text):
    """
    If the text in an inline tag like b, a, or em contains a leading or trailing
@@ -68,15 +63,21 @@ class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
        convert = None
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
        strip = None
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
-        code_language = ''
+        wrap = False
        wrap_width = 80
    class Options(DefaultOptions):
        pass
@@ -93,6 +94,9 @@ class MarkdownConverter(object):
    def convert(self, html):
        soup = BeautifulSoup(html, 'html.parser')
        return self.convert_soup(soup)
    def convert_soup(self, soup):
        return self.process_tag(soup, convert_as_inline=False, children_only=True)
    def process_tag(self, node, convert_as_inline, children_only=False):
@@ -155,7 +159,7 @@ class MarkdownConverter(object):
            text = whitespace_re.sub(' ', text)
        if el.parent.name != 'code':
-            text = escape(text)
+            text = self.escape(text)
        # remove trailing whitespace if any of the following conditions is true:
        # - current text node is the last node in li
@@ -193,6 +197,15 @@ class MarkdownConverter(object):
        else:
            return True
    def escape(self, text):
        if not text:
            return ''
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text
    def indent(self, text, level):
        return line_beginning_re.sub('\t' * level, text) if text else ''
@@ -272,7 +285,8 @@ class MarkdownConverter(object):
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-        if convert_as_inline:
+        if (convert_as_inline
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt
        return '![%s](%s%s)' % (alt, src, title_part)
@@ -320,12 +334,22 @@ class MarkdownConverter(object):
    def convert_p(self, el, text, convert_as_inline):
        if convert_as_inline:
            return text
        if self.options['wrap']:
            text = fill(text,
                        width=self.options['wrap_width'],
                        break_long_words=False,
                        break_on_hyphens=False)
        return '%s\n\n' % text if text else ''
    def convert_pre(self, el, text, convert_as_inline):
        if not text:
            return ''
-        return '\n```%s\n%s\n```\n' % (self.options['code_language'], text)
+        code_language = self.options['code_language']
        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language
        return '\n```%s\n%s\n```\n' % (code_language, text)
    convert_s = convert_del
@@ -354,8 +378,13 @@ class MarkdownConverter(object):
        if is_headrow and not el.previous_sibling:
            # first row and is headline: print headline underline
            underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
-        elif not el.previous_sibling and not el.parent.name != 'table':
+        elif (not el.previous_sibling
-            # first row, not headline, and the parent is sth. like tbody:
+              and (el.parent.name == 'table'
                   or (el.parent.name == 'tbody'
                       and not el.parent.previous_sibling))):
            # first row, not headline, and:
            # - the parent is table or
            # - the parent is tbody at the beginning of a table.
            # print empty headline above this row
            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
 pkgmeta = {
    '__title__': 'markdownify',
    '__author__': 'Matthew Tretter',
-    '__version__': '0.10.1',
+    '__version__': '0.11.2',
 }
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -133,12 +133,13 @@ def test_hn_nested_simple_tag():
 def test_hn_nested_img():
    image_attributes_to_markdown = [
-        ("", ""),
+        ("", "", ""),
-        ("alt='Alt Text'", "Alt Text"),
+        ("alt='Alt Text'", "Alt Text", ""),
-        ("alt='Alt Text' title='Optional title'", "Alt Text"),
+        ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
    ]
-    for image_attributes, markdown in image_attributes_to_markdown:
+    for image_attributes, markdown, title in image_attributes_to_markdown:
-        assert md('<h3>A <img src="/path/to/img.jpg " ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
 def test_hn_atx_headings():
@@ -176,6 +177,11 @@ def test_kbd():
 def test_p():
    assert md('<p>hello</p>') == 'hello\n\n'
    assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
 def test_pre():
@@ -215,3 +221,12 @@ def test_sup():
 def test_lang():
    assert md('<pre>test\n    foo\nbar</pre>', code_language='python') == '\n```python\ntest\n    foo\nbar\n```\n'
    assert md('<pre><code>test\n    foo\nbar</code></pre>', code_language='javascript') == '\n```javascript\ntest\n    foo\nbar\n```\n'
 def test_lang_callback():
    def callback(el):
        return el['class'][0] if el.has_attr('class') else None
    assert md('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n    foo\nbar\n```\n'
    assert md('<pre class="javascript"><code>test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
    assert md('<pre class="javascript"><code class="javascript">test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -1,4 +1,5 @@
 from markdownify import MarkdownConverter
 from bs4 import BeautifulSoup
 class ImageBlockConverter(MarkdownConverter):
@@ -16,3 +17,9 @@ def test_img():
    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />') == '![Alt text](/path/to/img.jpg "Optional title")\n\n'
    assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)\n\n'
 def test_soup():
    html = '<b>test</b>'
    soup = BeautifulSoup(html, 'html.parser')
    assert MarkdownConverter().convert_soup(soup) == '**test**'
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,8 +1,14 @@
 from markdownify import markdownify as md
 def test_asterisks():
    assert md('*hey*dude*') == r'\*hey\*dude\*'
    assert md('*hey*dude*', escape_asterisks=False) == r'*hey*dude*'
 def test_underscore():
    assert md('_hey_dude_') == r'\_hey\_dude\_'
    assert md('_hey_dude_', escape_underscores=False) == r'_hey_dude_'
 def test_xml_entities():
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -139,6 +139,26 @@ table_missing_head = """<table>
    </tr>
 </table>"""
 table_body = """<table>
    <tbody>
        <tr>
            <td>Firstname</td>
            <td>Lastname</td>
            <td>Age</td>
        </tr>
        <tr>
            <td>Jill</td>
            <td>Smith</td>
            <td>50</td>
        </tr>
        <tr>
            <td>Eve</td>
            <td>Jackson</td>
            <td>94</td>
        </tr>
    </tbody>
 </table>"""
 def test_table():
    assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
@@ -148,3 +168,4 @@ def test_table():
    assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_missing_text) == '\n\n|  | Lastname | Age |\n| --- | --- | --- |\n| Jill |  | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_missing_head) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_body) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
Author	SHA1	Message	Date
AlexVonB	2c533339cf	Merge branch 'develop'	2022-04-24 11:01:54 +02:00
AlexVonB	5adda130b8	bump to v0.11.2	2022-04-24 11:01:29 +02:00
AlexVonB	5f1b98e25d	added wrap option closes #66	2022-04-24 11:00:04 +02:00
AlexVonB	16acd2b763	typo in readme	2022-04-24 10:59:22 +02:00
AlexVonB	2b8cf444f1	Merge branch 'develop'	2022-04-14 10:25:35 +02:00
AlexVonB	207d0f4ec6	bump to v0.11.1	2022-04-14 10:25:25 +02:00
Mikko Korpela	ebb9ea713d	Fix detection of "first row, not headline" (#63 ) Improved handling of "first row, not headline". Works for tables with 1) neither thead nor tbody 2) tbody but no thead	2022-04-14 10:24:32 +02:00
AlexVonB	d375116807	Merge branch 'develop'	2022-04-13 20:47:52 +02:00
AlexVonB	87b9f6c88e	bump to v0.11.0	2022-04-13 20:47:30 +02:00
AlexVonB	bda367dad9	Merge branch 'tdgroot-code_language_callback' into develop closes #64	2022-04-13 20:44:18 +02:00
AlexVonB	61e8940486	added readme for callback	2022-04-13 20:42:38 +02:00
AlexVonB	35479d2d3b	Merge branch 'code_language_callback' of https://github.com/tdgroot/python-markdownify into tdgroot-code_language_callback	2022-04-13 20:25:37 +02:00
AlexVonB	b589863715	add escaping of asterisks and option to disable it closes #62	2022-04-13 20:04:12 +02:00
AlexVonB	423b7e948c	add option to allow inline images in selected tags fixes #61	2022-04-13 19:55:34 +02:00
Timon de Groot	0ea95de4d0	Add code language callback	2022-04-09 13:22:28 +02:00
AlexVonB	ed3eee78d2	fixed readme	2022-01-24 18:18:19 +01:00
AlexVonB	eb0330bfc6	Merge branch 'develop'	2022-01-23 11:01:45 +01:00
AlexVonB	ddda696396	bump to v0.10.3	2022-01-23 11:01:26 +01:00
AlexVonB	0a1343a538	allow BeautifulSoup objects to be converted	2022-01-23 11:00:19 +01:00
AlexVonB	9d0b839b73	wording	2022-01-23 10:59:24 +01:00
AlexVonB	28793ac0b3	Merge branch 'develop'	2022-01-18 08:56:33 +01:00
AlexVonB	d3eff11617	bump to v0.10.2	2022-01-18 08:53:33 +01:00
AlexVonB	bd6b581122	add option to not escape underscores closes #59	2022-01-18 08:51:44 +01:00