Compare commits
41 Commits
chrispy/su
...
1.0.0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
47856cd429 | ||
|
|
8f70e3952f | ||
|
|
e935ce819e | ||
|
|
b5c724ab33 | ||
|
|
8c810eb8a8 | ||
|
|
383847ee86 | ||
|
|
be3a7f4672 | ||
|
|
8219d2a673 | ||
|
|
0c8ac578c9 | ||
|
|
8f047753ae | ||
|
|
194c646a20 | ||
|
|
2c533339cf | ||
|
|
2b8cf444f1 | ||
|
|
d375116807 | ||
|
|
eb0330bfc6 | ||
|
|
28793ac0b3 | ||
|
|
9231704988 | ||
|
|
1613c302bc | ||
|
|
55c9e84f38 | ||
|
|
99875683ac | ||
|
|
eaeb0603eb | ||
|
|
cb73590623 | ||
|
|
59417ab115 | ||
|
|
917b01e548 | ||
|
|
652714859d | ||
|
|
ea5b22824b | ||
|
|
ec5858e42f | ||
|
|
02bb914ef3 | ||
|
|
21c0d034d0 | ||
|
|
e3ddc789a2 | ||
|
|
2d0cd97323 | ||
|
|
ec185e2e9c | ||
|
|
079d1721aa | ||
|
|
bf24df3e2e | ||
|
|
15329588b1 | ||
|
|
34ad8485fa | ||
|
|
f0ce934bf8 | ||
|
|
99cd237f27 | ||
|
|
2bde8d3e8e | ||
|
|
8c9b029756 | ||
|
|
ae50065872 |
11
README.rst
11
README.rst
@@ -157,17 +157,6 @@ strip_document
|
||||
within the document are unaffected.
|
||||
Defaults to ``STRIP``.
|
||||
|
||||
bs4_options
|
||||
Specify additional configuration options for the ``BeautifulSoup`` object
|
||||
used to interpret the HTML markup. String and list values (such as ``lxml``)
|
||||
are treated as ``features`` parameter arguments to control parser
|
||||
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
|
||||
are treated as full kwargs to be used for the BeautifulSoup constructor,
|
||||
allowing specification of any parameter. For parameter details, see the
|
||||
`BeautifulSoup`_ documentation.
|
||||
|
||||
.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/
|
||||
|
||||
Options may be specified as kwargs to the ``markdownify`` function, or as a
|
||||
nested ``Options`` class in ``MarkdownConverter`` subclasses.
|
||||
|
||||
|
||||
@@ -106,7 +106,6 @@ def should_remove_whitespace_inside(el):
|
||||
return el.name in ('p', 'blockquote',
|
||||
'article', 'div', 'section',
|
||||
'ol', 'ul', 'li',
|
||||
'dl', 'dt', 'dd',
|
||||
'table', 'thead', 'tbody', 'tfoot',
|
||||
'tr', 'td', 'th')
|
||||
|
||||
@@ -154,7 +153,6 @@ def _next_block_content_sibling(el):
|
||||
class MarkdownConverter(object):
|
||||
class DefaultOptions:
|
||||
autolinks = True
|
||||
bs4_options = 'html.parser'
|
||||
bullets = '*+-' # An iterable of bullet types.
|
||||
code_language = ''
|
||||
code_language_callback = None
|
||||
@@ -188,15 +186,11 @@ class MarkdownConverter(object):
|
||||
raise ValueError('You may specify either tags to strip or tags to'
|
||||
' convert, but not both.')
|
||||
|
||||
# If a string or list is passed to bs4_options, assume it is a 'features' specification
|
||||
if not isinstance(self.options['bs4_options'], dict):
|
||||
self.options['bs4_options'] = {'features': self.options['bs4_options']}
|
||||
|
||||
# Initialize the conversion function cache
|
||||
self.convert_fn_cache = {}
|
||||
|
||||
def convert(self, html):
|
||||
soup = BeautifulSoup(html, **self.options['bs4_options'])
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
return self.convert_soup(soup)
|
||||
|
||||
def convert_soup(self, soup):
|
||||
@@ -367,20 +361,16 @@ class MarkdownConverter(object):
|
||||
if not self.should_convert_tag(tag_name):
|
||||
return None
|
||||
|
||||
# Look for an explicitly defined conversion function by tag name first
|
||||
convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
|
||||
convert_fn = getattr(self, convert_fn_name, None)
|
||||
if convert_fn:
|
||||
return convert_fn
|
||||
|
||||
# If tag is any heading, handle with convert_hN() function
|
||||
# Handle headings with _convert_hn() function
|
||||
match = re_html_heading.match(tag_name)
|
||||
if match:
|
||||
n = int(match.group(1)) # get value of N from <hN>
|
||||
return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
|
||||
n = int(match.group(1))
|
||||
return lambda el, text, parent_tags: self._convert_hn(n, el, text, parent_tags)
|
||||
|
||||
# No conversion function was found
|
||||
return None
|
||||
# For other tags, look up their conversion function by tag name
|
||||
convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub('_', tag_name)
|
||||
convert_fn = getattr(self, convert_fn_name, None)
|
||||
return convert_fn
|
||||
|
||||
def should_convert_tag(self, tag):
|
||||
"""Given a tag name, return whether to convert based on strip/convert options."""
|
||||
@@ -452,7 +442,7 @@ class MarkdownConverter(object):
|
||||
|
||||
def convert_br(self, el, text, parent_tags):
|
||||
if '_inline' in parent_tags:
|
||||
return ' '
|
||||
return ""
|
||||
|
||||
if self.options['newline_style'].lower() == BACKSLASH:
|
||||
return '\\\n'
|
||||
@@ -499,11 +489,6 @@ class MarkdownConverter(object):
|
||||
|
||||
return '%s\n' % text
|
||||
|
||||
# definition lists are formatted as follows:
|
||||
# https://pandoc.org/MANUAL.html#definition-lists
|
||||
# https://michelf.ca/projects/php-markdown/extra/#def-list
|
||||
convert_dl = convert_div
|
||||
|
||||
def convert_dt(self, el, text, parent_tags):
|
||||
# remove newlines from term text
|
||||
text = (text or '').strip()
|
||||
@@ -516,14 +501,14 @@ class MarkdownConverter(object):
|
||||
# TODO - format consecutive <dt> elements as directly adjacent lines:
|
||||
# https://michelf.ca/projects/php-markdown/extra/#def-list
|
||||
|
||||
return '\n\n%s\n' % text
|
||||
return '\n%s\n' % text
|
||||
|
||||
def convert_hN(self, n, el, text, parent_tags):
|
||||
# convert_hN() converts <hN> tags, where N is any integer
|
||||
def _convert_hn(self, n, el, text, parent_tags):
|
||||
"""Method name is prefixed with _ so that an <hn> tag cannot be dispatched to it."""
|
||||
if '_inline' in parent_tags:
|
||||
return text
|
||||
|
||||
# Markdown does not support heading depths of n > 6
|
||||
# prevent MemoryErrors in case of very large n
|
||||
n = max(1, min(6, n))
|
||||
|
||||
style = self.options['heading_style'].lower()
|
||||
@@ -553,24 +538,6 @@ class MarkdownConverter(object):
|
||||
|
||||
return '![%s](%s%s)' % (alt, src, title_part)
|
||||
|
||||
def convert_video(self, el, text, parent_tags):
|
||||
if ('_inline' in parent_tags
|
||||
and el.parent.name not in self.options['keep_inline_images_in']):
|
||||
return text
|
||||
src = el.attrs.get('src', None) or ''
|
||||
if not src:
|
||||
sources = el.find_all('source', attrs={'src': True})
|
||||
if sources:
|
||||
src = sources[0].attrs.get('src', None) or ''
|
||||
poster = el.attrs.get('poster', None) or ''
|
||||
if src and poster:
|
||||
return '[![%s](%s)](%s)' % (text, poster, src)
|
||||
if src:
|
||||
return '[%s](%s)' % (text, src)
|
||||
if poster:
|
||||
return '![%s](%s)' % (text, poster)
|
||||
return text
|
||||
|
||||
def convert_list(self, el, text, parent_tags):
|
||||
|
||||
# Converting a list to inline is undefined.
|
||||
@@ -658,9 +625,6 @@ class MarkdownConverter(object):
|
||||
|
||||
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
|
||||
|
||||
def convert_q(self, el, text, parent_tags):
|
||||
return '"' + text + '"'
|
||||
|
||||
def convert_script(self, el, text, parent_tags):
|
||||
return ''
|
||||
|
||||
@@ -713,12 +677,6 @@ class MarkdownConverter(object):
|
||||
)
|
||||
overline = ''
|
||||
underline = ''
|
||||
full_colspan = 0
|
||||
for cell in cells:
|
||||
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
|
||||
full_colspan += int(cell["colspan"])
|
||||
else:
|
||||
full_colspan += 1
|
||||
if ((is_headrow
|
||||
or (is_head_row_missing
|
||||
and self.options['table_infer_header']))
|
||||
@@ -727,6 +685,12 @@ class MarkdownConverter(object):
|
||||
# - is headline or
|
||||
# - headline is missing and header inference is enabled
|
||||
# print headline underline
|
||||
full_colspan = 0
|
||||
for cell in cells:
|
||||
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
|
||||
full_colspan += int(cell["colspan"])
|
||||
else:
|
||||
full_colspan += 1
|
||||
underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
|
||||
elif ((is_head_row_missing
|
||||
and not self.options['table_infer_header'])
|
||||
@@ -739,8 +703,8 @@ class MarkdownConverter(object):
|
||||
# - the parent is table or
|
||||
# - the parent is tbody at the beginning of a table.
|
||||
# print empty headline above this row
|
||||
overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'
|
||||
overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
|
||||
overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
|
||||
overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
|
||||
return overline + '|' + text + '\n' + underline
|
||||
|
||||
|
||||
|
||||
@@ -55,9 +55,7 @@ def main(argv=sys.argv[1:]):
|
||||
parser.add_argument('--no-escape-underscores', dest='escape_underscores',
|
||||
action='store_false',
|
||||
help="Do not escape '_' to '\\_' in text.")
|
||||
parser.add_argument('-i', '--keep-inline-images-in',
|
||||
default=[],
|
||||
nargs='*',
|
||||
parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
|
||||
help="Images are converted to their alt-text when the images are "
|
||||
"located inside headlines or table cells. If some inline images "
|
||||
"should be converted to markdown images instead, this option can "
|
||||
@@ -70,12 +68,6 @@ def main(argv=sys.argv[1:]):
|
||||
parser.add_argument('-w', '--wrap', action='store_true',
|
||||
help="Wrap all text paragraphs at --wrap-width characters.")
|
||||
parser.add_argument('--wrap-width', type=int, default=80)
|
||||
parser.add_argument('-p', '--beautiful-soup-parser',
|
||||
dest='beautiful_soup_parser',
|
||||
default='html.parser',
|
||||
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
|
||||
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
|
||||
"environment.")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
print(markdownify(**vars(args)))
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "markdownify"
|
||||
version = "1.1.0"
|
||||
version = "1.0.0"
|
||||
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
|
||||
description = "Convert HTML to markdown."
|
||||
readme = "README.rst"
|
||||
|
||||
@@ -32,9 +32,3 @@ def test_strip_document():
|
||||
assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
|
||||
assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
|
||||
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
|
||||
|
||||
|
||||
def bs4_options():
|
||||
assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
|
||||
assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
|
||||
assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
|
||||
|
||||
@@ -79,8 +79,6 @@ def test_blockquote_nested():
|
||||
def test_br():
|
||||
assert md('a<br />b<br />c') == 'a \nb \nc'
|
||||
assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
|
||||
assert md('<h1>foo<br />bar</h1>', heading_style=ATX) == '\n\n# foo bar\n\n'
|
||||
assert md('<td>foo<br />bar</td>', heading_style=ATX) == ' foo bar |'
|
||||
|
||||
|
||||
def test_code():
|
||||
@@ -104,13 +102,13 @@ def test_code():
|
||||
|
||||
|
||||
def test_dl():
|
||||
assert md('<dl><dt>term</dt><dd>definition</dd></dl>') == '\n\nterm\n: definition\n\n'
|
||||
assert md('<dl><dt><p>te</p><p>rm</p></dt><dd>definition</dd></dl>') == '\n\nte rm\n: definition\n\n'
|
||||
assert md('<dl><dt>term</dt><dd><p>definition-p1</p><p>definition-p2</p></dd></dl>') == '\n\nterm\n: definition-p1\n\n definition-p2\n\n'
|
||||
assert md('<dl><dt>term</dt><dd><p>definition 1</p></dd><dd><p>definition 2</p></dd></dl>') == '\n\nterm\n: definition 1\n: definition 2\n\n'
|
||||
assert md('<dl><dt>term 1</dt><dd>definition 1</dd><dt>term 2</dt><dd>definition 2</dd></dl>') == '\n\nterm 1\n: definition 1\n\nterm 2\n: definition 2\n\n'
|
||||
assert md('<dl><dt>term</dt><dd><blockquote><p>line 1</p><p>line 2</p></blockquote></dd></dl>') == '\n\nterm\n: > line 1\n >\n > line 2\n\n'
|
||||
assert md('<dl><dt>term</dt><dd><ol><li><p>1</p><ul><li>2a</li><li>2b</li></ul></li><li><p>3</p></li></ol></dd></dl>') == '\n\nterm\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n\n'
|
||||
assert md('<dl><dt>term</dt><dd>definition</dd></dl>') == '\nterm\n: definition\n'
|
||||
assert md('<dl><dt><p>te</p><p>rm</p></dt><dd>definition</dd></dl>') == '\nte rm\n: definition\n'
|
||||
assert md('<dl><dt>term</dt><dd><p>definition-p1</p><p>definition-p2</p></dd></dl>') == '\nterm\n: definition-p1\n\n definition-p2\n'
|
||||
assert md('<dl><dt>term</dt><dd><p>definition 1</p></dd><dd><p>definition 2</p></dd></dl>') == '\nterm\n: definition 1\n: definition 2\n'
|
||||
assert md('<dl><dt>term 1</dt><dd>definition 1</dd><dt>term 2</dt><dd>definition 2</dd></dl>') == '\nterm 1\n: definition 1\nterm 2\n: definition 2\n'
|
||||
assert md('<dl><dt>term</dt><dd><blockquote><p>line 1</p><p>line 2</p></blockquote></dd></dl>') == '\nterm\n: > line 1\n >\n > line 2\n'
|
||||
assert md('<dl><dt>term</dt><dd><ol><li><p>1</p><ul><li>2a</li><li>2b</li></ul></li><li><p>3</p></li></ol></dd></dl>') == '\nterm\n: 1. 1\n\n * 2a\n * 2b\n 2. 3\n'
|
||||
|
||||
|
||||
def test_del():
|
||||
@@ -164,8 +162,7 @@ def test_hn():
|
||||
assert md('<h5>Hello</h5>') == '\n\n##### Hello\n\n'
|
||||
assert md('<h6>Hello</h6>') == '\n\n###### Hello\n\n'
|
||||
assert md('<h10>Hello</h10>') == md('<h6>Hello</h6>')
|
||||
assert md('<h0>Hello</h0>') == md('<h1>Hello</h1>')
|
||||
assert md('<hx>Hello</hx>') == md('Hello')
|
||||
assert md('<hn>Hello</hn>') == md('Hello')
|
||||
|
||||
|
||||
def test_hn_chained():
|
||||
@@ -246,14 +243,6 @@ def test_img():
|
||||
assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)'
|
||||
|
||||
|
||||
def test_video():
|
||||
assert md('<video src="/path/to/video.mp4" poster="/path/to/img.jpg">text</video>') == '[![text](/path/to/img.jpg)](/path/to/video.mp4)'
|
||||
assert md('<video src="/path/to/video.mp4">text</video>') == '[text](/path/to/video.mp4)'
|
||||
assert md('<video><source src="/path/to/video.mp4"/>text</video>') == '[text](/path/to/video.mp4)'
|
||||
assert md('<video poster="/path/to/img.jpg">text</video>') == '![text](/path/to/img.jpg)'
|
||||
assert md('<video>text</video>') == 'text'
|
||||
|
||||
|
||||
def test_kbd():
|
||||
inline_tests('kbd', '`')
|
||||
|
||||
@@ -305,11 +294,6 @@ def test_pre():
|
||||
assert md("<p>foo</p>\n<pre>bar</pre>\n</p>baz</p>", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz"
|
||||
|
||||
|
||||
def test_q():
|
||||
assert md('foo <q>quote</q> bar') == 'foo "quote" bar'
|
||||
assert md('foo <q cite="https://example.com">quote</q> bar') == 'foo "quote" bar'
|
||||
|
||||
|
||||
def test_script():
|
||||
assert md('foo <script>var foo=42;</script> bar') == 'foo bar'
|
||||
|
||||
|
||||
@@ -12,15 +12,7 @@ class UnitTestConverter(MarkdownConverter):
|
||||
|
||||
def convert_custom_tag(self, el, text, parent_tags):
|
||||
"""Ensure conversion function is found for tags with special characters in name"""
|
||||
return "convert_custom_tag(): %s" % text
|
||||
|
||||
def convert_h1(self, el, text, parent_tags):
|
||||
"""Ensure explicit heading conversion function is used"""
|
||||
return "convert_h1: %s" % (text)
|
||||
|
||||
def convert_hN(self, n, el, text, parent_tags):
|
||||
"""Ensure general heading conversion function is used"""
|
||||
return "convert_hN(%d): %s" % (n, text)
|
||||
return "FUNCTION USED: %s" % text
|
||||
|
||||
|
||||
def test_custom_conversion_functions():
|
||||
@@ -31,11 +23,7 @@ def test_custom_conversion_functions():
|
||||
assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
|
||||
assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'
|
||||
|
||||
assert md("<custom-tag>text</custom-tag>") == "convert_custom_tag(): text"
|
||||
|
||||
assert md("<h1>text</h1>") == "convert_h1: text"
|
||||
|
||||
assert md("<h3>text</h3>") == "convert_hN(3): text"
|
||||
assert md("<custom-tag>text</custom-tag>") == "FUNCTION USED: text"
|
||||
|
||||
|
||||
def test_soup():
|
||||
|
||||
@@ -267,23 +267,6 @@ table_with_undefined_colspan = """<table>
|
||||
</tr>
|
||||
</table>"""
|
||||
|
||||
table_with_colspan_missing_head = """<table>
|
||||
<tr>
|
||||
<td colspan="2">Name</td>
|
||||
<td>Age</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jill</td>
|
||||
<td>Smith</td>
|
||||
<td>50</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Eve</td>
|
||||
<td>Jackson</td>
|
||||
<td>94</td>
|
||||
</tr>
|
||||
</table>"""
|
||||
|
||||
|
||||
def test_table():
|
||||
assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
@@ -300,7 +283,6 @@ def test_table():
|
||||
assert md(table_with_caption) == 'TEXT\n\nCaption\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n'
|
||||
assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
|
||||
assert md(table_with_colspan_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Name | | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
|
||||
|
||||
def test_table_infer_header():
|
||||
@@ -318,4 +300,3 @@ def test_table_infer_header():
|
||||
assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
|
||||
assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
|
||||
assert md(table_with_colspan_missing_head, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
|
||||
Reference in New Issue
Block a user