Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
8f70e3952f | ||
|
|
6258f5c38b | ||
|
|
3466061ca9 | ||
|
|
9595618796 | ||
|
|
e935ce819e | ||
|
|
fe8a821a20 | ||
|
|
54c7ca9937 | ||
|
|
19780834af | ||
|
|
9202027e26 | ||
|
|
9bf4ff14b9 | ||
|
|
7ff4d835ae | ||
|
|
c13bdd5c14 | ||
|
|
340aecbe98 | ||
|
|
c2ffe46e85 | ||
|
|
a369e07211 | ||
|
|
4399ee75db | ||
|
|
60d86663d7 |
@@ -128,9 +128,9 @@ escape_underscores
|
||||
Defaults to ``True``.
|
||||
|
||||
escape_misc
|
||||
If set to ``False``, do not escape miscellaneous punctuation characters
|
||||
If set to ``True``, escape miscellaneous punctuation characters
|
||||
that sometimes have Markdown significance in text.
|
||||
Defaults to ``True``.
|
||||
Defaults to ``False``.
|
||||
|
||||
keep_inline_images_in
|
||||
Images are converted to their alt-text when the images are located inside
|
||||
|
||||
@@ -7,7 +7,8 @@ import six
|
||||
convert_heading_re = re.compile(r'convert_h(\d+)')
|
||||
line_beginning_re = re.compile(r'^', re.MULTILINE)
|
||||
whitespace_re = re.compile(r'[\t ]+')
|
||||
all_whitespace_re = re.compile(r'[\s]+')
|
||||
all_whitespace_re = re.compile(r'[\t \r\n]+')
|
||||
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
|
||||
html_heading_re = re.compile(r'h[1-6]')
|
||||
|
||||
|
||||
@@ -66,6 +67,23 @@ def _todict(obj):
|
||||
return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))
|
||||
|
||||
|
||||
def should_remove_whitespace_inside(el):
    """Return whether whitespace immediately inside ``el`` should be removed.

    ``el`` is a parsed node or ``None``. The result is ``False`` when ``el``
    is falsy or has no ``name`` (for example a missing sibling), ``True``
    when ``html_heading_re`` matches the element name, and otherwise ``True``
    only for the paragraph, blockquote, list and table element names listed
    below.
    """
    # A missing node, or a node without a tag name, is never block-level.
    if not el or not el.name:
        return False
    # Headings are recognised by pattern rather than by enumeration.
    if html_heading_re.match(el.name) is not None:
        return True
    return el.name in ('p', 'blockquote',
                       'ol', 'ul', 'li',
                       'table', 'thead', 'tbody', 'tfoot',
                       'tr', 'td', 'th')
|
||||
|
||||
|
||||
def should_remove_whitespace_outside(el):
    """Return whether whitespace immediately outside ``el`` should be removed.

    This is true for every element accepted by
    ``should_remove_whitespace_inside`` and additionally for ``pre``
    elements. ``el`` may be ``None`` (no sibling), in which case the result
    is ``False``.
    """
    # bool() keeps the predicate strictly boolean: the bare ``el and ...``
    # expression would otherwise leak ``None`` (or a falsy ``el``) to callers.
    return bool(should_remove_whitespace_inside(el)
                or (el and el.name == 'pre'))
|
||||
|
||||
|
||||
class MarkdownConverter(object):
|
||||
class DefaultOptions:
|
||||
autolinks = True
|
||||
@@ -76,7 +94,7 @@ class MarkdownConverter(object):
|
||||
default_title = False
|
||||
escape_asterisks = True
|
||||
escape_underscores = True
|
||||
escape_misc = True
|
||||
escape_misc = False
|
||||
heading_style = UNDERLINED
|
||||
keep_inline_images_in = []
|
||||
newline_style = SPACES
|
||||
@@ -119,27 +137,23 @@ class MarkdownConverter(object):
|
||||
if not children_only and (isHeading or isCell):
|
||||
convert_children_as_inline = True
|
||||
|
||||
# Remove whitespace-only textnodes in purely nested nodes
|
||||
def is_nested_node(el):
|
||||
return el and el.name in ['ol', 'ul', 'li',
|
||||
'table', 'thead', 'tbody', 'tfoot',
|
||||
'tr', 'td', 'th']
|
||||
|
||||
if is_nested_node(node):
|
||||
for el in node.children:
|
||||
# Only extract (remove) whitespace-only text node if any of the
|
||||
# conditions is true:
|
||||
# - el is the first element in its parent
|
||||
# - el is the last element in its parent
|
||||
# - el is adjacent to a nested node
|
||||
can_extract = (not el.previous_sibling
|
||||
or not el.next_sibling
|
||||
or is_nested_node(el.previous_sibling)
|
||||
or is_nested_node(el.next_sibling))
|
||||
if (isinstance(el, NavigableString)
|
||||
and six.text_type(el).strip() == ''
|
||||
and can_extract):
|
||||
el.extract()
|
||||
# Remove whitespace-only textnodes just before, after or
|
||||
# inside block-level elements.
|
||||
should_remove_inside = should_remove_whitespace_inside(node)
|
||||
for el in node.children:
|
||||
# Only extract (remove) whitespace-only text node if any of the
|
||||
# conditions is true:
|
||||
# - el is the first element in its parent (block-level)
|
||||
# - el is the last element in its parent (block-level)
|
||||
# - el is adjacent to a block-level node
|
||||
can_extract = (should_remove_inside and (not el.previous_sibling
|
||||
or not el.next_sibling)
|
||||
or should_remove_whitespace_outside(el.previous_sibling)
|
||||
or should_remove_whitespace_outside(el.next_sibling))
|
||||
if (isinstance(el, NavigableString)
|
||||
and six.text_type(el).strip() == ''
|
||||
and can_extract):
|
||||
el.extract()
|
||||
|
||||
# Convert the children first
|
||||
for el in node.children:
|
||||
@@ -148,7 +162,13 @@ class MarkdownConverter(object):
|
||||
elif isinstance(el, NavigableString):
|
||||
text += self.process_text(el)
|
||||
else:
|
||||
text += self.process_tag(el, convert_children_as_inline)
|
||||
text_strip = text.rstrip('\n')
|
||||
newlines_left = len(text) - len(text_strip)
|
||||
next_text = self.process_tag(el, convert_children_as_inline)
|
||||
next_text_strip = next_text.lstrip('\n')
|
||||
newlines_right = len(next_text) - len(next_text_strip)
|
||||
newlines = '\n' * max(newlines_left, newlines_right)
|
||||
text = text_strip + newlines + next_text_strip
|
||||
|
||||
if not children_only:
|
||||
convert_fn = getattr(self, 'convert_%s' % node.name, None)
|
||||
@@ -162,18 +182,26 @@ class MarkdownConverter(object):
|
||||
|
||||
# normalize whitespace if we're not inside a preformatted element
|
||||
if not el.find_parent('pre'):
|
||||
text = whitespace_re.sub(' ', text)
|
||||
if self.options['wrap']:
|
||||
text = all_whitespace_re.sub(' ', text)
|
||||
else:
|
||||
text = newline_whitespace_re.sub('\n', text)
|
||||
text = whitespace_re.sub(' ', text)
|
||||
|
||||
# escape special characters if we're not inside a preformatted or code element
|
||||
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
|
||||
text = self.escape(text)
|
||||
|
||||
# remove trailing whitespace if any of the following conditions is true:
|
||||
# - current text node is the last node in li
|
||||
# - current text node is followed by an embedded list
|
||||
if (el.parent.name == 'li'
|
||||
and (not el.next_sibling
|
||||
or el.next_sibling.name in ['ul', 'ol'])):
|
||||
# remove leading whitespace at the start or just after a
|
||||
# block-level element; remove trailing whitespace at the end
|
||||
# or just before a block-level element.
|
||||
if (should_remove_whitespace_outside(el.previous_sibling)
|
||||
or (should_remove_whitespace_inside(el.parent)
|
||||
and not el.previous_sibling)):
|
||||
text = text.lstrip()
|
||||
if (should_remove_whitespace_outside(el.next_sibling)
|
||||
or (should_remove_whitespace_inside(el.parent)
|
||||
and not el.next_sibling)):
|
||||
text = text.rstrip()
|
||||
|
||||
return text
|
||||
@@ -185,7 +213,7 @@ class MarkdownConverter(object):
|
||||
n = int(m.group(1))
|
||||
|
||||
def convert_tag(el, text, convert_as_inline):
|
||||
return self.convert_hn(n, el, text, convert_as_inline)
|
||||
return self._convert_hn(n, el, text, convert_as_inline)
|
||||
|
||||
convert_tag.__name__ = 'convert_h%s' % n
|
||||
setattr(self, convert_tag.__name__, convert_tag)
|
||||
@@ -208,20 +236,32 @@ class MarkdownConverter(object):
|
||||
if not text:
|
||||
return ''
|
||||
if self.options['escape_misc']:
|
||||
text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
|
||||
text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
|
||||
text = re.sub(r'([\\&<`[>~=+|])', r'\\\1', text)
|
||||
# A sequence of one or more consecutive '-', preceded and
|
||||
# followed by whitespace or start/end of fragment, might
|
||||
# be confused with an underline of a header, or with a
|
||||
# list marker.
|
||||
text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text)
|
||||
# A sequence of up to six consecutive '#', preceded and
|
||||
# followed by whitespace or start/end of fragment, might
|
||||
# be confused with an ATX heading.
|
||||
text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text)
|
||||
# '.' or ')' preceded by up to nine digits might be
|
||||
# confused with a list item.
|
||||
text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2',
|
||||
text)
|
||||
if self.options['escape_asterisks']:
|
||||
text = text.replace('*', r'\*')
|
||||
if self.options['escape_underscores']:
|
||||
text = text.replace('_', r'\_')
|
||||
return text
|
||||
|
||||
def indent(self, text, level):
|
||||
return line_beginning_re.sub('\t' * level, text) if text else ''
|
||||
def indent(self, text, columns):
|
||||
return line_beginning_re.sub(' ' * columns, text) if text else ''
|
||||
|
||||
def underline(self, text, pad_char):
|
||||
text = (text or '').rstrip()
|
||||
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
|
||||
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
|
||||
|
||||
def convert_a(self, el, text, convert_as_inline):
|
||||
prefix, suffix, text = chomp(text)
|
||||
@@ -246,7 +286,7 @@ class MarkdownConverter(object):
|
||||
def convert_blockquote(self, el, text, convert_as_inline):
|
||||
|
||||
if convert_as_inline:
|
||||
return text
|
||||
return ' ' + text.strip() + ' '
|
||||
|
||||
return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
|
||||
|
||||
@@ -271,19 +311,24 @@ class MarkdownConverter(object):
|
||||
|
||||
convert_kbd = convert_code
|
||||
|
||||
def convert_hn(self, n, el, text, convert_as_inline):
|
||||
def _convert_hn(self, n, el, text, convert_as_inline):
|
||||
""" Method name prefixed with _ to prevent <hn> from calling this """
|
||||
if convert_as_inline:
|
||||
return text
|
||||
|
||||
# prevent MemoryErrors in case of very large n
|
||||
n = max(1, min(6, n))
|
||||
|
||||
style = self.options['heading_style'].lower()
|
||||
text = text.strip()
|
||||
if style == UNDERLINED and n <= 2:
|
||||
line = '=' if n == 1 else '-'
|
||||
return self.underline(text, line)
|
||||
text = all_whitespace_re.sub(' ', text)
|
||||
hashes = '#' * n
|
||||
if style == ATX_CLOSED:
|
||||
return '%s %s %s\n\n' % (hashes, text, hashes)
|
||||
return '%s %s\n\n' % (hashes, text)
|
||||
return '\n%s %s %s\n\n' % (hashes, text, hashes)
|
||||
return '\n%s %s\n\n' % (hashes, text)
|
||||
|
||||
def convert_hr(self, el, text, convert_as_inline):
|
||||
return '\n\n---\n\n'
|
||||
@@ -317,8 +362,8 @@ class MarkdownConverter(object):
|
||||
el = el.parent
|
||||
if nested:
|
||||
# remove trailing newline if nested
|
||||
return '\n' + self.indent(text, 1).rstrip()
|
||||
return text + ('\n' if before_paragraph else '')
|
||||
return '\n' + text.rstrip()
|
||||
return '\n\n' + text + ('\n' if before_paragraph else '')
|
||||
|
||||
convert_ul = convert_list
|
||||
convert_ol = convert_list
|
||||
@@ -339,17 +384,33 @@ class MarkdownConverter(object):
|
||||
el = el.parent
|
||||
bullets = self.options['bullets']
|
||||
bullet = bullets[depth % len(bullets)]
|
||||
return '%s %s\n' % (bullet, (text or '').strip())
|
||||
bullet = bullet + ' '
|
||||
text = (text or '').strip()
|
||||
text = self.indent(text, len(bullet))
|
||||
if text:
|
||||
text = bullet + text[len(bullet):]
|
||||
return '%s\n' % text
|
||||
|
||||
def convert_p(self, el, text, convert_as_inline):
|
||||
if convert_as_inline:
|
||||
return text
|
||||
return ' ' + text.strip() + ' '
|
||||
if self.options['wrap']:
|
||||
text = fill(text,
|
||||
width=self.options['wrap_width'],
|
||||
break_long_words=False,
|
||||
break_on_hyphens=False)
|
||||
return '%s\n\n' % text if text else ''
|
||||
# Preserve newlines (and preceding whitespace) resulting
|
||||
# from <br> tags. Newlines in the input have already been
|
||||
# replaced by spaces.
|
||||
lines = text.split('\n')
|
||||
new_lines = []
|
||||
for line in lines:
|
||||
line = line.lstrip()
|
||||
line_no_trailing = line.rstrip()
|
||||
trailing = line[len(line_no_trailing):]
|
||||
line = fill(line,
|
||||
width=self.options['wrap_width'],
|
||||
break_long_words=False,
|
||||
break_on_hyphens=False)
|
||||
new_lines.append(line + trailing)
|
||||
text = '\n'.join(new_lines)
|
||||
return '\n\n%s\n\n' % text if text else ''
|
||||
|
||||
def convert_pre(self, el, text, convert_as_inline):
|
||||
if not text:
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "markdownify"
|
||||
version = "0.13.1"
|
||||
version = "0.14.1"
|
||||
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
|
||||
description = "Convert HTML to markdown."
|
||||
readme = "README.rst"
|
||||
|
||||
@@ -14,7 +14,7 @@ def test_chomp():
|
||||
|
||||
def test_nested():
|
||||
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
|
||||
assert text == 'This is an [example link](http://example.com/).\n\n'
|
||||
assert text == '\n\nThis is an [example link](http://example.com/).\n\n'
|
||||
|
||||
|
||||
def test_ignore_comments():
|
||||
|
||||
@@ -11,3 +11,4 @@ def test_soup():
|
||||
|
||||
def test_whitespace():
|
||||
assert md(' a b \t\t c ') == ' a b c '
|
||||
assert md(' a b \n\n c ') == ' a b\nc '
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
|
||||
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
|
||||
|
||||
|
||||
def inline_tests(tag, markup):
|
||||
@@ -66,7 +66,7 @@ def test_blockquote_with_paragraph():
|
||||
|
||||
def test_blockquote_nested():
|
||||
text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
|
||||
assert text == '\n> And she was like \n> > Hello\n\n'
|
||||
assert text == '\n> And she was like\n> > Hello\n\n'
|
||||
|
||||
|
||||
def test_br():
|
||||
@@ -112,36 +112,41 @@ def test_em():
|
||||
|
||||
|
||||
def test_header_with_space():
|
||||
assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
|
||||
assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
|
||||
assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello \n\n</h5>') == '##### Hello\n\n'
|
||||
assert md('<h3>\n\nHello</h3>') == '\n### Hello\n\n'
|
||||
assert md('<h3>Hello\n\n\nWorld</h3>') == '\n### Hello World\n\n'
|
||||
assert md('<h4>\n\nHello</h4>') == '\n#### Hello\n\n'
|
||||
assert md('<h5>\n\nHello</h5>') == '\n##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello\n\n</h5>') == '\n##### Hello\n\n'
|
||||
assert md('<h5>\n\nHello \n\n</h5>') == '\n##### Hello\n\n'
|
||||
|
||||
|
||||
def test_h1():
|
||||
assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
|
||||
assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'
|
||||
|
||||
|
||||
def test_h2():
|
||||
assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
|
||||
assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'
|
||||
|
||||
|
||||
def test_hn():
|
||||
assert md('<h3>Hello</h3>') == '### Hello\n\n'
|
||||
assert md('<h4>Hello</h4>') == '#### Hello\n\n'
|
||||
assert md('<h5>Hello</h5>') == '##### Hello\n\n'
|
||||
assert md('<h6>Hello</h6>') == '###### Hello\n\n'
|
||||
assert md('<h3>Hello</h3>') == '\n### Hello\n\n'
|
||||
assert md('<h4>Hello</h4>') == '\n#### Hello\n\n'
|
||||
assert md('<h5>Hello</h5>') == '\n##### Hello\n\n'
|
||||
assert md('<h6>Hello</h6>') == '\n###### Hello\n\n'
|
||||
assert md('<h10>Hello</h10>') == md('<h6>Hello</h6>')
|
||||
assert md('<hn>Hello</hn>') == md('Hello')
|
||||
|
||||
|
||||
def test_hn_chained():
|
||||
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
|
||||
assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
|
||||
assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n# First\n\n## Second\n\n### Third\n\n'
|
||||
assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n# First\n\n'
|
||||
assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n# First #\n\n'
|
||||
assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'
|
||||
|
||||
|
||||
def test_hn_nested_tag_heading_style():
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n# A P C #\n\n'
|
||||
assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n# A P C\n\n'
|
||||
|
||||
|
||||
def test_hn_nested_simple_tag():
|
||||
@@ -157,12 +162,12 @@ def test_hn_nested_simple_tag():
|
||||
]
|
||||
|
||||
for tag, markdown in tag_to_markdown:
|
||||
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
|
||||
assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n### A ' + markdown + ' B\n\n'
|
||||
|
||||
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
|
||||
assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n### A B\n\n'
|
||||
|
||||
# Nested lists not supported
|
||||
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
|
||||
# assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'
|
||||
|
||||
|
||||
def test_hn_nested_img():
|
||||
@@ -172,18 +177,18 @@ def test_hn_nested_img():
|
||||
("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
|
||||
]
|
||||
for image_attributes, markdown, title in image_attributes_to_markdown:
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A  B\n\n'
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n'
|
||||
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n### A  B\n\n'
|
||||
|
||||
|
||||
def test_hn_atx_headings():
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX) == '\n# Hello\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX) == '\n## Hello\n\n'
|
||||
|
||||
|
||||
def test_hn_atx_closed_headings():
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
|
||||
assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n# Hello #\n\n'
|
||||
assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n## Hello ##\n\n'
|
||||
|
||||
|
||||
def test_head():
|
||||
@@ -193,7 +198,7 @@ def test_head():
|
||||
def test_hr():
|
||||
assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
|
||||
assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
|
||||
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
|
||||
assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n---\n\nWorld\n\n'
|
||||
|
||||
|
||||
def test_i():
|
||||
@@ -210,12 +215,23 @@ def test_kbd():
|
||||
|
||||
|
||||
def test_p():
|
||||
assert md('<p>hello</p>') == 'hello\n\n'
|
||||
assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
|
||||
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
|
||||
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
|
||||
assert md('<p>hello</p>') == '\n\nhello\n\n'
|
||||
assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
|
||||
assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
|
||||
assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
|
||||
assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
|
||||
assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345 \n67890\n\n'
|
||||
assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345 \n67890\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n'
|
||||
assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901 \n12345\n\n'
|
||||
assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n'
|
||||
assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012 \n67890\n\n'
|
||||
assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'
|
||||
|
||||
|
||||
def test_pre():
|
||||
@@ -289,3 +305,13 @@ def test_lang_callback():
|
||||
assert md('<pre class="python">test\n foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n'
|
||||
assert md('<pre class="javascript"><code>test\n foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'
|
||||
assert md('<pre class="javascript"><code class="javascript">test\n foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'
|
||||
|
||||
|
||||
def test_spaces():
|
||||
assert md('<p> a b </p> <p> c d </p>') == '\n\na b\n\nc d\n\n'
|
||||
assert md('<p> <i>a</i> </p>') == '\n\n*a*\n\n'
|
||||
assert md('test <p> again </p>') == 'test\n\nagain\n\n'
|
||||
assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
|
||||
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
|
||||
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
|
||||
assert md('test <pre> foo </pre> bar') == 'test\n```\n foo \n```\nbar'
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import warnings
|
||||
from bs4 import MarkupResemblesLocatorWarning
|
||||
from markdownify import markdownify as md
|
||||
|
||||
|
||||
@@ -12,7 +14,7 @@ def test_underscore():
|
||||
|
||||
|
||||
def test_xml_entities():
|
||||
assert md('&') == r'\&'
|
||||
assert md('&', escape_misc=True) == r'\&'
|
||||
|
||||
|
||||
def test_named_entities():
|
||||
@@ -25,23 +27,49 @@ def test_hexadecimal_entities():
|
||||
|
||||
|
||||
def test_single_escaping_entities():
|
||||
assert md('&amp;') == r'\&'
|
||||
assert md('&amp;', escape_misc=True) == r'\&'
|
||||
|
||||
|
||||
def text_misc():
|
||||
assert md('\\*') == r'\\\*'
|
||||
assert md('<foo>') == r'\<foo\>'
|
||||
assert md('# foo') == r'\# foo'
|
||||
assert md('> foo') == r'\> foo'
|
||||
assert md('~~foo~~') == r'\~\~foo\~\~'
|
||||
assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
|
||||
assert md('---\n') == '\\-\\-\\-\n'
|
||||
assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
|
||||
assert md('`x`') == r'\`x\`'
|
||||
assert md('[text](link)') == r'\[text](link)'
|
||||
assert md('1. x') == r'1\. x'
|
||||
assert md('not a number. x') == r'not a number. x'
|
||||
assert md('1) x') == r'1\) x'
|
||||
assert md('not a number) x') == r'not a number) x'
|
||||
assert md('|not table|') == r'\|not table\|'
|
||||
assert md(r'\ <foo> &amp; | ` `', escape_misc=False) == r'\ <foo> & | ` `'
|
||||
def test_misc():
|
||||
# ignore the bs4 warning that "1.2" or "*" looks like a filename
|
||||
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
|
||||
|
||||
assert md('\\*', escape_misc=True) == r'\\\*'
|
||||
assert md('<foo>', escape_misc=True) == r'\<foo\>'
|
||||
assert md('# foo', escape_misc=True) == r'\# foo'
|
||||
assert md('#5', escape_misc=True) == r'#5'
|
||||
assert md('5#', escape_misc=True) == '5#'
|
||||
assert md('####### foo', escape_misc=True) == r'####### foo'
|
||||
assert md('> foo', escape_misc=True) == r'\> foo'
|
||||
assert md('~~foo~~', escape_misc=True) == r'\~\~foo\~\~'
|
||||
assert md('foo\n===\n', escape_misc=True) == 'foo\n\\=\\=\\=\n'
|
||||
assert md('---\n', escape_misc=True) == '\\---\n'
|
||||
assert md('- test', escape_misc=True) == r'\- test'
|
||||
assert md('x - y', escape_misc=True) == r'x \- y'
|
||||
assert md('test-case', escape_misc=True) == 'test-case'
|
||||
assert md('x-', escape_misc=True) == 'x-'
|
||||
assert md('-y', escape_misc=True) == '-y'
|
||||
assert md('+ x\n+ y\n', escape_misc=True) == '\\+ x\n\\+ y\n'
|
||||
assert md('`x`', escape_misc=True) == r'\`x\`'
|
||||
assert md('[text](link)', escape_misc=True) == r'\[text](link)'
|
||||
assert md('1. x', escape_misc=True) == r'1\. x'
|
||||
# assert md('1<span>.</span> x', escape_misc=True) == r'1\. x'
|
||||
assert md('<span>1.</span> x', escape_misc=True) == r'1\. x'
|
||||
assert md(' 1. x', escape_misc=True) == r' 1\. x'
|
||||
assert md('123456789. x', escape_misc=True) == r'123456789\. x'
|
||||
assert md('1234567890. x', escape_misc=True) == r'1234567890. x'
|
||||
assert md('A1. x', escape_misc=True) == r'A1. x'
|
||||
assert md('1.2', escape_misc=True) == r'1.2'
|
||||
assert md('not a number. x', escape_misc=True) == r'not a number. x'
|
||||
assert md('1) x', escape_misc=True) == r'1\) x'
|
||||
# assert md('1<span>)</span> x', escape_misc=True) == r'1\) x'
|
||||
assert md('<span>1)</span> x', escape_misc=True) == r'1\) x'
|
||||
assert md(' 1) x', escape_misc=True) == r' 1\) x'
|
||||
assert md('123456789) x', escape_misc=True) == r'123456789\) x'
|
||||
assert md('1234567890) x', escape_misc=True) == r'1234567890) x'
|
||||
assert md('(1) x', escape_misc=True) == r'(1) x'
|
||||
assert md('A1) x', escape_misc=True) == r'A1) x'
|
||||
assert md('1)x', escape_misc=True) == r'1)x'
|
||||
assert md('not a number) x', escape_misc=True) == r'not a number) x'
|
||||
assert md('|not table|', escape_misc=True) == r'\|not table\|'
|
||||
assert md(r'\ <foo> &amp; | ` `', escape_misc=False) == r'\ <foo> & | ` `'
|
||||
|
||||
@@ -41,19 +41,21 @@ nested_ols = """
|
||||
|
||||
|
||||
def test_ol():
|
||||
assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
|
||||
assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
|
||||
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '\n\n3. a\n4. b\n'
|
||||
assert md('foo<ol start="3"><li>a</li><li>b</li></ol>bar') == 'foo\n\n3. a\n4. b\n\nbar'
|
||||
assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
|
||||
assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
|
||||
assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
|
||||
assert md('<ol start="1234"><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ol>') == '\n\n1234. first para\n \n second para\n1235. third para\n \n fourth para\n'
|
||||
|
||||
|
||||
def test_nested_ols():
|
||||
assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
|
||||
assert md(nested_ols) == '\n\n1. 1\n 1. a\n 1. I\n 2. II\n 3. III\n 2. b\n 3. c\n2. 2\n3. 3\n'
|
||||
|
||||
|
||||
def test_ul():
|
||||
assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
|
||||
assert md('<ul><li>a</li><li>b</li></ul>') == '\n\n* a\n* b\n'
|
||||
assert md("""<ul>
|
||||
<li>
|
||||
a
|
||||
@@ -61,11 +63,13 @@ def test_ul():
|
||||
<li> b </li>
|
||||
<li> c
|
||||
</li>
|
||||
</ul>""") == '* a\n* b\n* c\n'
|
||||
</ul>""") == '\n\n* a\n* b\n* c\n'
|
||||
assert md('<ul><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ul>') == '\n\n* first para\n \n second para\n* third para\n \n fourth para\n'
|
||||
|
||||
|
||||
def test_inline_ul():
|
||||
assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n* a\n* b\n\nbar\n\n'
|
||||
assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n'
|
||||
assert md('foo<ul><li>bar</li></ul>baz') == 'foo\n\n* bar\n\nbaz'
|
||||
|
||||
|
||||
def test_nested_uls():
|
||||
@@ -73,12 +77,12 @@ def test_nested_uls():
|
||||
Nested ULs should alternate bullet characters.
|
||||
|
||||
"""
|
||||
assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
|
||||
assert md(nested_uls) == '\n\n* 1\n + a\n - I\n - II\n - III\n + b\n + c\n* 2\n* 3\n'
|
||||
|
||||
|
||||
def test_bullets():
|
||||
assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
|
||||
assert md(nested_uls, bullets='-') == '\n\n- 1\n - a\n - I\n - II\n - III\n - b\n - c\n- 2\n- 3\n'
|
||||
|
||||
|
||||
def test_li_text():
|
||||
assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar </li><li>foo <b>bar</b> <i>space</i>.</ul>') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
|
||||
assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar </li><li>foo <b>bar</b> <i>space</i>.</ul>') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
|
||||
|
||||
@@ -242,7 +242,7 @@ def test_table():
|
||||
assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
|
||||
assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
|
||||
assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
|
||||
Reference in New Issue
Block a user