diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 6c64c60..9a30b34 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -44,6 +44,22 @@ def chomp(text): return (prefix, suffix, text) +def abstract_inline_conversion(markup_fn): + """ + This abstracts all simple inline tags like b, em, del, ... + Returns a function that wraps the chomped text in a pair of the string + that is returned by markup_fn. markup_fn is necessary to allow for + references to self.strong_em_symbol etc. + """ + def implementation(self, el, text, convert_as_inline): + markup = markup_fn(self) + prefix, suffix, text = chomp(text) + if not text: + return '' + return '%s%s%s%s%s' % (prefix, markup, text, markup, suffix) + return implementation + + def _todict(obj): return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_')) @@ -124,12 +140,21 @@ class MarkdownConverter(object): def process_text(self, el): text = six.text_type(el) + + # dont remove any whitespace when handling pre or code in pre + if (el.parent.name == 'pre' + or (el.parent.name == 'code' and el.parent.parent.name == 'pre')): + return escape(text or '') + + cleaned_text = escape(whitespace_re.sub(' ', text or '')) + # remove trailing whitespaces if any of the following condition is true: # - current text node is the last node in li # - current text node is followed by an embedded list if el.parent.name == 'li' and (not el.next_sibling or el.next_sibling.name in ['ul', 'ol']): - return escape(all_whitespace_re.sub(' ', text or '')).rstrip() - return escape(whitespace_re.sub(' ', text or '')) + return cleaned_text.rstrip() + + return cleaned_text def __getattr__(self, attr): # Handle headings @@ -179,8 +204,7 @@ class MarkdownConverter(object): title_part = ' "%s"' % title.replace('"', r'\"') if title else '' return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text - def convert_b(self, el, text, convert_as_inline): - return self.convert_strong(el, text, convert_as_inline) + convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol']) def convert_blockquote(self, el, text, convert_as_inline): @@ -198,12 +222,17 @@ class MarkdownConverter(object): else: return ' \n' - def convert_em(self, el, text, convert_as_inline): - em_tag = self.options['strong_em_symbol'] - prefix, suffix, text = chomp(text) - if not text: - return '' - return '%s%s%s%s%s' % (prefix, em_tag, text, em_tag, suffix) + def convert_code(self, el, text, convert_as_inline): + if el.parent.name == 'pre': + return text + converter = abstract_inline_conversion(lambda self: '`') + return converter(self, el, text, convert_as_inline) + + convert_del = abstract_inline_conversion(lambda self: '~~') + + convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol']) + + convert_kbd = convert_code def convert_hn(self, n, el, text, convert_as_inline): if convert_as_inline: @@ -219,8 +248,20 @@ class MarkdownConverter(object): return '%s %s %s\n\n' % (hashes, text, hashes) return '%s %s\n\n' % (hashes, text) - def convert_i(self, el, text, convert_as_inline): - return self.convert_em(el, text, convert_as_inline) + def convert_hr(self, el, text, convert_as_inline): + return '\n\n---\n\n' + + convert_i = convert_em + + def convert_img(self, el, text, convert_as_inline): + alt = el.attrs.get('alt', None) or '' + src = el.attrs.get('src', None) or '' + title = el.attrs.get('title', None) or '' + title_part = ' "%s"' % title.replace('"', r'\"') if title else '' + if convert_as_inline: + return alt + + return '![%s](%s%s)' % (alt, src, title_part) def convert_list(self, el, text, convert_as_inline): @@ -267,26 +308,26 @@ class MarkdownConverter(object): return text return '%s\n\n' % text if text else '' - def convert_strong(self, el, text, convert_as_inline): - strong_tag = 2 * self.options['strong_em_symbol'] - prefix, suffix, text = chomp(text) + def convert_pre(self, el, text, convert_as_inline): if not text: return '' - return '%s%s%s%s%s' % (prefix, strong_tag, text, strong_tag, suffix) + return '\n```\n%s\n```\n' % text - def convert_img(self, el, text, convert_as_inline): - alt = el.attrs.get('alt', None) or '' - src = el.attrs.get('src', None) or '' - title = el.attrs.get('title', None) or '' - title_part = ' "%s"' % title.replace('"', r'\"') if title else '' - if convert_as_inline: - return alt + convert_s = convert_del - return '![%s](%s%s)' % (alt, src, title_part) + convert_strong = convert_b + + convert_samp = convert_code def convert_table(self, el, text, convert_as_inline): return '\n\n' + text + '\n' + def convert_td(self, el, text, convert_as_inline): + return ' ' + text + ' |' + + def convert_th(self, el, text, convert_as_inline): + return ' ' + text + ' |' + def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) is_headrow = all([cell.name == 'th' for cell in cells]) @@ -302,15 +343,6 @@ class MarkdownConverter(object): overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' return overline + '|' + text + '\n' + underline - def convert_th(self, el, text, convert_as_inline): - return ' ' + text + ' |' - - def convert_td(self, el, text, convert_as_inline): - return ' ' + text + ' |' - - def convert_hr(self, el, text, convert_as_inline): - return '\n\n---\n\n' - def markdownify(html, **options): return MarkdownConverter(**options).convert(html) diff --git a/setup.py b/setup.py index 87bd84e..04dbb80 100644 --- a/setup.py +++ b/setup.py @@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read() pkgmeta = { '__title__': 'markdownify', '__author__': 'Matthew Tretter', - '__version__': '0.7.4', + '__version__': '0.8.0', } diff --git a/tests/test_advanced.py b/tests/test_advanced.py index 7ee61d2..2435cac 100644 --- a/tests/test_advanced.py +++ b/tests/test_advanced.py @@ -14,3 +14,10 @@ def test_ignore_comments(): def test_ignore_comments_with_other_tags(): text = md("example link") assert text == "[example link](http://example.com/)" + + +def test_code_with_tricky_content(): + assert md('>') == "`>`" + assert md('/home/username') == "`/home/`**username**" + assert md('First line blah blah
blah blah
second line') \ + == "First line `blah blah \nblah blah` second line" diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 31fe7f2..354212b 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -240,6 +240,40 @@ def test_em_spaces(): assert md('foo bar') == 'foo bar' +def inline_tests(tag, markup): + # Basically re-use test_em() and test_em_spaces(), + assert md(f'<{tag}>Hello') == f'{markup}Hello{markup}' + assert md(f'foo <{tag}>Hello bar') == f'foo {markup}Hello{markup} bar' + assert md(f'foo<{tag}> Hello bar') == f'foo {markup}Hello{markup} bar' + assert md(f'foo <{tag}>Hello bar') == f'foo {markup}Hello{markup} bar' + assert md(f'foo <{tag}> bar') in ['foo bar', 'foo bar'] # Either is OK + + +def test_code(): + inline_tests('code', '`') + + +def test_samp(): + inline_tests('samp', '`') + + +def test_kbd(): + inline_tests('kbd', '`') + + +def test_pre(): + assert md('
test\n    foo\nbar
') == '\n```\ntest\n foo\nbar\n```\n' + assert md('
test\n    foo\nbar
') == '\n```\ntest\n foo\nbar\n```\n' + + +def test_del(): + inline_tests('del', '~~') + + +def test_s(): + inline_tests('s', '~~') + + def test_h1(): assert md('

Hello

') == 'Hello\n=====\n\n'