diff --git a/README.rst b/README.rst index 06f319b..51888ea 100644 --- a/README.rst +++ b/README.rst @@ -156,7 +156,12 @@ Creating Custom Converters If you have a special usecase that calls for a special conversion, you can always inherit from ``MarkdownConverter`` and override the method you want to -change: +change. +The function that handles a HTML tag named ``abc`` is called +``convert_abc(self, el, text, convert_as_inline)`` and returns a string +containing the converted HTML tag. +The ``MarkdownConverter`` object will handle the conversion based on the +function names: .. code:: python @@ -173,6 +178,21 @@ change: def md(html, **options): return ImageBlockConverter(**options).convert(html) +.. code:: python + + from markdownify import MarkdownConverter + + class IgnoreParagraphsConverter(MarkdownConverter): + """ + Create a custom MarkdownConverter that ignores paragraphs + """ + def convert_p(self, el, text, convert_as_inline): + return '' + + # Create shorthand method for conversion + def md(html, **options): + return IgnoreParagraphsConverter(**options).convert(html) + Command Line Interface ====================== diff --git a/markdownify/__init__.py b/markdownify/__init__.py index e15ecd4..86226d2 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -152,13 +152,12 @@ class MarkdownConverter(object): def process_text(self, el): text = six.text_type(el) or '' - # dont remove any whitespace when handling pre or code in pre - if not (el.parent.name == 'pre' - or (el.parent.name == 'code' - and el.parent.parent.name == 'pre')): + # normalize whitespace if we're not inside a preformatted element + if not el.find_parent('pre'): text = whitespace_re.sub(' ', text) - if el.parent.name != 'code' and el.parent.name != 'pre': + # escape special characters if we're not inside a preformatted or code element + if not el.find_parent(['pre', 'code', 'kbd', 'samp']): text = self.escape(text) # remove trailing whitespaces if any of the following condition is true: @@ -238,7 +237,7 @@ class MarkdownConverter(object): if convert_as_inline: return text - return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else '' + return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else '' def convert_br(self, el, text, convert_as_inline): if convert_as_inline: @@ -266,7 +265,7 @@ class MarkdownConverter(object): return text style = self.options['heading_style'].lower() - text = text.rstrip() + text = text.strip() if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) @@ -351,6 +350,12 @@ class MarkdownConverter(object): return '\n```%s\n%s\n```\n' % (code_language, text) + def convert_script(self, el, text, convert_as_inline): + return '' + + def convert_style(self, el, text, convert_as_inline): + return '' + convert_s = convert_del convert_strong = convert_b @@ -364,20 +369,42 @@ class MarkdownConverter(object): def convert_table(self, el, text, convert_as_inline): return '\n\n' + text + '\n' + def convert_caption(self, el, text, convert_as_inline): + return text + '\n' + + def convert_figcaption(self, el, text, convert_as_inline): + return '\n\n' + text + '\n\n' + def convert_td(self, el, text, convert_as_inline): - return ' ' + text + ' |' + colspan = 1 + if 'colspan' in el.attrs: + colspan = int(el['colspan']) + return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_th(self, el, text, convert_as_inline): - return ' ' + text + ' |' + colspan = 1 + if 'colspan' in el.attrs: + colspan = int(el['colspan']) + return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) - is_headrow = all([cell.name == 'th' for cell in cells]) + is_headrow = ( + all([cell.name == 'th' for cell in cells]) + or (not el.previous_sibling and not el.parent.name == 'tbody') + or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) + ) overline = '' underline = '' if is_headrow and not el.previous_sibling: # first row and is headline: print headline underline - underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' + full_colspan = 0 + for cell in cells: + if "colspan" in cell.attrs: + full_colspan += int(cell["colspan"]) + else: + full_colspan += 1 + underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' elif (not el.previous_sibling and (el.parent.name == 'table' or (el.parent.name == 'tbody' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index da78649..1e685f3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -52,6 +52,12 @@ def test_b_spaces(): def test_blockquote(): assert md('
Hello
') == '\n> Hello\n\n' + assert md('
\nHello\n
') == '\n> Hello\n\n' + + +def test_blockquote_with_nested_paragraph(): + assert md('

Hello

') == '\n> Hello\n\n' + assert md('

Hello

Hello again

') == '\n> Hello\n> \n> Hello again\n\n' def test_blockquote_with_paragraph(): @@ -60,7 +66,7 @@ def test_blockquote_with_paragraph(): def test_blockquote_nested(): text = md('
And she was like
Hello
') - assert text == '\n> And she was like \n> > Hello\n> \n> \n\n' + assert text == '\n> And she was like \n> > Hello\n\n' def test_br(): @@ -68,9 +74,19 @@ def test_br(): assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' +def test_caption(): + assert md('TEXT
Caption
SPAN
') == 'TEXT\n\nCaption\n\nSPAN' + assert md('
SPAN
Caption
TEXT') == 'SPAN\n\nCaption\n\nTEXT' + + def test_code(): inline_tests('code', '`') - assert md('this_should_not_escape') == '`this_should_not_escape`' + assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' + assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' + assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' + assert md('*this_should_not_escape*') == '`*this_should_not_escape*`' + assert md('this should\t\tnormalize') == '`this should normalize`' + assert md('this should\t\tnormalize') == '`this should normalize`' def test_del(): @@ -85,6 +101,14 @@ def test_em(): inline_tests('em', '*') +def test_header_with_space(): + assert md('

\n\nHello

') == '### Hello\n\n' + assert md('

\n\nHello

') == '#### Hello\n\n' + assert md('
\n\nHello
') == '##### Hello\n\n' + assert md('
\n\nHello\n\n
') == '##### Hello\n\n' + assert md('
\n\nHello \n\n
') == '##### Hello\n\n' + + def test_h1(): assert md('

Hello

') == 'Hello\n=====\n\n' @@ -187,7 +211,18 @@ def test_p(): def test_pre(): assert md('
test\n    foo\nbar
') == '\n```\ntest\n foo\nbar\n```\n' assert md('
test\n    foo\nbar
') == '\n```\ntest\n foo\nbar\n```\n' - assert md('
this_should_not_escape
') == '\n```\nthis_should_not_escape\n```\n' + assert md('
*this_should_not_escape*
') == '\n```\n*this_should_not_escape*\n```\n' + assert md('
*this_should_not_escape*
') == '\n```\n*this_should_not_escape*\n```\n' + assert md('
\t\tthis  should\t\tnot  normalize
') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' + assert md('
\t\tthis  should\t\tnot  normalize
') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' + + +def test_script(): + assert md('foo bar') == 'foo bar' + + +def test_style(): + assert md('foo bar') == 'foo bar' def test_s(): diff --git a/tests/test_tables.py b/tests/test_tables.py index e0c07ea..9120c29 100644 --- a/tests/test_tables.py +++ b/tests/test_tables.py @@ -57,6 +57,26 @@ table_with_paragraphs = """
""" +table_with_linebreaks = """ + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith + Jackson50
EveJackson + Smith94
""" + table_with_header_column = """ @@ -99,6 +119,28 @@ table_head_body = """
""" +table_head_body_missing_head = """ + + + + + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
""" + table_missing_text = """ @@ -159,13 +201,42 @@ table_body = """
""" +table_with_caption = """TEXT + + + + + +
Caption
FirstnameLastnameAge
""" + +table_with_colspan = """ + + + + + + + + + + + + + + +
NameAge
JillSmith50
EveJackson94
""" + def test_table(): assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' - assert md(table_body) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n' + assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'