diff --git a/README.rst b/README.rst index 06f319b..51888ea 100644 --- a/README.rst +++ b/README.rst @@ -156,7 +156,12 @@ Creating Custom Converters If you have a special usecase that calls for a special conversion, you can always inherit from ``MarkdownConverter`` and override the method you want to -change: +change. +The function that handles an HTML tag named ``abc`` is called +``convert_abc(self, el, text, convert_as_inline)`` and returns a string +containing the converted HTML tag. +The ``MarkdownConverter`` object will handle the conversion based on the +function names: .. code:: python @@ -173,6 +178,21 @@ change: def md(html, **options): return ImageBlockConverter(**options).convert(html) +.. code:: python + + from markdownify import MarkdownConverter + + class IgnoreParagraphsConverter(MarkdownConverter): + """ + Create a custom MarkdownConverter that ignores paragraphs + """ + def convert_p(self, el, text, convert_as_inline): + return '' + + # Create shorthand method for conversion + def md(html, **options): + return IgnoreParagraphsConverter(**options).convert(html) + Command Line Interface ====================== diff --git a/markdownify/__init__.py b/markdownify/__init__.py index e15ecd4..86226d2 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -152,13 +152,12 @@ class MarkdownConverter(object): def process_text(self, el): text = six.text_type(el) or '' - # dont remove any whitespace when handling pre or code in pre - if not (el.parent.name == 'pre' - or (el.parent.name == 'code' - and el.parent.parent.name == 'pre')): + # normalize whitespace if we're not inside a preformatted element + if not el.find_parent('pre'): text = whitespace_re.sub(' ', text) - if el.parent.name != 'code' and el.parent.name != 'pre': + # escape special characters if we're not inside a preformatted or code element + if not el.find_parent(['pre', 'code', 'kbd', 'samp']): text = self.escape(text) # remove trailing whitespaces if any of the following condition 
is true: @@ -238,7 +237,7 @@ class MarkdownConverter(object): if convert_as_inline: return text - return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else '' + return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else '' def convert_br(self, el, text, convert_as_inline): if convert_as_inline: @@ -266,7 +265,7 @@ class MarkdownConverter(object): return text style = self.options['heading_style'].lower() - text = text.rstrip() + text = text.strip() if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) @@ -351,6 +350,12 @@ class MarkdownConverter(object): return '\n```%s\n%s\n```\n' % (code_language, text) + def convert_script(self, el, text, convert_as_inline): + return '' + + def convert_style(self, el, text, convert_as_inline): + return '' + convert_s = convert_del convert_strong = convert_b @@ -364,20 +369,42 @@ class MarkdownConverter(object): def convert_table(self, el, text, convert_as_inline): return '\n\n' + text + '\n' + def convert_caption(self, el, text, convert_as_inline): + return text + '\n' + + def convert_figcaption(self, el, text, convert_as_inline): + return '\n\n' + text + '\n\n' + def convert_td(self, el, text, convert_as_inline): - return ' ' + text + ' |' + colspan = 1 + if 'colspan' in el.attrs: + colspan = int(el['colspan']) + return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_th(self, el, text, convert_as_inline): - return ' ' + text + ' |' + colspan = 1 + if 'colspan' in el.attrs: + colspan = int(el['colspan']) + return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) - is_headrow = all([cell.name == 'th' for cell in cells]) + is_headrow = ( + all([cell.name == 'th' for cell in cells]) + or (not el.previous_sibling and not el.parent.name == 'tbody') + or (not el.previous_sibling and el.parent.name == 'tbody' and 
len(el.parent.parent.find_all(['thead'])) < 1) + ) overline = '' underline = '' if is_headrow and not el.previous_sibling: # first row and is headline: print headline underline - underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' + full_colspan = 0 + for cell in cells: + if "colspan" in cell.attrs: + full_colspan += int(cell["colspan"]) + else: + full_colspan += 1 + underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' elif (not el.previous_sibling and (el.parent.name == 'table' or (el.parent.name == 'tbody' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index da78649..1e685f3 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -52,6 +52,12 @@ def test_b_spaces(): def test_blockquote(): assert md('
Hello') == '\n> Hello\n\n' + assert md('
\nHello\n') == '\n> Hello\n\n' + + +def test_blockquote_with_nested_paragraph(): + assert md('
') == '\n> Hello\n\n' + assert md('Hello
') == '\n> Hello\n> \n> Hello again\n\n' def test_blockquote_with_paragraph(): @@ -60,7 +66,7 @@ def test_blockquote_with_paragraph(): def test_blockquote_nested(): text = md('Hello
Hello again
And she was like') - assert text == '\n> And she was like \n> > Hello\n> \n> \n\n' + assert text == '\n> And she was like \n> > Hello\n\n' def test_br(): @@ -68,9 +74,19 @@ def test_br(): assert md('aHello
this_should_not_escape') == '`this_should_not_escape`'
+ assert md('*this_should_not_escape*') == '`*this_should_not_escape*`'
+ assert md('*this_should_not_escape*') == '`*this_should_not_escape*`'
+ assert md('*this_should_not_escape*') == '`*this_should_not_escape*`'
+ assert md('*this_should_not_escape*') == '`*this_should_not_escape*`'
+ assert md('this should\t\tnormalize') == '`this should normalize`'
+ assert md('this should\t\tnormalize') == '`this should normalize`'
def test_del():
@@ -85,6 +101,14 @@ def test_em():
inline_tests('em', '*')
+def test_header_with_space():
+ assert md('test\n foo\nbar') == '\n```\ntest\n foo\nbar\n```\n' assert md('
test\n foo\nbar') == '\n```\ntest\n foo\nbar\n```\n'
- assert md('this_should_not_escape') == '\n```\nthis_should_not_escape\n```\n' + assert md('
*this_should_not_escape*') == '\n```\n*this_should_not_escape*\n```\n' + assert md('
*this_should_not_escape*') == '\n```\n*this_should_not_escape*\n```\n'
+ assert md('\t\tthis should\t\tnot normalize') == '\n```\n\t\tthis should\t\tnot normalize\n```\n' + assert md('
\t\tthis should\t\tnot normalize') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
+
+
+def test_script():
+ assert md('foo bar') == 'foo bar'
+
+
+def test_style():
+ assert md('foo bar') == 'foo bar'
def test_s():
diff --git a/tests/test_tables.py b/tests/test_tables.py
index e0c07ea..9120c29 100644
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -57,6 +57,26 @@ table_with_paragraphs = """| Firstname | +Lastname | +Age | +
|---|---|---|
| Jill | +Smith + Jackson | +50 | +
| Eve | +Jackson + Smith | +94 | +
| Firstname | +Lastname | +Age | +
| Jill | +Smith | +50 | +
| Eve | +Jackson | +94 | +
| Firstname | +Lastname | +Age | +
| Name | +Age | +|
|---|---|---|
| Jill | +Smith | +50 | +
| Eve | +Jackson | +94 | +