From 7dac92e85e9288595167320b4f32325e8f56ff97 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Sun, 16 May 2021 19:02:00 +0200 Subject: [PATCH 1/3] Allow for tables without header row fixes #42 --- markdownify/__init__.py | 8 ++++++- tests/test_conversions.py | 45 +++++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 17 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 6230fb8..284eba3 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -279,14 +279,20 @@ class MarkdownConverter(object): def convert_table(self, el, text, convert_as_inline): rows = el.find_all('tr') text_data = [] + rendered_header = False for row in rows: headers = row.find_all('th') columns = row.find_all('td') - if len(headers) > 0: + if not rendered_header and len(headers) > 0: headers = [head.text.strip() for head in headers] text_data.append('| ' + ' | '.join(headers) + ' |') text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |') + rendered_header = True elif len(columns) > 0: + if not rendered_header: + text_data.append('| ' + ' | '.join([''] * len(columns)) + ' |') + text_data.append('| ' + ' | '.join(['---'] * len(columns)) + ' |') + rendered_header = True columns = [colm.text.strip() for colm in columns] text_data.append('| ' + ' | '.join(columns) + ' |') else: diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 6663204..e6f70c0 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -41,8 +41,7 @@ nested_ols = """ """ -table = re.sub(r'\s+', '', """ - +table = """
@@ -58,18 +57,16 @@ table = re.sub(r'\s+', '', """ -
Firstname LastnameJackson 94
-""") +""" -table_head_body = re.sub(r'\s+', '', """ - +table_head_body = """
- + - + @@ -83,17 +80,15 @@ table_head_body = re.sub(r'\s+', '', """ -
Firstname Lastname Age
94
-""") +""" -table_missing_text = re.sub(r'\s+', '', """ - +table_missing_text = """
- + - + @@ -107,8 +102,25 @@ table_missing_text = re.sub(r'\s+', '', """ -
Lastname Age
94
-""") +""" + +table_missing_head = """ + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
""" def test_chomp(): @@ -325,6 +337,7 @@ def test_table(): assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |' + assert md(table_missing_head) == '| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' def test_strong_em_symbol(): From e6da15c173d52aaea2d78708463bf7e796bf4ccd Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Mon, 17 May 2021 12:36:48 +0200 Subject: [PATCH 2/3] allow tables with headers in first (or any) column --- markdownify/__init__.py | 21 ++++++++++----------- tests/test_conversions.py | 20 ++++++++++++++++++++ 2 files changed, 30 insertions(+), 11 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 284eba3..d3a2e6e 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -281,20 +281,19 @@ class MarkdownConverter(object): text_data = [] rendered_header = False for row in rows: - headers = row.find_all('th') - columns = row.find_all('td') - if not rendered_header and len(headers) > 0: - headers = [head.text.strip() for head in headers] - text_data.append('| ' + ' | '.join(headers) + ' |') - text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |') + cells = row.find_all(['td', 'th']) + is_headrow = all([cell.name == 'th' for cell in cells]) + texts = [cell.text.strip() for cell in cells] + if not rendered_header and is_headrow: + text_data.append('| ' + ' | '.join(texts) + ' |') + text_data.append('| ' + ' | '.join(['---'] * len(cells)) + ' |') rendered_header = True - elif len(columns) > 0: + elif len(cells) > 0: if not rendered_header: - text_data.append('| ' + ' | '.join([''] * len(columns)) + ' |') - text_data.append('| ' + ' | '.join(['---'] * len(columns)) + ' |') + text_data.append('| ' + ' | '.join([''] * len(cells)) + ' |') + text_data.append('| ' + ' | '.join(['---'] * len(cells)) + ' |') rendered_header = True - columns = [colm.text.strip() for colm in columns] - text_data.append('| ' + ' | '.join(columns) + ' |') + text_data.append('| ' + ' | '.join(texts) + ' |') else: continue return '\n'.join(text_data) diff --git a/tests/test_conversions.py b/tests/test_conversions.py index e6f70c0..e2f7c39 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -60,6 +60,25 @@ table = """
""" +table_with_header_column = """ + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
""" + + table_head_body = """ @@ -335,6 +354,7 @@ def test_div(): def test_table(): assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' + assert md(table_with_header_column) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |' assert md(table_missing_head) == '| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' From ea81407b87af48859719ef7bf454edb7792af6a7 Mon Sep 17 00:00:00 2001 From: AlexVonB Date: Mon, 17 May 2021 14:00:00 +0200 Subject: [PATCH 3/3] implemented table parsing correctly instead of manually walking down the dom tree in a table, we now rely on the main descent loop and just implement conversion for rows and cells correctly. this enables the use of html inside a table cell. --- markdownify/__init__.py | 66 ++++++++++++++++++++++----------------- setup.cfg | 2 +- tests/test_conversions.py | 31 ++++++++++++++---- 3 files changed, 64 insertions(+), 35 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index d3a2e6e..6c64c60 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -84,18 +84,26 @@ class MarkdownConverter(object): if not children_only and isHeading: convert_children_as_inline = True - # Remove whitespace-only textnodes in lists - def is_list_node(el): - return el and el.name in ['ol', 'ul', 'li'] + # Remove whitespace-only textnodes in purely nested nodes + def is_nested_node(el): + return el and el.name in ['ol', 'ul', 'li', + 'table', 'thead', 'tbody', 'tfoot', + 'tr', 'td', 'th'] - if is_list_node(node): + if is_nested_node(node): for el in node.children: - # Only extract (remove) whitespace-only text node if any of the conditions is true: + # Only extract (remove) whitespace-only text node if any of the + # conditions is true: # - el is the first element in its parent # - el is the last element in its parent - # - el is adjacent to an list node - can_extract = not el.previous_sibling or not el.next_sibling or is_list_node(el.previous_sibling) or is_list_node(el.next_sibling) - if isinstance(el, NavigableString) and six.text_type(el).strip() == '' and can_extract: + # - el is adjacent to an nested node + can_extract = (not el.previous_sibling + or not el.next_sibling + or is_nested_node(el.previous_sibling) + or is_nested_node(el.next_sibling)) + if (isinstance(el, NavigableString) + and six.text_type(el).strip() == '' + and can_extract): el.extract() # Convert the children first @@ -277,26 +285,28 @@ class MarkdownConverter(object): return '![%s](%s%s)' % (alt, src, title_part) def convert_table(self, el, text, convert_as_inline): - rows = el.find_all('tr') - text_data = [] - rendered_header = False - for row in rows: - cells = row.find_all(['td', 'th']) - is_headrow = all([cell.name == 'th' for cell in cells]) - texts = [cell.text.strip() for cell in cells] - if not rendered_header and is_headrow: - text_data.append('| ' + ' | '.join(texts) + ' |') - text_data.append('| ' + ' | '.join(['---'] * len(cells)) + ' |') - rendered_header = True - elif len(cells) > 0: - if not rendered_header: - text_data.append('| ' + ' | '.join([''] * len(cells)) + ' |') - text_data.append('| ' + ' | '.join(['---'] * len(cells)) + ' |') - rendered_header = True - text_data.append('| ' + ' | '.join(texts) + ' |') - else: - continue - return '\n'.join(text_data) + return '\n\n' + text + '\n' + + def convert_tr(self, el, text, convert_as_inline): + cells = el.find_all(['td', 'th']) + is_headrow = all([cell.name == 'th' for cell in cells]) + overline = '' + underline = '' + if is_headrow and not el.previous_sibling: + # first row and is headline: print headline underline + underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' + elif not el.previous_sibling and not el.parent.name != 'table': + # first row, not headline, and the parent is sth. like tbody: + # print empty headline above this row + overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n' + overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' + return overline + '|' + text + '\n' + underline + + def convert_th(self, el, text, convert_as_inline): + return ' ' + text + ' |' + + def convert_td(self, el, text, convert_as_inline): + return ' ' + text + ' |' def convert_hr(self, el, text, convert_as_inline): return '\n\n---\n\n' diff --git a/setup.cfg b/setup.cfg index e44b810..32e2565 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,2 +1,2 @@ [flake8] -ignore = E501 +ignore = E501 W503 diff --git a/tests/test_conversions.py b/tests/test_conversions.py index e2f7c39..31fe7f2 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,5 +1,4 @@ from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE -import re nested_uls = """ @@ -60,6 +59,25 @@ table = """
""" +table_with_html_content = """ + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
""" + + table_with_header_column = """ @@ -353,11 +371,12 @@ def test_div(): def test_table(): - assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' - assert md(table_with_header_column) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' - assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' - assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |' - assert md(table_missing_head) == '| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' + assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n' + assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n' + assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n' def test_strong_em_symbol():
Firstname