Compare commits

..

6 Commits
0.7.3 ... 0.7.4

Author SHA1 Message Date
AlexVonB
ea5b22824b Merge branch 'develop' 2021-05-18 10:42:27 +02:00
AlexVonB
9f3c4c9fa0 bump to v0.7.4 2021-05-18 10:42:16 +02:00
AlexVonB
967db26b3a Merge branch 'fix-headless-tables' into develop 2021-05-18 10:41:42 +02:00
AlexVonB
ea81407b87 implemented table parsing correctly
instead of manually walking down the dom tree
in a table, we now rely on the main descent loop
and just implement conversion for rows and cells
correctly. this enables the use of html inside a
table cell.
2021-05-17 14:00:00 +02:00
AlexVonB
e6da15c173 allow tables with headers in first (or any) column 2021-05-17 12:36:48 +02:00
AlexVonB
7dac92e85e Allow for tables without header row
fixes #42
2021-05-16 19:02:04 +02:00
4 changed files with 112 additions and 45 deletions

View File

@@ -84,18 +84,26 @@ class MarkdownConverter(object):
if not children_only and isHeading: if not children_only and isHeading:
convert_children_as_inline = True convert_children_as_inline = True
# Remove whitespace-only textnodes in lists # Remove whitespace-only textnodes in purely nested nodes
def is_list_node(el): def is_nested_node(el):
return el and el.name in ['ol', 'ul', 'li'] return el and el.name in ['ol', 'ul', 'li',
'table', 'thead', 'tbody', 'tfoot',
'tr', 'td', 'th']
if is_list_node(node): if is_nested_node(node):
for el in node.children: for el in node.children:
# Only extract (remove) whitespace-only text node if any of the conditions is true: # Only extract (remove) whitespace-only text node if any of the
# conditions is true:
# - el is the first element in its parent # - el is the first element in its parent
# - el is the last element in its parent # - el is the last element in its parent
# - el is adjacent to an list node # - el is adjacent to a nested node
can_extract = not el.previous_sibling or not el.next_sibling or is_list_node(el.previous_sibling) or is_list_node(el.next_sibling) can_extract = (not el.previous_sibling
if isinstance(el, NavigableString) and six.text_type(el).strip() == '' and can_extract: or not el.next_sibling
or is_nested_node(el.previous_sibling)
or is_nested_node(el.next_sibling))
if (isinstance(el, NavigableString)
and six.text_type(el).strip() == ''
and can_extract):
el.extract() el.extract()
# Convert the children first # Convert the children first
@@ -277,21 +285,28 @@ class MarkdownConverter(object):
return '![%s](%s%s)' % (alt, src, title_part) return '![%s](%s%s)' % (alt, src, title_part)
def convert_table(self, el, text, convert_as_inline): def convert_table(self, el, text, convert_as_inline):
rows = el.find_all('tr') return '\n\n' + text + '\n'
text_data = []
for row in rows: def convert_tr(self, el, text, convert_as_inline):
headers = row.find_all('th') cells = el.find_all(['td', 'th'])
columns = row.find_all('td') is_headrow = all([cell.name == 'th' for cell in cells])
if len(headers) > 0: overline = ''
headers = [head.text.strip() for head in headers] underline = ''
text_data.append('| ' + ' | '.join(headers) + ' |') if is_headrow and not el.previous_sibling:
text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |') # first row and is headline: print headline underline
elif len(columns) > 0: underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
columns = [colm.text.strip() for colm in columns] elif not el.previous_sibling and not el.parent.name != 'table':
text_data.append('| ' + ' | '.join(columns) + ' |') # first row, not headline, and the row's parent is the <table> itself:
else: # print empty headline above this row
continue overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
return '\n'.join(text_data) overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
return overline + '|' + text + '\n' + underline
def convert_th(self, el, text, convert_as_inline):
return ' ' + text + ' |'
def convert_td(self, el, text, convert_as_inline):
return ' ' + text + ' |'
def convert_hr(self, el, text, convert_as_inline): def convert_hr(self, el, text, convert_as_inline):
return '\n\n---\n\n' return '\n\n---\n\n'

View File

@@ -1,2 +1,2 @@
[flake8] [flake8]
ignore = E501 ignore = E501 W503

View File

@@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
pkgmeta = { pkgmeta = {
'__title__': 'markdownify', '__title__': 'markdownify',
'__author__': 'Matthew Tretter', '__author__': 'Matthew Tretter',
'__version__': '0.7.3', '__version__': '0.7.4',
} }

View File

@@ -1,5 +1,4 @@
from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
import re
nested_uls = """ nested_uls = """
@@ -41,8 +40,7 @@ nested_ols = """
</ul>""" </ul>"""
table = re.sub(r'\s+', '', """ table = """<table>
<table>
<tr> <tr>
<th>Firstname</th> <th>Firstname</th>
<th>Lastname</th> <th>Lastname</th>
@@ -58,18 +56,54 @@ table = re.sub(r'\s+', '', """
<td>Jackson</td> <td>Jackson</td>
<td>94</td> <td>94</td>
</tr> </tr>
</table> </table>"""
""")
table_head_body = re.sub(r'\s+', '', """ table_with_html_content = """<table>
<table> <tr>
<th>Firstname</th>
<th>Lastname</th>
<th>Age</th>
</tr>
<tr>
<td><b>Jill</b></td>
<td><i>Smith</i></td>
<td><a href="#">50</a></td>
</tr>
<tr>
<td>Eve</td>
<td>Jackson</td>
<td>94</td>
</tr>
</table>"""
table_with_header_column = """<table>
<tr>
<th>Firstname</th>
<th>Lastname</th>
<th>Age</th>
</tr>
<tr>
<th>Jill</th>
<td>Smith</td>
<td>50</td>
</tr>
<tr>
<th>Eve</th>
<td>Jackson</td>
<td>94</td>
</tr>
</table>"""
table_head_body = """<table>
<thead> <thead>
<tr> <tr>
<th>Firstname</th> <th>Firstname</th>
<th>Lastname</th> <th>Lastname</th>
<th>Age</th> <th>Age</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
<tr> <tr>
@@ -83,17 +117,15 @@ table_head_body = re.sub(r'\s+', '', """
<td>94</td> <td>94</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>"""
""")
table_missing_text = re.sub(r'\s+', '', """ table_missing_text = """<table>
<table>
<thead> <thead>
<tr> <tr>
<th></th> <th></th>
<th>Lastname</th> <th>Lastname</th>
<th>Age</th> <th>Age</th>
</tr> </tr>
</thead> </thead>
<tbody> <tbody>
<tr> <tr>
@@ -107,8 +139,25 @@ table_missing_text = re.sub(r'\s+', '', """
<td>94</td> <td>94</td>
</tr> </tr>
</tbody> </tbody>
</table> </table>"""
""")
table_missing_head = """<table>
<tr>
<td>Firstname</td>
<td>Lastname</td>
<td>Age</td>
</tr>
<tr>
<td>Jill</td>
<td>Smith</td>
<td>50</td>
</tr>
<tr>
<td>Eve</td>
<td>Jackson</td>
<td>94</td>
</tr>
</table>"""
def test_chomp(): def test_chomp():
@@ -322,9 +371,12 @@ def test_div():
def test_table(): def test_table():
assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |' assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_missing_text) == '\n\n| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_missing_head) == '\n\n| | | |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
def test_strong_em_symbol(): def test_strong_em_symbol():