diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml
index 41240f8..000c0b2 100644
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -16,14 +16,14 @@ jobs:
steps:
- uses: actions/checkout@v2
- - name: Set up Python 3.6
+ - name: Set up Python 3.8
uses: actions/setup-python@v2
with:
- python-version: 3.6
+ python-version: 3.8
- name: Install dependencies
run: |
python -m pip install --upgrade pip
- pip install flake8==2.5.4 pytest
+ pip install flake8==3.8.4 pytest
if [ -f requirements.txt ]; then pip install -r requirements.txt; fi
- name: Lint with flake8
run: |
diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml
index 1a03a7b..9e3a349 100644
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -17,7 +17,7 @@ jobs:
- name: Set up Python
uses: actions/setup-python@v2
with:
- python-version: '3.x'
+ python-version: '3.8'
- name: Install dependencies
run: |
python -m pip install --upgrade pip
diff --git a/README.rst b/README.rst
index 4d21411..1e245c1 100644
--- a/README.rst
+++ b/README.rst
@@ -75,6 +75,18 @@ bullets
lists are nested. Otherwise, the bullet will alternate based on nesting
level. Defaults to ``'*+-'``.
+strong_em_symbol
+ In markdown, both ``*`` and ``_`` are used to encode **strong** or
+ *emphasized* texts. Either of these symbols can be chosen by the options
+ ``ASTERISK`` (default) or ``UNDERSCORE`` respectively.
+
+newline_style
+ Defines the style of marking linebreaks (``<br>``) in markdown. The default
+ value ``SPACES`` of this option will adopt the usual two spaces and a newline,
+ while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash and a
+ newline). While the latter convention is non-standard, it is commonly
+ preferred and supported by a lot of interpreters.
+
Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
index aa5f283..1322ac0 100644
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -1,13 +1,12 @@
-from bs4 import BeautifulSoup, NavigableString
+from bs4 import BeautifulSoup, NavigableString, Comment
import re
import six
convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
-whitespace_re = re.compile(r'[\r\n\s\t ]+')
-FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__'
-wrapped = '
for el in node.children:
@@ -79,15 +90,17 @@ class MarkdownConverter(object):
# Convert the children first
for el in node.children:
- if isinstance(el, NavigableString):
+ if isinstance(el, Comment):
+ continue
+ elif isinstance(el, NavigableString):
text += self.process_text(six.text_type(el))
else:
- text += self.process_tag(el)
+ text += self.process_tag(el, convert_children_as_inline)
if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
if convert_fn and self.should_convert_tag(node.name):
- text = convert_fn(node, text)
+ text = convert_fn(node, text, convert_as_inline)
return text
@@ -100,8 +113,8 @@ class MarkdownConverter(object):
if m:
n = int(m.group(1))
- def convert_tag(el, text):
- return self.convert_hn(n, el, text)
+ def convert_tag(el, text, convert_as_inline):
+ return self.convert_hn(n, el, text, convert_as_inline)
convert_tag.__name__ = 'convert_h%s' % n
setattr(self, convert_tag.__name__, convert_tag)
@@ -127,35 +140,52 @@ class MarkdownConverter(object):
text = (text or '').rstrip()
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
- def convert_a(self, el, text):
+ def convert_a(self, el, text, convert_as_inline):
prefix, suffix, text = chomp(text)
if not text:
return ''
+ if convert_as_inline:
+ return text
href = el.get('href')
title = el.get('title')
- if self.options['autolinks'] and text == href and not title:
+ # For the replacement see #29: text nodes underscores are escaped
+ if self.options['autolinks'] and text.replace(r'\_', '_') == href and not title:
# Shortcut syntax
return '<%s>' % href
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
- def convert_b(self, el, text):
- return self.convert_strong(el, text)
+ def convert_b(self, el, text, convert_as_inline):
+ return self.convert_strong(el, text, convert_as_inline)
- def convert_blockquote(self, el, text):
- return '\n' + line_beginning_re.sub('> ', text) if text else ''
+ def convert_blockquote(self, el, text, convert_as_inline):
- def convert_br(self, el, text):
- return ' \n'
+ if convert_as_inline:
+ return text
- def convert_em(self, el, text):
+ return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else ''
+
+ def convert_br(self, el, text, convert_as_inline):
+ if convert_as_inline:
+ return ""
+
+ if self.options['newline_style'].lower() == BACKSLASH:
+ return '\\\n'
+ else:
+ return ' \n'
+
+ def convert_em(self, el, text, convert_as_inline):
+ em_tag = self.options['strong_em_symbol']
prefix, suffix, text = chomp(text)
if not text:
return ''
- return '%s*%s*%s' % (prefix, text, suffix)
+ return '%s%s%s%s%s' % (prefix, em_tag, text, em_tag, suffix)
- def convert_hn(self, n, el, text):
- style = self.options['heading_style']
+ def convert_hn(self, n, el, text, convert_as_inline):
+ if convert_as_inline:
+ return text
+
+ style = self.options['heading_style'].lower()
text = text.rstrip()
if style == UNDERLINED and n <= 2:
line = '=' if n == 1 else '-'
@@ -165,10 +195,14 @@ class MarkdownConverter(object):
return '%s %s %s\n\n' % (hashes, text, hashes)
return '%s %s\n\n' % (hashes, text)
- def convert_i(self, el, text):
- return self.convert_em(el, text)
+ def convert_i(self, el, text, convert_as_inline):
+ return self.convert_em(el, text, convert_as_inline)
+
+ def convert_list(self, el, text, convert_as_inline):
+
+ # Converting a list to inline is undefined.
+ # Ignoring convert_as_inline for list.
- def convert_list(self, el, text):
nested = False
before_paragraph = False
if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
@@ -186,7 +220,7 @@ class MarkdownConverter(object):
convert_ul = convert_list
convert_ol = convert_list
- def convert_li(self, el, text):
+ def convert_li(self, el, text, convert_as_inline):
parent = el.parent
if parent is not None and parent.name == 'ol':
if parent.get("start"):
@@ -204,22 +238,48 @@ class MarkdownConverter(object):
bullet = bullets[depth % len(bullets)]
return '%s %s\n' % (bullet, text or '')
- def convert_p(self, el, text):
+ def convert_p(self, el, text, convert_as_inline):
+ if convert_as_inline:
+ return text
return '%s\n\n' % text if text else ''
- def convert_strong(self, el, text):
+ def convert_strong(self, el, text, convert_as_inline):
+ strong_tag = 2 * self.options['strong_em_symbol']
prefix, suffix, text = chomp(text)
if not text:
return ''
- return '%s**%s**%s' % (prefix, text, suffix)
+ return '%s%s%s%s%s' % (prefix, strong_tag, text, strong_tag, suffix)
- def convert_img(self, el, text):
+ def convert_img(self, el, text, convert_as_inline):
alt = el.attrs.get('alt', None) or ''
src = el.attrs.get('src', None) or ''
title = el.attrs.get('title', None) or ''
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
+ if convert_as_inline:
+ return alt
+
return '![%s](%s%s)' % (alt, src, title_part)
+ def convert_table(self, el, text, convert_as_inline):
+ rows = el.find_all('tr')
+ text_data = []
+ for row in rows:
+ headers = row.find_all('th')
+ columns = row.find_all('td')
+ if len(headers) > 0:
+ headers = [head.text.strip() for head in headers]
+ text_data.append('| ' + ' | '.join(headers) + ' |')
+ text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |')
+ elif len(columns) > 0:
+ columns = [colm.text.strip() for colm in columns]
+ text_data.append('| ' + ' | '.join(columns) + ' |')
+ else:
+ continue
+ return '\n'.join(text_data)
+
+ def convert_hr(self, el, text, convert_as_inline):
+ return '\n\n---\n\n'
+
def markdownify(html, **options):
return MarkdownConverter(**options).convert(html)
diff --git a/setup.py b/setup.py
index 06ab404..bdf2b70 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
pkgmeta = {
'__title__': 'markdownify',
'__author__': 'Matthew Tretter',
- '__version__': '0.5.2',
+ '__version__': '0.7.1',
}
@@ -50,7 +50,7 @@ class LintCommand(Command):
yield "%s.py" % filename
def run(self):
- from flake8.engine import get_style_guide
+ from flake8.api.legacy import get_style_guide
flake8_style = get_style_guide(config_file='setup.cfg')
paths = self.distribution_files()
report = flake8_style.check_files(paths)
@@ -70,13 +70,13 @@ setup(
zip_safe=False,
include_package_data=True,
setup_requires=[
- 'flake8',
+ 'flake8>=3.8,<4',
],
tests_require=[
- 'pytest',
+ 'pytest>=6.2,<7',
],
install_requires=[
- 'beautifulsoup4', 'six'
+ 'beautifulsoup4>=4.9,<5', 'six>=1.15,<2'
],
classifiers=[
'Environment :: Web Environment',
@@ -87,6 +87,9 @@ setup(
'Programming Language :: Python :: 2.5',
'Programming Language :: Python :: 2.6',
'Programming Language :: Python :: 2.7',
+ 'Programming Language :: Python :: 3.6',
+ 'Programming Language :: Python :: 3.7',
+ 'Programming Language :: Python :: 3.8',
'Topic :: Utilities'
],
cmdclass={
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index 4c480d7..7ee61d2 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -4,3 +4,13 @@ from markdownify import markdownify as md
def test_nested():
text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
assert text == 'This is an [example link](http://example.com/).\n\n'
+
+
+def test_ignore_comments():
+ text = md("")
+ assert text == ""
+
+
+def test_ignore_comments_with_other_tags():
+ text = md("<!-- This is a comment --><a href='http://example.com/'>example link</a>")
+ assert text == "[example link](http://example.com/)"
diff --git a/tests/test_basic.py b/tests/test_basic.py
index 78775b6..bf25ee0 100644
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -10,4 +10,4 @@ def test_soup():
def test_whitespace():
- assert md(' a b \n\n c ') == ' a b c '
+ assert md(' a b \t\t c ') == ' a b c '
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
index 07aae57..68bb81e 100644
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -1,4 +1,5 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED
+from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
+import re
nested_uls = """
@@ -40,6 +41,76 @@ nested_ols = """
"""
+table = re.sub(r'\s+', '', """
+| Firstname | +Lastname | +Age | +
|---|---|---|
| Jill | +Smith | +50 | +
| Eve | +Jackson | +94 | +
| Firstname | +Lastname | +Age | +
|---|---|---|
| Jill | +Smith | +50 | +
| Eve | +Jackson | +94 | +
| + | Lastname | +Age | +
|---|---|---|
| Jill | ++ | 50 | +
| Eve | +Jackson | +94 | +
Hello').strip() == '> Hello' + assert md('
Hello') == '\n> Hello\n\n' + + +def test_blockquote_with_paragraph(): + assert md('
Hello
handsome
') == '\n> Hello\n\nhandsome\n\n' def test_nested_blockquote(): - text = md('And she was like').strip() - assert text == '> And she was like \n> > Hello' + text = md('Hello
And she was like') + assert text == '\n> And she was like \n> > Hello\n> \n> \n\n' def test_br(): @@ -125,6 +204,59 @@ def test_hn(): assert md('Hello
P
C ', heading_style=ATX_CLOSED) == '# A P C #\n\n' + assert md('P
C ', heading_style=ATX) == '# A P C\n\n' + + +def test_hn_nested_simple_tag(): + tag_to_markdown = [ + ("strong", "**strong**"), + ("b", "**b**"), + ("em", "*em*"), + ("i", "*i*"), + ("p", "p"), + ("a", "a"), + ("div", "div"), + ("blockquote", "blockquote"), + ] + + for tag, markdown in tag_to_markdown: + assert md('
') == ''
+ assert md('
') == ''
+ image_attributes_to_markdown = [
+ ("", ""),
+ ("alt='Alt Text'", "Alt Text"),
+ ("alt='Alt Text' title='Optional title'", "Alt Text"),
+ ]
+ for image_attributes, markdown in image_attributes_to_markdown:
+ assert md('
BHello
\nWorld
') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n' + + +def test_head(): + assert md('head') == 'head' + + def test_atx_headings(): assert md('
') == ''
assert md('
') == ''
+
+
+def test_div():
+ assert md('Hello World') == 'Hello World'
+
+
+def test_table():
+ assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |'
+ assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |'
+ assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |'
+
+
+def test_strong_em_symbol():
+ assert md('<strong>Hello</strong>', strong_em_symbol=UNDERSCORE) == '__Hello__'
+ assert md('<b>Hello</b>', strong_em_symbol=UNDERSCORE) == '__Hello__'
+ assert md('<em>Hello</em>', strong_em_symbol=UNDERSCORE) == '_Hello_'
+ assert md('<i>Hello</i>', strong_em_symbol=UNDERSCORE) == '_Hello_'
+
+
+def test_newline_style():
+ assert md('a