diff --git a/.github/workflows/python-app.yml b/.github/workflows/python-app.yml index 41240f8..000c0b2 100644 --- a/.github/workflows/python-app.yml +++ b/.github/workflows/python-app.yml @@ -16,14 +16,14 @@ jobs: steps: - uses: actions/checkout@v2 - - name: Set up Python 3.6 + - name: Set up Python 3.8 uses: actions/setup-python@v2 with: - python-version: 3.6 + python-version: 3.8 - name: Install dependencies run: | python -m pip install --upgrade pip - pip install flake8==2.5.4 pytest + pip install flake8==3.8.4 pytest if [ -f requirements.txt ]; then pip install -r requirements.txt; fi - name: Lint with flake8 run: | diff --git a/.github/workflows/python-publish.yml b/.github/workflows/python-publish.yml index 1a03a7b..9e3a349 100644 --- a/.github/workflows/python-publish.yml +++ b/.github/workflows/python-publish.yml @@ -17,7 +17,7 @@ jobs: - name: Set up Python uses: actions/setup-python@v2 with: - python-version: '3.x' + python-version: '3.8' - name: Install dependencies run: | python -m pip install --upgrade pip diff --git a/README.rst b/README.rst index 4d21411..1e245c1 100644 --- a/README.rst +++ b/README.rst @@ -75,6 +75,18 @@ bullets lists are nested. Otherwise, the bullet will alternate based on nesting level. Defaults to ``'*+-'``. +strong_em_symbol + In markdown, both ``*`` and ``_`` are used to encode **strong** or + *emphasized* texts. Either of these symbols can be chosen by the options + ``ASTERISK`` (default) or ``UNDERSCORE`` respectively. + +newline_style + Defines the style of marking linebreaks (``
``) in markdown. The default + value ``SPACES`` of this option will adopt the usual two spaces and a newline, + while ``BACKSLASH`` will convert a linebreak to ``\\n`` (a backslash and a + newline). While the latter convention is non-standard, it is commonly + preferred and supported by a lot of interpreters. + Options may be specified as kwargs to the ``markdownify`` function, or as a nested ``Options`` class in ``MarkdownConverter`` subclasses. diff --git a/markdownify/__init__.py b/markdownify/__init__.py index aa5f283..1322ac0 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -1,13 +1,12 @@ -from bs4 import BeautifulSoup, NavigableString +from bs4 import BeautifulSoup, NavigableString, Comment import re import six convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) -whitespace_re = re.compile(r'[\r\n\s\t ]+') -FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__' -wrapped = '
%%s
' % FRAGMENT_ID +whitespace_re = re.compile(r'[\t ]+') +html_heading_re = re.compile(r'h[1-6]') # Heading styles @@ -16,6 +15,14 @@ ATX_CLOSED = 'atx_closed' UNDERLINED = 'underlined' SETEXT = UNDERLINED +# Newline style +SPACES = 'spaces' +BACKSLASH = 'backslash' + +# Strong and emphasis style +ASTERISK = '*' +UNDERSCORE = '_' + def escape(text): if not text: @@ -47,6 +54,8 @@ class MarkdownConverter(object): autolinks = True heading_style = UNDERLINED bullets = '*+-' # An iterable of bullet types. + strong_em_symbol = ASTERISK + newline_style = SPACES class Options(DefaultOptions): pass @@ -62,15 +71,17 @@ class MarkdownConverter(object): ' convert, but not both.') def convert(self, html): - # We want to take advantage of the html5 parsing, but we don't actually - # want a full document. Therefore, we'll mark our fragment with an id, - # create the document, and extract the element with the id. - html = wrapped % html soup = BeautifulSoup(html, 'html.parser') - return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True) + return self.process_tag(soup, convert_as_inline=False, children_only=True) - def process_tag(self, node, children_only=False): + def process_tag(self, node, convert_as_inline, children_only=False): text = '' + # markdown headings can't include block elements (elements w/newlines) + isHeading = html_heading_re.match(node.name) is not None + convert_children_as_inline = convert_as_inline + + if not children_only and isHeading: + convert_children_as_inline = True # Clean newline-only textnodes outside
         for el in node.children:
@@ -79,15 +90,17 @@ class MarkdownConverter(object):
 
         # Convert the children first
         for el in node.children:
-            if isinstance(el, NavigableString):
+            if isinstance(el, Comment):
+                continue
+            elif isinstance(el, NavigableString):
                 text += self.process_text(six.text_type(el))
             else:
-                text += self.process_tag(el)
+                text += self.process_tag(el, convert_children_as_inline)
 
         if not children_only:
             convert_fn = getattr(self, 'convert_%s' % node.name, None)
             if convert_fn and self.should_convert_tag(node.name):
-                text = convert_fn(node, text)
+                text = convert_fn(node, text, convert_as_inline)
 
         return text
 
@@ -100,8 +113,8 @@ class MarkdownConverter(object):
         if m:
             n = int(m.group(1))
 
-            def convert_tag(el, text):
-                return self.convert_hn(n, el, text)
+            def convert_tag(el, text, convert_as_inline):
+                return self.convert_hn(n, el, text, convert_as_inline)
 
             convert_tag.__name__ = 'convert_h%s' % n
             setattr(self, convert_tag.__name__, convert_tag)
@@ -127,35 +140,52 @@ class MarkdownConverter(object):
         text = (text or '').rstrip()
         return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
 
-    def convert_a(self, el, text):
+    def convert_a(self, el, text, convert_as_inline):
         prefix, suffix, text = chomp(text)
         if not text:
             return ''
+        if convert_as_inline:
+            return text
         href = el.get('href')
         title = el.get('title')
-        if self.options['autolinks'] and text == href and not title:
+        # For the replacement see #29: underscores in text nodes are escaped
+        if self.options['autolinks'] and text.replace(r'\_', '_') == href and not title:
             # Shortcut syntax
             return '<%s>' % href
         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
         return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text
 
-    def convert_b(self, el, text):
-        return self.convert_strong(el, text)
+    def convert_b(self, el, text, convert_as_inline):
+        return self.convert_strong(el, text, convert_as_inline)
 
-    def convert_blockquote(self, el, text):
-        return '\n' + line_beginning_re.sub('> ', text) if text else ''
+    def convert_blockquote(self, el, text, convert_as_inline):
 
-    def convert_br(self, el, text):
-        return '  \n'
+        if convert_as_inline:
+            return text
 
-    def convert_em(self, el, text):
+        return '\n' + (line_beginning_re.sub('> ', text) + '\n\n') if text else ''
+
+    def convert_br(self, el, text, convert_as_inline):
+        if convert_as_inline:
+            return ""
+
+        if self.options['newline_style'].lower() == BACKSLASH:
+            return '\\\n'
+        else:
+            return '  \n'
+
+    def convert_em(self, el, text, convert_as_inline):
+        em_tag = self.options['strong_em_symbol']
         prefix, suffix, text = chomp(text)
         if not text:
             return ''
-        return '%s*%s*%s' % (prefix, text, suffix)
+        return '%s%s%s%s%s' % (prefix, em_tag, text, em_tag, suffix)
 
-    def convert_hn(self, n, el, text):
-        style = self.options['heading_style']
+    def convert_hn(self, n, el, text, convert_as_inline):
+        if convert_as_inline:
+            return text
+
+        style = self.options['heading_style'].lower()
         text = text.rstrip()
         if style == UNDERLINED and n <= 2:
             line = '=' if n == 1 else '-'
@@ -165,10 +195,14 @@ class MarkdownConverter(object):
             return '%s %s %s\n\n' % (hashes, text, hashes)
         return '%s %s\n\n' % (hashes, text)
 
-    def convert_i(self, el, text):
-        return self.convert_em(el, text)
+    def convert_i(self, el, text, convert_as_inline):
+        return self.convert_em(el, text, convert_as_inline)
+
+    def convert_list(self, el, text, convert_as_inline):
+
+        # Converting a list to inline is undefined.
+        # Ignoring convert_as_inline for list.
 
-    def convert_list(self, el, text):
         nested = False
         before_paragraph = False
         if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
@@ -186,7 +220,7 @@ class MarkdownConverter(object):
     convert_ul = convert_list
     convert_ol = convert_list
 
-    def convert_li(self, el, text):
+    def convert_li(self, el, text, convert_as_inline):
         parent = el.parent
         if parent is not None and parent.name == 'ol':
             if parent.get("start"):
@@ -204,22 +238,48 @@ class MarkdownConverter(object):
             bullet = bullets[depth % len(bullets)]
         return '%s %s\n' % (bullet, text or '')
 
-    def convert_p(self, el, text):
+    def convert_p(self, el, text, convert_as_inline):
+        if convert_as_inline:
+            return text
         return '%s\n\n' % text if text else ''
 
-    def convert_strong(self, el, text):
+    def convert_strong(self, el, text, convert_as_inline):
+        strong_tag = 2 * self.options['strong_em_symbol']
         prefix, suffix, text = chomp(text)
         if not text:
             return ''
-        return '%s**%s**%s' % (prefix, text, suffix)
+        return '%s%s%s%s%s' % (prefix, strong_tag, text, strong_tag, suffix)
 
-    def convert_img(self, el, text):
+    def convert_img(self, el, text, convert_as_inline):
         alt = el.attrs.get('alt', None) or ''
         src = el.attrs.get('src', None) or ''
         title = el.attrs.get('title', None) or ''
         title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
+        if convert_as_inline:
+            return alt
+
         return '![%s](%s%s)' % (alt, src, title_part)
 
+    def convert_table(self, el, text, convert_as_inline):
+        rows = el.find_all('tr')
+        text_data = []
+        for row in rows:
+            headers = row.find_all('th')
+            columns = row.find_all('td')
+            if len(headers) > 0:
+                headers = [head.text.strip() for head in headers]
+                text_data.append('| ' + ' | '.join(headers) + ' |')
+                text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |')
+            elif len(columns) > 0:
+                columns = [colm.text.strip() for colm in columns]
+                text_data.append('| ' + ' | '.join(columns) + ' |')
+            else:
+                continue
+        return '\n'.join(text_data)
+
+    def convert_hr(self, el, text, convert_as_inline):
+        return '\n\n---\n\n'
+
 
 def markdownify(html, **options):
     return MarkdownConverter(**options).convert(html)
diff --git a/setup.py b/setup.py
index 06ab404..bdf2b70 100644
--- a/setup.py
+++ b/setup.py
@@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
 pkgmeta = {
     '__title__': 'markdownify',
     '__author__': 'Matthew Tretter',
-    '__version__': '0.5.2',
+    '__version__': '0.7.1',
 }
 
 
@@ -50,7 +50,7 @@ class LintCommand(Command):
                 yield "%s.py" % filename
 
     def run(self):
-        from flake8.engine import get_style_guide
+        from flake8.api.legacy import get_style_guide
         flake8_style = get_style_guide(config_file='setup.cfg')
         paths = self.distribution_files()
         report = flake8_style.check_files(paths)
@@ -70,13 +70,13 @@ setup(
     zip_safe=False,
     include_package_data=True,
     setup_requires=[
-        'flake8',
+        'flake8>=3.8,<4',
     ],
     tests_require=[
-        'pytest',
+        'pytest>=6.2,<7',
     ],
     install_requires=[
-        'beautifulsoup4', 'six'
+        'beautifulsoup4>=4.9,<5', 'six>=1.15,<2'
     ],
     classifiers=[
         'Environment :: Web Environment',
@@ -87,6 +87,9 @@ setup(
         'Programming Language :: Python :: 2.5',
         'Programming Language :: Python :: 2.6',
         'Programming Language :: Python :: 2.7',
+        'Programming Language :: Python :: 3.6',
+        'Programming Language :: Python :: 3.7',
+        'Programming Language :: Python :: 3.8',
         'Topic :: Utilities'
     ],
     cmdclass={
diff --git a/tests/test_advanced.py b/tests/test_advanced.py
index 4c480d7..7ee61d2 100644
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -4,3 +4,13 @@ from markdownify import markdownify as md
 def test_nested():
     text = md('

This is an example link.

') assert text == 'This is an [example link](http://example.com/).\n\n' + + +def test_ignore_comments(): + text = md("") + assert text == "" + + +def test_ignore_comments_with_other_tags(): + text = md("example link") + assert text == "[example link](http://example.com/)" diff --git a/tests/test_basic.py b/tests/test_basic.py index 78775b6..bf25ee0 100644 --- a/tests/test_basic.py +++ b/tests/test_basic.py @@ -10,4 +10,4 @@ def test_soup(): def test_whitespace(): - assert md(' a b \n\n c ') == ' a b c ' + assert md(' a b \t\t c ') == ' a b c ' diff --git a/tests/test_conversions.py b/tests/test_conversions.py index 07aae57..68bb81e 100644 --- a/tests/test_conversions.py +++ b/tests/test_conversions.py @@ -1,4 +1,5 @@ -from markdownify import markdownify as md, ATX, ATX_CLOSED +from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE +import re nested_uls = """ @@ -40,6 +41,76 @@ nested_ols = """ """ +table = re.sub(r'\s+', '', """ + + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
+""") + + +table_head_body = re.sub(r'\s+', '', """ + + + + + + + + + + + + + + + + + + + + +
FirstnameLastnameAge
JillSmith50
EveJackson94
+""") + +table_missing_text = re.sub(r'\s+', '', """ + + + + + + + + + + + + + + + + + + + + +
LastnameAge
Jill50
EveJackson94
+""") + + def test_chomp(): assert md(' ') == ' ' assert md(' ') == ' ' @@ -52,7 +123,11 @@ def test_chomp(): def test_a(): - assert md('Google') == '[Google](http://google.com)' + assert md('Google') == '[Google](https://google.com)' + assert md('https://google.com', autolinks=False) == '[https://google.com](https://google.com)' + assert md('https://google.com') == '' + assert md('https://community.kde.org/Get_Involved') == '' + assert md('https://community.kde.org/Get_Involved', autolinks=False) == '[https://community.kde.org/Get\\_Involved](https://community.kde.org/Get_Involved)' def test_a_spaces(): @@ -89,12 +164,16 @@ def test_b_spaces(): def test_blockquote(): - assert md('
Hello
').strip() == '> Hello' + assert md('
Hello
') == '\n> Hello\n\n' + + +def test_blockquote_with_paragraph(): + assert md('
Hello

handsome

') == '\n> Hello\n\nhandsome\n\n' def test_nested_blockquote(): - text = md('
And she was like
Hello
').strip() - assert text == '> And she was like \n> > Hello' + text = md('
And she was like
Hello
') + assert text == '\n> And she was like \n> > Hello\n> \n> \n\n' def test_br(): @@ -125,6 +204,59 @@ def test_hn(): assert md('
Hello
') == '###### Hello\n\n' +def test_hn_chained(): + assert md('

First

\n

Second

\n

Third

', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n' + assert md('X

First

', heading_style=ATX) == 'X# First\n\n' + + +def test_hn_nested_tag_heading_style(): + assert md('

A

P

C

', heading_style=ATX_CLOSED) == '# A P C #\n\n' + assert md('

A

P

C

', heading_style=ATX) == '# A P C\n\n' + + +def test_hn_nested_simple_tag(): + tag_to_markdown = [ + ("strong", "**strong**"), + ("b", "**b**"), + ("em", "*em*"), + ("i", "*i*"), + ("p", "p"), + ("a", "a"), + ("div", "div"), + ("blockquote", "blockquote"), + ] + + for tag, markdown in tag_to_markdown: + assert md('

A <' + tag + '>' + tag + ' B

') == '### A ' + markdown + ' B\n\n' + + assert md('

A
B

', heading_style=ATX) == '### A B\n\n' + + # Nested lists not supported + # assert md('

A
  • li1
  • l2

', heading_style=ATX) == '### A li1 li2 B\n\n' + + +def test_hn_nested_img(): + assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")' + assert md('Alt text') == '![Alt text](/path/to/img.jpg)' + image_attributes_to_markdown = [ + ("", ""), + ("alt='Alt Text'", "Alt Text"), + ("alt='Alt Text' title='Optional title'", "Alt Text"), + ] + for image_attributes, markdown in image_attributes_to_markdown: + assert md('

A B

') == '### A ' + markdown + ' B\n\n' + + +def test_hr(): + assert md('Hello
World') == 'Hello\n\n---\n\nWorld' + assert md('Hello
World') == 'Hello\n\n---\n\nWorld' + assert md('

Hello

\n
\n

World

') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n' + + +def test_head(): + assert md('head') == 'head' + + def test_atx_headings(): assert md('

Hello

', heading_style=ATX) == '# Hello\n\n' assert md('

Hello

', heading_style=ATX) == '## Hello\n\n' @@ -179,3 +311,24 @@ def test_bullets(): def test_img(): assert md('Alt text') == '![Alt text](/path/to/img.jpg "Optional title")' assert md('Alt text') == '![Alt text](/path/to/img.jpg)' + + +def test_div(): + assert md('Hello World') == 'Hello World' + + +def test_table(): + assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' + assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |' + assert md(table_missing_text) == '| | Lastname | Age |\n| --- | --- | --- |\n| Jill | | 50 |\n| Eve | Jackson | 94 |' + + +def test_strong_em_symbol(): + assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '__Hello__' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' + assert md('Hello', strong_em_symbol=UNDERSCORE) == '_Hello_' + + +def test_newline_style(): + assert md('a
b
c', newline_style=BACKSLASH) == 'a\\\nb\\\nc' diff --git a/tests/test_escaping.py b/tests/test_escaping.py index 9b0d4fa..23a828c 100644 --- a/tests/test_escaping.py +++ b/tests/test_escaping.py @@ -2,7 +2,7 @@ from markdownify import markdownify as md def test_underscore(): - assert md('_hey_dude_') == '\_hey\_dude\_' + assert md('_hey_dude_') == r'\_hey\_dude\_' def test_xml_entities():