Compare commits

..

25 Commits

Author SHA1 Message Date
AlexVonB
9231704988 Merge branch 'develop' 2021-12-11 14:44:58 +01:00
AlexVonB
1613c302bc Merge branch 'develop' 2021-11-17 17:11:01 +01:00
AlexVonB
55c9e84f38 Merge branch 'develop' 2021-09-04 21:50:34 +02:00
AlexVonB
99875683ac Merge branch 'develop' 2021-08-25 08:53:38 +02:00
AlexVonB
eaeb0603eb Merge branch 'develop' 2021-07-11 13:21:20 +02:00
AlexVonB
cb73590623 Merge branch 'develop' 2021-07-11 13:14:29 +02:00
AlexVonB
59417ab115 Merge branch 'develop' 2021-05-30 19:10:49 +02:00
AlexVonB
917b01e548 Merge branch 'develop' 2021-05-30 11:20:32 +02:00
AlexVonB
652714859d Merge branch 'develop' 2021-05-21 14:18:14 +02:00
AlexVonB
ea5b22824b Merge branch 'develop' 2021-05-18 10:42:27 +02:00
AlexVonB
ec5858e42f Merge branch 'develop' 2021-05-16 18:41:24 +02:00
AlexVonB
02bb914ef3 Merge branch 'develop' 2021-05-02 13:49:30 +02:00
AlexVonB
21c0d034d0 Merge branch 'develop' 2021-05-02 10:51:00 +02:00
AlexVonB
e3ddc789a2 Merge branch 'develop' 2021-04-22 12:43:27 +02:00
AlexVonB
2d0cd97323 Merge branch 'develop' 2021-04-22 12:13:03 +02:00
AlexVonB
ec185e2e9c Merge branch 'develop' 2021-02-21 23:09:55 +01:00
AlexVonB
079d1721aa Merge branch 'develop' 2021-02-21 20:58:34 +01:00
AlexVonB
bf24df3e2e bump to v0.6.3 2021-01-12 22:43:18 +01:00
AlexVonB
15329588b1 Merge branch 'develop' 2021-01-12 22:42:58 +01:00
AlexVonB
34ad8485fa bump to v0.6.2 2021-01-12 22:40:03 +01:00
AlexVonB
f0ce934bf8 Merge branch 'develop' 2021-01-12 22:39:47 +01:00
AlexVonB
99cd237f27 Merge branch 'develop' 2021-01-04 10:22:02 +01:00
AlexVonB
2bde8d3e8e Merge branch 'develop' 2021-01-02 16:49:28 +01:00
AlexVonB
8c9b029756 Merge branch 'develop' 2020-09-01 18:10:07 +02:00
AlexVonB
ae50065872 Merge branch 'develop' 2020-08-18 18:53:10 +02:00
6 changed files with 20 additions and 99 deletions

View File

@@ -32,14 +32,14 @@ Convert some HTML to Markdown:
from markdownify import markdownify as md
md('<b>Yay</b> <a href="http://github.com">GitHub</a>') # > '**Yay** [GitHub](http://github.com)'
Specify tags to exclude:
Specify tags to exclude (blacklist):
.. code:: python
from markdownify import markdownify as md
md('<b>Yay</b> <a href="http://github.com">GitHub</a>', strip=['a']) # > '**Yay** GitHub'
\...or specify the tags you want to include:
\...or specify the tags you want to include (whitelist):
.. code:: python
@@ -53,11 +53,11 @@ Options
Markdownify supports the following options:
strip
A list of tags to strip. This option can't be used with the
A list of tags to strip (blacklist). This option can't be used with the
``convert`` option.
convert
A list of tags to convert. This option can't be used with the
A list of tags to convert (whitelist). This option can't be used with the
``strip`` option.
autolinks
@@ -102,50 +102,10 @@ code_language
should be annotated with `````python`` or similar.
Defaults to ``''`` (empty string) and can be any string.
code_language_callback
When the HTML code contains ``pre`` tags that in some way provide the code
language, for example as class, this callback can be used to extract the
language from the tag and prefix it to the converted ``pre`` tag.
The callback gets one single argument, an BeautifylSoup object, and returns
a string containing the code language, or ``None``.
An example to use the class name as code language could be::
def callback(el):
return el['class'][0] if el.has_attr('class') else None
Defaults to ``None``.
escape_asterisks
If set to ``False``, do not escape ``*`` to ``\*`` in text.
Defaults to ``True``.
escape_underscores
If set to ``False``, do not escape ``_`` to ``\_`` in text.
Defaults to ``True``.
keep_inline_images_in
Images are converted to their alt-text when the images are located inside
headlines or table cells. If some inline images should be converted to
markdown images instead, this option can be set to a list of parent tags
that should be allowed to contain inline images, for example ``['td']``.
Defaults to an empty list.
Options may be specified as kwargs to the ``markdownify`` function, or as a
nested ``Options`` class in ``MarkdownConverter`` subclasses.
Converting BeautifulSoup objects
================================
.. code:: python
from markdownify import MarkdownConverter
# Create shorthand method for conversion
def md(soup, **options):
return MarkdownConverter(**options).convert_soup(soup)
Creating Custom Converters
==========================

View File

@@ -25,6 +25,12 @@ ASTERISK = '*'
UNDERSCORE = '_'
def escape(text):
if not text:
return ''
return text.replace('_', r'\_')
def chomp(text):
"""
If the text in an inline tag like b, a, or em contains a leading or trailing
@@ -62,19 +68,15 @@ class MarkdownConverter(object):
class DefaultOptions:
autolinks = True
bullets = '*+-' # An iterable of bullet types.
code_language = ''
code_language_callback = None
convert = None
default_title = False
escape_asterisks = True
escape_underscores = True
heading_style = UNDERLINED
keep_inline_images_in = []
newline_style = SPACES
strip = None
strong_em_symbol = ASTERISK
sub_symbol = ''
sup_symbol = ''
code_language = ''
class Options(DefaultOptions):
pass
@@ -91,9 +93,6 @@ class MarkdownConverter(object):
def convert(self, html):
soup = BeautifulSoup(html, 'html.parser')
return self.convert_soup(soup)
def convert_soup(self, soup):
return self.process_tag(soup, convert_as_inline=False, children_only=True)
def process_tag(self, node, convert_as_inline, children_only=False):
@@ -156,7 +155,7 @@ class MarkdownConverter(object):
text = whitespace_re.sub(' ', text)
if el.parent.name != 'code':
text = self.escape(text)
text = escape(text)
# remove trailing whitespaces if any of the following condition is true:
# - current text node is the last node in li
@@ -194,15 +193,6 @@ class MarkdownConverter(object):
else:
return True
def escape(self, text):
if not text:
return ''
if self.options['escape_asterisks']:
text = text.replace('*', r'\*')
if self.options['escape_underscores']:
text = text.replace('_', r'\_')
return text
def indent(self, text, level):
return line_beginning_re.sub('\t' * level, text) if text else ''
@@ -282,8 +272,7 @@ class MarkdownConverter(object):
src = el.attrs.get('src', None) or ''
title = el.attrs.get('title', None) or ''
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
if (convert_as_inline
and el.parent.name not in self.options['keep_inline_images_in']):
if convert_as_inline:
return alt
return '![%s](%s%s)' % (alt, src, title_part)
@@ -336,12 +325,7 @@ class MarkdownConverter(object):
def convert_pre(self, el, text, convert_as_inline):
if not text:
return ''
code_language = self.options['code_language']
if self.options['code_language_callback']:
code_language = self.options['code_language_callback'](el) or code_language
return '\n```%s\n%s\n```\n' % (code_language, text)
return '\n```%s\n%s\n```\n' % (self.options['code_language'], text)
convert_s = convert_del

View File

@@ -10,7 +10,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
pkgmeta = {
'__title__': 'markdownify',
'__author__': 'Matthew Tretter',
'__version__': '0.10.3',
'__version__': '0.10.1',
}

View File

@@ -133,13 +133,12 @@ def test_hn_nested_simple_tag():
def test_hn_nested_img():
image_attributes_to_markdown = [
("", "", ""),
("alt='Alt Text'", "Alt Text", ""),
("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
("", ""),
("alt='Alt Text'", "Alt Text"),
("alt='Alt Text' title='Optional title'", "Alt Text"),
]
for image_attributes, markdown, title in image_attributes_to_markdown:
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
for image_attributes, markdown in image_attributes_to_markdown:
assert md('<h3>A <img src="/path/to/img.jpg " ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
def test_hn_atx_headings():
@@ -216,12 +215,3 @@ def test_sup():
def test_lang():
assert md('<pre>test\n foo\nbar</pre>', code_language='python') == '\n```python\ntest\n foo\nbar\n```\n'
assert md('<pre><code>test\n foo\nbar</code></pre>', code_language='javascript') == '\n```javascript\ntest\n foo\nbar\n```\n'
def test_lang_callback():
def callback(el):
return el['class'][0] if el.has_attr('class') else None
assert md('<pre class="python">test\n foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n foo\nbar\n```\n'
assert md('<pre class="javascript"><code>test\n foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'
assert md('<pre class="javascript"><code class="javascript">test\n foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n foo\nbar\n```\n'

View File

@@ -1,5 +1,4 @@
from markdownify import MarkdownConverter
from bs4 import BeautifulSoup
class ImageBlockConverter(MarkdownConverter):
@@ -17,9 +16,3 @@ def test_img():
assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />') == '![Alt text](/path/to/img.jpg "Optional title")\n\n'
assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)\n\n'
def test_soup():
html = '<b>test</b>'
soup = BeautifulSoup(html, 'html.parser')
assert MarkdownConverter().convert_soup(soup) == '**test**'

View File

@@ -1,14 +1,8 @@
from markdownify import markdownify as md
def test_asterisks():
assert md('*hey*dude*') == r'\*hey\*dude\*'
assert md('*hey*dude*', escape_asterisks=False) == r'*hey*dude*'
def test_underscore():
assert md('_hey_dude_') == r'\_hey\_dude\_'
assert md('_hey_dude_', escape_underscores=False) == r'_hey_dude_'
def test_xml_entities():