Compare commits
17 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b5c724ab33 | ||
|
|
964d89fa8a | ||
|
|
46dc1a002d | ||
|
|
8c810eb8a8 | ||
|
|
f6c8daf8a5 | ||
|
|
75a678dab9 | ||
|
|
0a5c89aa49 | ||
|
|
51390d7389 | ||
|
|
50b4640db2 | ||
|
|
7861b330cd | ||
|
|
2ec33384de | ||
|
|
c1672aee44 | ||
|
|
43dbe20aaf | ||
|
|
46af45bb3c | ||
|
|
2bd0772685 | ||
|
|
383847ee86 | ||
|
|
74ddc408cc |
5
.github/workflows/python-app.yml
vendored
5
.github/workflows/python-app.yml
vendored
@@ -23,7 +23,10 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install tox
|
||||
pip install --upgrade setuptools setuptools_scm wheel build tox
|
||||
- name: Lint and test
|
||||
run: |
|
||||
tox
|
||||
- name: Build
|
||||
run: |
|
||||
python -m build -nwsx .
|
||||
|
||||
4
.github/workflows/python-publish.yml
vendored
4
.github/workflows/python-publish.yml
vendored
@@ -21,11 +21,11 @@ jobs:
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install setuptools wheel twine
|
||||
pip install --upgrade setuptools setuptools_scm wheel build twine
|
||||
- name: Build and publish
|
||||
env:
|
||||
TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
|
||||
TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
|
||||
run: |
|
||||
python setup.py sdist bdist_wheel
|
||||
python -m build -nwsx .
|
||||
twine upload dist/*
|
||||
|
||||
@@ -1 +1,2 @@
|
||||
include README.rst
|
||||
prune tests
|
||||
|
||||
15
README.rst
15
README.rst
@@ -1,8 +1,8 @@
|
||||
|build| |version| |license| |downloads|
|
||||
|
||||
.. |build| image:: https://img.shields.io/github/workflow/status/matthewwithanm/python-markdownify/Python%20application/develop
|
||||
.. |build| image:: https://img.shields.io/github/actions/workflow/status/matthewwithanm/python-markdownify/python-app.yml?branch=develop
|
||||
:alt: GitHub Workflow Status
|
||||
:target: https://github.com/matthewwithanm/python-markdownify/actions?query=workflow%3A%22Python+application%22
|
||||
:target: https://github.com/matthewwithanm/python-markdownify/actions/workflows/python-app.yml?query=workflow%3A%22Python+application%22
|
||||
|
||||
.. |version| image:: https://img.shields.io/pypi/v/markdownify
|
||||
:alt: PyPI version
|
||||
@@ -87,7 +87,11 @@ strong_em_symbol
|
||||
sub_symbol, sup_symbol
|
||||
Define the chars that surround ``<sub>`` and ``<sup>`` text. Defaults to an
|
||||
empty string, because this is non-standard behavior. Could be something like
|
||||
``~`` and ``^`` to result in ``~sub~`` and ``^sup^``.
|
||||
``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. If the value starts
|
||||
with ``<`` and ends with ``>``, it is treated as an HTML tag and a ``/`` is
|
||||
inserted after the ``<`` in the string used after the text; this allows
|
||||
specifying ``<sub>`` to use raw HTML in the output for subscripts, for
|
||||
example.
|
||||
|
||||
newline_style
|
||||
Defines the style of marking linebreaks (``<br>``) in markdown. The default
|
||||
@@ -123,6 +127,11 @@ escape_underscores
|
||||
If set to ``False``, do not escape ``_`` to ``\_`` in text.
|
||||
Defaults to ``True``.
|
||||
|
||||
escape_misc
|
||||
If set to ``False``, do not escape miscellaneous punctuation characters
|
||||
that sometimes have Markdown significance in text.
|
||||
Defaults to ``True``.
|
||||
|
||||
keep_inline_images_in
|
||||
Images are converted to their alt-text when the images are located inside
|
||||
headlines or table cells. If some inline images should be converted to
|
||||
|
||||
@@ -43,15 +43,22 @@ def abstract_inline_conversion(markup_fn):
|
||||
"""
|
||||
This abstracts all simple inline tags like b, em, del, ...
|
||||
Returns a function that wraps the chomped text in a pair of the string
|
||||
that is returned by markup_fn. markup_fn is necessary to allow for
|
||||
that is returned by markup_fn, with '/' inserted in the string used after
|
||||
the text if it looks like an HTML tag. markup_fn is necessary to allow for
|
||||
references to self.strong_em_symbol etc.
|
||||
"""
|
||||
def implementation(self, el, text, convert_as_inline):
|
||||
markup = markup_fn(self)
|
||||
markup_prefix = markup_fn(self)
|
||||
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
|
||||
markup_suffix = '</' + markup_prefix[1:]
|
||||
else:
|
||||
markup_suffix = markup_prefix
|
||||
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
|
||||
return text
|
||||
prefix, suffix, text = chomp(text)
|
||||
if not text:
|
||||
return ''
|
||||
return '%s%s%s%s%s' % (prefix, markup, text, markup, suffix)
|
||||
return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix)
|
||||
return implementation
|
||||
|
||||
|
||||
@@ -69,6 +76,7 @@ class MarkdownConverter(object):
|
||||
default_title = False
|
||||
escape_asterisks = True
|
||||
escape_underscores = True
|
||||
escape_misc = True
|
||||
heading_style = UNDERLINED
|
||||
keep_inline_images_in = []
|
||||
newline_style = SPACES
|
||||
@@ -199,6 +207,9 @@ class MarkdownConverter(object):
|
||||
def escape(self, text):
|
||||
if not text:
|
||||
return ''
|
||||
if self.options['escape_misc']:
|
||||
text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
|
||||
text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
|
||||
if self.options['escape_asterisks']:
|
||||
text = text.replace('*', r'\*')
|
||||
if self.options['escape_underscores']:
|
||||
@@ -315,7 +326,7 @@ class MarkdownConverter(object):
|
||||
def convert_li(self, el, text, convert_as_inline):
|
||||
parent = el.parent
|
||||
if parent is not None and parent.name == 'ol':
|
||||
if parent.get("start"):
|
||||
if parent.get("start") and str(parent.get("start")).isnumeric():
|
||||
start = int(parent.get("start"))
|
||||
else:
|
||||
start = 1
|
||||
@@ -377,13 +388,13 @@ class MarkdownConverter(object):
|
||||
|
||||
def convert_td(self, el, text, convert_as_inline):
|
||||
colspan = 1
|
||||
if 'colspan' in el.attrs:
|
||||
if 'colspan' in el.attrs and el['colspan'].isdigit():
|
||||
colspan = int(el['colspan'])
|
||||
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
|
||||
|
||||
def convert_th(self, el, text, convert_as_inline):
|
||||
colspan = 1
|
||||
if 'colspan' in el.attrs:
|
||||
if 'colspan' in el.attrs and el['colspan'].isdigit():
|
||||
colspan = int(el['colspan'])
|
||||
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
|
||||
|
||||
@@ -400,7 +411,7 @@ class MarkdownConverter(object):
|
||||
# first row and is headline: print headline underline
|
||||
full_colspan = 0
|
||||
for cell in cells:
|
||||
if "colspan" in cell.attrs:
|
||||
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
|
||||
full_colspan += int(cell["colspan"])
|
||||
else:
|
||||
full_colspan += 1
|
||||
|
||||
45
pyproject.toml
Normal file
45
pyproject.toml
Normal file
@@ -0,0 +1,45 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "markdownify"
|
||||
version = "0.13.1"
|
||||
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
|
||||
description = "Convert HTML to markdown."
|
||||
readme = "README.rst"
|
||||
classifiers = [
|
||||
"Environment :: Web Environment",
|
||||
"Framework :: Django",
|
||||
"Intended Audience :: Developers",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
"Programming Language :: Python :: 2.5",
|
||||
"Programming Language :: Python :: 2.6",
|
||||
"Programming Language :: Python :: 2.7",
|
||||
"Programming Language :: Python :: 3.6",
|
||||
"Programming Language :: Python :: 3.7",
|
||||
"Programming Language :: Python :: 3.8",
|
||||
"Topic :: Utilities",
|
||||
]
|
||||
dependencies = [
|
||||
"beautifulsoup4>=4.9,<5",
|
||||
"six>=1.15,<2"
|
||||
]
|
||||
|
||||
[project.urls]
|
||||
Homepage = "http://github.com/matthewwithanm/python-markdownify"
|
||||
Download = "http://github.com/matthewwithanm/python-markdownify/tarball/master"
|
||||
|
||||
[project.scripts]
|
||||
markdownify = "markdownify.main:main"
|
||||
|
||||
[tool.setuptools]
|
||||
zip-safe = false
|
||||
include-package-data = true
|
||||
|
||||
[tool.setuptools.packages.find]
|
||||
include = ["markdownify", "markdownify.*"]
|
||||
namespaces = false
|
||||
|
||||
[tool.setuptools_scm]
|
||||
52
setup.py
52
setup.py
@@ -1,52 +0,0 @@
|
||||
#/usr/bin/env python
|
||||
import codecs
|
||||
import os
|
||||
from setuptools import setup, find_packages
|
||||
|
||||
|
||||
read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
|
||||
|
||||
pkgmeta = {
|
||||
'__title__': 'markdownify',
|
||||
'__author__': 'Matthew Tretter',
|
||||
'__version__': '0.11.6',
|
||||
}
|
||||
|
||||
read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
|
||||
|
||||
setup(
|
||||
name='markdownify',
|
||||
description='Convert HTML to markdown.',
|
||||
long_description=read(os.path.join(os.path.dirname(__file__), 'README.rst')),
|
||||
version=pkgmeta['__version__'],
|
||||
author=pkgmeta['__author__'],
|
||||
author_email='m@tthewwithanm.com',
|
||||
url='http://github.com/matthewwithanm/python-markdownify',
|
||||
download_url='http://github.com/matthewwithanm/python-markdownify/tarball/master',
|
||||
packages=find_packages(),
|
||||
zip_safe=False,
|
||||
include_package_data=True,
|
||||
install_requires=[
|
||||
'beautifulsoup4>=4.9,<5',
|
||||
'six>=1.15,<2',
|
||||
],
|
||||
classifiers=[
|
||||
'Environment :: Web Environment',
|
||||
'Framework :: Django',
|
||||
'Intended Audience :: Developers',
|
||||
'License :: OSI Approved :: MIT License',
|
||||
'Operating System :: OS Independent',
|
||||
'Programming Language :: Python :: 2.5',
|
||||
'Programming Language :: Python :: 2.6',
|
||||
'Programming Language :: Python :: 2.7',
|
||||
'Programming Language :: Python :: 3.6',
|
||||
'Programming Language :: Python :: 3.7',
|
||||
'Programming Language :: Python :: 3.8',
|
||||
'Topic :: Utilities'
|
||||
],
|
||||
entry_points={
|
||||
'console_scripts': [
|
||||
'markdownify = markdownify.main:main'
|
||||
]
|
||||
}
|
||||
)
|
||||
@@ -87,6 +87,16 @@ def test_code():
|
||||
assert md('<code><span>*this_should_not_escape*</span></code>') == '`*this_should_not_escape*`'
|
||||
assert md('<code>this should\t\tnormalize</code>') == '`this should normalize`'
|
||||
assert md('<code><span>this should\t\tnormalize</span></code>') == '`this should normalize`'
|
||||
assert md('<code>foo<b>bar</b>baz</code>') == '`foobarbaz`'
|
||||
assert md('<kbd>foo<i>bar</i>baz</kbd>') == '`foobarbaz`'
|
||||
assert md('<samp>foo<del> bar </del>baz</samp>') == '`foo bar baz`'
|
||||
assert md('<samp>foo <del>bar</del> baz</samp>') == '`foo bar baz`'
|
||||
assert md('<code>foo<em> bar </em>baz</code>') == '`foo bar baz`'
|
||||
assert md('<code>foo<code> bar </code>baz</code>') == '`foo bar baz`'
|
||||
assert md('<code>foo<strong> bar </strong>baz</code>') == '`foo bar baz`'
|
||||
assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
|
||||
assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
|
||||
assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
|
||||
|
||||
|
||||
def test_del():
|
||||
@@ -215,6 +225,17 @@ def test_pre():
|
||||
assert md('<pre><span>*this_should_not_escape*</span></pre>') == '\n```\n*this_should_not_escape*\n```\n'
|
||||
assert md('<pre>\t\tthis should\t\tnot normalize</pre>') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
|
||||
assert md('<pre><span>\t\tthis should\t\tnot normalize</span></pre>') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
|
||||
assert md('<pre>foo<b>\nbar\n</b>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<i>\nbar\n</i>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo\n<i>bar</i>\nbaz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<i>\n</i>baz</pre>') == '\n```\nfoo\nbaz\n```\n'
|
||||
assert md('<pre>foo<del>\nbar\n</del>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<em>\nbar\n</em>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<code>\nbar\n</code>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<strong>\nbar\n</strong>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<s>\nbar\n</s>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<sup>\nbar\n</sup>baz</pre>', sup_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
assert md('<pre>foo<sub>\nbar\n</sub>baz</pre>', sub_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
|
||||
|
||||
|
||||
def test_script():
|
||||
@@ -247,11 +268,13 @@ def test_strong_em_symbol():
|
||||
def test_sub():
|
||||
assert md('<sub>foo</sub>') == 'foo'
|
||||
assert md('<sub>foo</sub>', sub_symbol='~') == '~foo~'
|
||||
assert md('<sub>foo</sub>', sub_symbol='<sub>') == '<sub>foo</sub>'
|
||||
|
||||
|
||||
def test_sup():
|
||||
assert md('<sup>foo</sup>') == 'foo'
|
||||
assert md('<sup>foo</sup>', sup_symbol='^') == '^foo^'
|
||||
assert md('<sup>foo</sup>', sup_symbol='<sup>') == '<sup>foo</sup>'
|
||||
|
||||
|
||||
def test_lang():
|
||||
|
||||
@@ -12,7 +12,7 @@ def test_underscore():
|
||||
|
||||
|
||||
def test_xml_entities():
|
||||
assert md('&') == '&'
|
||||
assert md('&') == r'\&'
|
||||
|
||||
|
||||
def test_named_entities():
|
||||
@@ -25,4 +25,23 @@ def test_hexadecimal_entities():
|
||||
|
||||
|
||||
def test_single_escaping_entities():
|
||||
assert md('&amp;') == '&'
|
||||
assert md('&amp;') == r'\&'
|
||||
|
||||
|
||||
def test_misc():
|
||||
assert md('\\*') == r'\\\*'
|
||||
assert md('<foo>') == r'\<foo\>'
|
||||
assert md('# foo') == r'\# foo'
|
||||
assert md('> foo') == r'\> foo'
|
||||
assert md('~~foo~~') == r'\~\~foo\~\~'
|
||||
assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
|
||||
assert md('---\n') == '\\-\\-\\-\n'
|
||||
assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
|
||||
assert md('`x`') == r'\`x\`'
|
||||
assert md('[text](link)') == r'\[text](link)'
|
||||
assert md('1. x') == r'1\. x'
|
||||
assert md('not a number. x') == r'not a number. x'
|
||||
assert md('1) x') == r'1\) x'
|
||||
assert md('not a number) x') == r'not a number) x'
|
||||
assert md('|not table|') == r'\|not table\|'
|
||||
assert md(r'\ <foo> &amp; | ` `', escape_misc=False) == r'\ <foo> & | ` `'
|
||||
|
||||
@@ -43,6 +43,9 @@ nested_ols = """
|
||||
def test_ol():
|
||||
assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
|
||||
assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
|
||||
|
||||
|
||||
def test_nested_ols():
|
||||
|
||||
@@ -215,7 +215,7 @@ table_with_colspan = """<table>
|
||||
<th>Age</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td>Jill</td>
|
||||
<td colspan="1">Jill</td>
|
||||
<td>Smith</td>
|
||||
<td>50</td>
|
||||
</tr>
|
||||
@@ -226,6 +226,17 @@ table_with_colspan = """<table>
|
||||
</tr>
|
||||
</table>"""
|
||||
|
||||
table_with_undefined_colspan = """<table>
|
||||
<tr>
|
||||
<th colspan="undefined">Name</th>
|
||||
<th>Age</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<td colspan="-1">Jill</td>
|
||||
<td>Smith</td>
|
||||
</tr>
|
||||
</table>"""
|
||||
|
||||
|
||||
def test_table():
|
||||
assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
@@ -240,3 +251,4 @@ def test_table():
|
||||
assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
|
||||
assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
|
||||
assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
|
||||
|
||||
Reference in New Issue
Block a user