Compare commits

...

12 Commits

Author SHA1 Message Date
AlexVonB
8c810eb8a8 Merge branch 'develop' 2024-07-14 21:20:04 +02:00
AlexVonB
f6c8daf8a5 bump to v0.13.0 2024-07-14 21:19:35 +02:00
AlexVonB
75a678dab9 fix pytest version to 8 2024-07-14 21:02:49 +02:00
AlexVonB
0a5c89aa49 added test for ol start check 2024-06-23 14:30:07 +02:00
microdnd
51390d7389 handle non-numeric ol start values (#127)
Co-authored-by: Mico <mico_wu@trendmicro.com>
2024-06-23 14:28:53 +02:00
AlexVonB
50b4640db2 better naming for markup variables 2024-06-23 13:30:08 +02:00
Joseph Myers
7861b330cd Special-case use of HTML tags for converting <sub> / <sup> (#119)
Allow different strings before / after `<sub>` / `<sup>` content

In particular, this allows setting `sub_symbol='<sub>'`,
`sup_symbol='<sup>'`, to use raw HTML in the output when
converting subscripts and superscripts.
2024-06-23 13:28:05 +02:00
AlexVonB
2ec33384de handle un-parsable colspan values
fixes #126
2024-06-23 13:17:20 +02:00
samypr100
c1672aee44 Update MANIFEST.in to exclude tests during packaging (#125) 2024-06-23 12:59:14 +02:00
AlexVonB
43dbe20aaf fixed github action badges
see https://github.com/badges/shields/issues/8671
2024-04-04 21:50:02 +02:00
Joseph Myers
46af45bb3c Escape all characters with Markdown significance (#118)
* Escape all characters with Markdown significance

There are many punctuation characters that sometimes have significance
in Markdown; more systematically escape them all (based on a new
escape_misc configuration option).

A limited attempt is made to limit the escaping of '.' and ')' to the
context where they might have Markdown significance (after a number,
where they can indicate an ordered list item); no such attempt is made
for the other characters (and even that limiting of '.' and ')' may
not be entirely safe in all cases, as it's possible the HTML could
have the number outside the block being escaped in one go,
e.g. `<span>1</span>.`.

---------

Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
2024-04-04 21:42:58 +02:00
Joseph Myers
2bd0772685 Avoid inline styles inside <code> / <pre> conversion (#117)
* Avoid inline styles inside `<code>` / `<pre>` conversion

The check used for this is analogous to that used to avoid escaping
potential markup characters inside such tags.

Fixes #103

---------

Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>
2024-04-04 20:55:54 +02:00
9 changed files with 93 additions and 15 deletions

View File

@@ -1 +1,2 @@
include README.rst
prune tests

View File

@@ -1,8 +1,8 @@
|build| |version| |license| |downloads|
.. |build| image:: https://img.shields.io/github/workflow/status/matthewwithanm/python-markdownify/Python%20application/develop
.. |build| image:: https://img.shields.io/github/actions/workflow/status/matthewwithanm/python-markdownify/python-app.yml?branch=develop
:alt: GitHub Workflow Status
:target: https://github.com/matthewwithanm/python-markdownify/actions?query=workflow%3A%22Python+application%22
:target: https://github.com/matthewwithanm/python-markdownify/actions/workflows/python-app.yml?query=workflow%3A%22Python+application%22
.. |version| image:: https://img.shields.io/pypi/v/markdownify
:alt: Pypi version
@@ -87,7 +87,11 @@ strong_em_symbol
sub_symbol, sup_symbol
Define the chars that surround ``<sub>`` and ``<sup>`` text. Defaults to an
empty string, because this is non-standard behavior. Could be something like
``~`` and ``^`` to result in ``~sub~`` and ``^sup^``.
``~`` and ``^`` to result in ``~sub~`` and ``^sup^``. If the value starts
with ``<`` and ends with ``>``, it is treated as an HTML tag and a ``/`` is
inserted after the ``<`` in the string used after the text; this allows
specifying ``<sub>`` to use raw HTML in the output for subscripts, for
example.
newline_style
Defines the style of marking linebreaks (``<br>``) in markdown. The default
@@ -123,6 +127,11 @@ escape_underscores
If set to ``False``, do not escape ``_`` to ``\_`` in text.
Defaults to ``True``.
escape_misc
If set to ``False``, do not escape miscellaneous punctuation characters
that sometimes have Markdown significance in text.
Defaults to ``True``.
keep_inline_images_in
Images are converted to their alt-text when the images are located inside
headlines or table cells. If some inline images should be converted to

View File

@@ -43,15 +43,22 @@ def abstract_inline_conversion(markup_fn):
"""
This abstracts all simple inline tags like b, em, del, ...
Returns a function that wraps the chomped text in a pair of the string
that is returned by markup_fn. markup_fn is necessary to allow for
that is returned by markup_fn, with '/' inserted in the string used after
the text if it looks like an HTML tag. markup_fn is necessary to allow for
references to self.strong_em_symbol etc.
"""
def implementation(self, el, text, convert_as_inline):
markup = markup_fn(self)
markup_prefix = markup_fn(self)
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
markup_suffix = '</' + markup_prefix[1:]
else:
markup_suffix = markup_prefix
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
return text
prefix, suffix, text = chomp(text)
if not text:
return ''
return '%s%s%s%s%s' % (prefix, markup, text, markup, suffix)
return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix)
return implementation
@@ -69,6 +76,7 @@ class MarkdownConverter(object):
default_title = False
escape_asterisks = True
escape_underscores = True
escape_misc = True
heading_style = UNDERLINED
keep_inline_images_in = []
newline_style = SPACES
@@ -199,6 +207,9 @@ class MarkdownConverter(object):
def escape(self, text):
if not text:
return ''
if self.options['escape_misc']:
text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
if self.options['escape_asterisks']:
text = text.replace('*', r'\*')
if self.options['escape_underscores']:
@@ -315,7 +326,7 @@ class MarkdownConverter(object):
def convert_li(self, el, text, convert_as_inline):
parent = el.parent
if parent is not None and parent.name == 'ol':
if parent.get("start"):
if parent.get("start") and str(parent.get("start")).isnumeric():
start = int(parent.get("start"))
else:
start = 1
@@ -377,13 +388,13 @@ class MarkdownConverter(object):
def convert_td(self, el, text, convert_as_inline):
colspan = 1
if 'colspan' in el.attrs:
if 'colspan' in el.attrs and el['colspan'].isdigit():
colspan = int(el['colspan'])
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
def convert_th(self, el, text, convert_as_inline):
colspan = 1
if 'colspan' in el.attrs:
if 'colspan' in el.attrs and el['colspan'].isdigit():
colspan = int(el['colspan'])
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
@@ -400,7 +411,7 @@ class MarkdownConverter(object):
# first row and is headline: print headline underline
full_colspan = 0
for cell in cells:
if "colspan" in cell.attrs:
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
full_colspan += int(cell["colspan"])
else:
full_colspan += 1

View File

@@ -9,7 +9,7 @@ read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
pkgmeta = {
'__title__': 'markdownify',
'__author__': 'Matthew Tretter',
'__version__': '0.12.1',
'__version__': '0.13.0',
}
read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()

View File

@@ -87,6 +87,16 @@ def test_code():
assert md('<code><span>*this_should_not_escape*</span></code>') == '`*this_should_not_escape*`'
assert md('<code>this should\t\tnormalize</code>') == '`this should normalize`'
assert md('<code><span>this should\t\tnormalize</span></code>') == '`this should normalize`'
assert md('<code>foo<b>bar</b>baz</code>') == '`foobarbaz`'
assert md('<kbd>foo<i>bar</i>baz</kbd>') == '`foobarbaz`'
assert md('<samp>foo<del> bar </del>baz</samp>') == '`foo bar baz`'
assert md('<samp>foo <del>bar</del> baz</samp>') == '`foo bar baz`'
assert md('<code>foo<em> bar </em>baz</code>') == '`foo bar baz`'
assert md('<code>foo<code> bar </code>baz</code>') == '`foo bar baz`'
assert md('<code>foo<strong> bar </strong>baz</code>') == '`foo bar baz`'
assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
def test_del():
@@ -215,6 +225,17 @@ def test_pre():
assert md('<pre><span>*this_should_not_escape*</span></pre>') == '\n```\n*this_should_not_escape*\n```\n'
assert md('<pre>\t\tthis should\t\tnot normalize</pre>') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
assert md('<pre><span>\t\tthis should\t\tnot normalize</span></pre>') == '\n```\n\t\tthis should\t\tnot normalize\n```\n'
assert md('<pre>foo<b>\nbar\n</b>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<i>\nbar\n</i>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo\n<i>bar</i>\nbaz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<i>\n</i>baz</pre>') == '\n```\nfoo\nbaz\n```\n'
assert md('<pre>foo<del>\nbar\n</del>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<em>\nbar\n</em>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<code>\nbar\n</code>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<strong>\nbar\n</strong>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<s>\nbar\n</s>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<sup>\nbar\n</sup>baz</pre>', sup_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
assert md('<pre>foo<sub>\nbar\n</sub>baz</pre>', sub_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
def test_script():
@@ -247,11 +268,13 @@ def test_strong_em_symbol():
def test_sub():
assert md('<sub>foo</sub>') == 'foo'
assert md('<sub>foo</sub>', sub_symbol='~') == '~foo~'
assert md('<sub>foo</sub>', sub_symbol='<sub>') == '<sub>foo</sub>'
def test_sup():
assert md('<sup>foo</sup>') == 'foo'
assert md('<sup>foo</sup>', sup_symbol='^') == '^foo^'
assert md('<sup>foo</sup>', sup_symbol='<sup>') == '<sup>foo</sup>'
def test_lang():

View File

@@ -12,7 +12,7 @@ def test_underscore():
def test_xml_entities():
assert md('&amp;') == '&'
assert md('&amp;') == r'\&'
def test_named_entities():
@@ -25,4 +25,23 @@ def test_hexadecimal_entities():
def test_single_escaping_entities():
assert md('&amp;amp;') == '&amp;'
assert md('&amp;amp;') == r'\&amp;'
def text_misc():
assert md('\\*') == r'\\\*'
assert md('<foo>') == r'\<foo\>'
assert md('# foo') == r'\# foo'
assert md('> foo') == r'\> foo'
assert md('~~foo~~') == r'\~\~foo\~\~'
assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
assert md('---\n') == '\\-\\-\\-\n'
assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
assert md('`x`') == r'\`x\`'
assert md('[text](link)') == r'\[text](link)'
assert md('1. x') == r'1\. x'
assert md('not a number. x') == r'not a number. x'
assert md('1) x') == r'1\) x'
assert md('not a number) x') == r'not a number) x'
assert md('|not table|') == r'\|not table\|'
assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'

View File

@@ -43,6 +43,9 @@ nested_ols = """
def test_ol():
assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
def test_nested_ols():

View File

@@ -215,7 +215,7 @@ table_with_colspan = """<table>
<th>Age</th>
</tr>
<tr>
<td>Jill</td>
<td colspan="1">Jill</td>
<td>Smith</td>
<td>50</td>
</tr>
@@ -226,6 +226,17 @@ table_with_colspan = """<table>
</tr>
</table>"""
table_with_undefined_colspan = """<table>
<tr>
<th colspan="undefined">Name</th>
<th>Age</th>
</tr>
<tr>
<td colspan="-1">Jill</td>
<td>Smith</td>
</tr>
</table>"""
def test_table():
assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
@@ -240,3 +251,4 @@ def test_table():
assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'

View File

@@ -4,7 +4,7 @@ envlist = py38
[testenv]
passenv = PYTHONPATH
deps =
pytest
pytest==8
flake8
restructuredtext_lint
Pygments