Compare commits
1 Commit
master
...
chrispy/su
| Author | SHA1 | Date | |
|---|---|---|---|
| | 2d0a14a2a7 | | |
21
.github/workflows/python-app.yml
vendored
21
.github/workflows/python-app.yml
vendored
@@ -15,7 +15,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
@@ -30,22 +30,3 @@ jobs:
|
||||
- name: Build
|
||||
run: |
|
||||
python -m build -nwsx .
|
||||
|
||||
types:
|
||||
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python 3.8
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
python-version: 3.8
|
||||
- name: Install dependencies
|
||||
run: |
|
||||
python -m pip install --upgrade pip
|
||||
pip install --upgrade setuptools setuptools_scm wheel build tox mypy types-beautifulsoup4
|
||||
- name: Check types
|
||||
run: |
|
||||
mypy .
|
||||
mypy --strict tests/types.py
|
||||
|
||||
2
.github/workflows/python-publish.yml
vendored
2
.github/workflows/python-publish.yml
vendored
@@ -13,7 +13,7 @@ jobs:
|
||||
runs-on: ubuntu-latest
|
||||
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
- uses: actions/checkout@v2
|
||||
- name: Set up Python
|
||||
uses: actions/setup-python@v2
|
||||
with:
|
||||
|
||||
12
README.rst
12
README.rst
@@ -110,7 +110,7 @@ code_language_callback
|
||||
When the HTML code contains ``pre`` tags that in some way provide the code
|
||||
language, for example as class, this callback can be used to extract the
|
||||
language from the tag and prefix it to the converted ``pre`` tag.
|
||||
The callback gets one single argument, a BeautifulSoup object, and returns
|
||||
The callback gets one single argument, an BeautifylSoup object, and returns
|
||||
a string containing the code language, or ``None``.
|
||||
An example to use the class name as code language could be::
|
||||
|
||||
@@ -157,16 +157,10 @@ strip_document
|
||||
within the document are unaffected.
|
||||
Defaults to ``STRIP``.
|
||||
|
||||
strip_pre
|
||||
Controls whether leading/trailing blank lines are removed from ``<pre>``
|
||||
tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
|
||||
``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
|
||||
Defaults to ``STRIP``.
|
||||
|
||||
bs4_options
|
||||
Specify additional configuration options for the ``BeautifulSoup`` object
|
||||
used to interpret the HTML markup. String and list values (such as ``lxml``
|
||||
or ``html5lib``) are treated as ``features`` arguments to control parser
|
||||
used to interpret the HTML markup. String and list values (such as ``lxml``)
|
||||
are treated as ``features`` parameter arguments to control parser
|
||||
selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
|
||||
are treated as full kwargs to be used for the BeautifulSoup constructor,
|
||||
allowing specification of any parameter. For parameter details, see the
|
||||
|
||||
@@ -11,10 +11,6 @@ re_whitespace = re.compile(r'[\t ]+')
|
||||
re_all_whitespace = re.compile(r'[\t \r\n]+')
|
||||
re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
|
||||
re_html_heading = re.compile(r'h(\d+)')
|
||||
re_pre_lstrip1 = re.compile(r'^ *\n')
|
||||
re_pre_rstrip1 = re.compile(r'\n *$')
|
||||
re_pre_lstrip = re.compile(r'^[ \n]*\n')
|
||||
re_pre_rstrip = re.compile(r'[ \n]*$')
|
||||
|
||||
# Pattern for creating convert_<tag> function names from tag names
|
||||
re_make_convert_fn_name = re.compile(r'[\[\]:-]')
|
||||
@@ -41,9 +37,6 @@ re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
|
||||
# confused with a list item
|
||||
re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
|
||||
|
||||
# Find consecutive backtick sequences in a string
|
||||
re_backtick_runs = re.compile(r'`+')
|
||||
|
||||
# Heading styles
|
||||
ATX = 'atx'
|
||||
ATX_CLOSED = 'atx_closed'
|
||||
@@ -58,25 +51,10 @@ BACKSLASH = 'backslash'
|
||||
ASTERISK = '*'
|
||||
UNDERSCORE = '_'
|
||||
|
||||
# Document/pre strip styles
|
||||
# Document strip styles
|
||||
LSTRIP = 'lstrip'
|
||||
RSTRIP = 'rstrip'
|
||||
STRIP = 'strip'
|
||||
STRIP_ONE = 'strip_one'
|
||||
|
||||
|
||||
def strip1_pre(text):
|
||||
"""Strip one leading and trailing newline from a <pre> string."""
|
||||
text = re_pre_lstrip1.sub('', text)
|
||||
text = re_pre_rstrip1.sub('', text)
|
||||
return text
|
||||
|
||||
|
||||
def strip_pre(text):
|
||||
"""Strip all leading and trailing newlines from a <pre> string."""
|
||||
text = re_pre_lstrip.sub('', text)
|
||||
text = re_pre_rstrip.sub('', text)
|
||||
return text
|
||||
|
||||
|
||||
def chomp(text):
|
||||
@@ -190,7 +168,6 @@ class MarkdownConverter(object):
|
||||
newline_style = SPACES
|
||||
strip = None
|
||||
strip_document = STRIP
|
||||
strip_pre = STRIP
|
||||
strong_em_symbol = ASTERISK
|
||||
sub_symbol = ''
|
||||
sup_symbol = ''
|
||||
@@ -483,24 +460,10 @@ class MarkdownConverter(object):
|
||||
return ' \n'
|
||||
|
||||
def convert_code(self, el, text, parent_tags):
|
||||
if '_noformat' in parent_tags:
|
||||
if 'pre' in parent_tags:
|
||||
return text
|
||||
|
||||
prefix, suffix, text = chomp(text)
|
||||
if not text:
|
||||
return ''
|
||||
|
||||
# Find the maximum number of consecutive backticks in the text, then
|
||||
# delimit the code span with one more backtick than that
|
||||
max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
|
||||
markup_delimiter = '`' * (max_backticks + 1)
|
||||
|
||||
# If the maximum number of backticks is greater than zero, add a space
|
||||
# to avoid interpretation of inside backticks as literals
|
||||
if max_backticks > 0:
|
||||
text = " " + text + " "
|
||||
|
||||
return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)
|
||||
converter = abstract_inline_conversion(lambda self: '`')
|
||||
return converter(self, el, text, parent_tags)
|
||||
|
||||
convert_del = abstract_inline_conversion(lambda self: '~~')
|
||||
|
||||
@@ -693,15 +656,6 @@ class MarkdownConverter(object):
|
||||
if self.options['code_language_callback']:
|
||||
code_language = self.options['code_language_callback'](el) or code_language
|
||||
|
||||
if self.options['strip_pre'] == STRIP:
|
||||
text = strip_pre(text) # remove all leading/trailing newlines
|
||||
elif self.options['strip_pre'] == STRIP_ONE:
|
||||
text = strip1_pre(text) # remove one leading/trailing newline
|
||||
elif self.options['strip_pre'] is None:
|
||||
pass # leave leading and trailing newlines as-is
|
||||
else:
|
||||
raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])
|
||||
|
||||
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
|
||||
|
||||
def convert_q(self, el, text, parent_tags):
|
||||
@@ -735,13 +689,13 @@ class MarkdownConverter(object):
|
||||
def convert_td(self, el, text, parent_tags):
|
||||
colspan = 1
|
||||
if 'colspan' in el.attrs and el['colspan'].isdigit():
|
||||
colspan = max(1, min(1000, int(el['colspan'])))
|
||||
colspan = int(el['colspan'])
|
||||
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
|
||||
|
||||
def convert_th(self, el, text, parent_tags):
|
||||
colspan = 1
|
||||
if 'colspan' in el.attrs and el['colspan'].isdigit():
|
||||
colspan = max(1, min(1000, int(el['colspan'])))
|
||||
colspan = int(el['colspan'])
|
||||
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
|
||||
|
||||
def convert_tr(self, el, text, parent_tags):
|
||||
@@ -762,7 +716,7 @@ class MarkdownConverter(object):
|
||||
full_colspan = 0
|
||||
for cell in cells:
|
||||
if 'colspan' in cell.attrs and cell['colspan'].isdigit():
|
||||
full_colspan += max(1, min(1000, int(cell['colspan'])))
|
||||
full_colspan += int(cell["colspan"])
|
||||
else:
|
||||
full_colspan += 1
|
||||
if ((is_headrow
|
||||
|
||||
@@ -1,77 +0,0 @@
|
||||
from _typeshed import Incomplete
|
||||
from typing import Callable, Union
|
||||
|
||||
ATX: str
|
||||
ATX_CLOSED: str
|
||||
UNDERLINED: str
|
||||
SETEXT = UNDERLINED
|
||||
SPACES: str
|
||||
BACKSLASH: str
|
||||
ASTERISK: str
|
||||
UNDERSCORE: str
|
||||
LSTRIP: str
|
||||
RSTRIP: str
|
||||
STRIP: str
|
||||
STRIP_ONE: str
|
||||
|
||||
|
||||
def markdownify(
|
||||
html: str,
|
||||
autolinks: bool = ...,
|
||||
bs4_options: str = ...,
|
||||
bullets: str = ...,
|
||||
code_language: str = ...,
|
||||
code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
|
||||
convert: Union[list[str], None] = ...,
|
||||
default_title: bool = ...,
|
||||
escape_asterisks: bool = ...,
|
||||
escape_underscores: bool = ...,
|
||||
escape_misc: bool = ...,
|
||||
heading_style: str = ...,
|
||||
keep_inline_images_in: list[str] = ...,
|
||||
newline_style: str = ...,
|
||||
strip: Union[list[str], None] = ...,
|
||||
strip_document: Union[str, None] = ...,
|
||||
strip_pre: str = ...,
|
||||
strong_em_symbol: str = ...,
|
||||
sub_symbol: str = ...,
|
||||
sup_symbol: str = ...,
|
||||
table_infer_header: bool = ...,
|
||||
wrap: bool = ...,
|
||||
wrap_width: int = ...,
|
||||
) -> str: ...
|
||||
|
||||
|
||||
class MarkdownConverter:
|
||||
def __init__(
|
||||
self,
|
||||
autolinks: bool = ...,
|
||||
bs4_options: str = ...,
|
||||
bullets: str = ...,
|
||||
code_language: str = ...,
|
||||
code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
|
||||
convert: Union[list[str], None] = ...,
|
||||
default_title: bool = ...,
|
||||
escape_asterisks: bool = ...,
|
||||
escape_underscores: bool = ...,
|
||||
escape_misc: bool = ...,
|
||||
heading_style: str = ...,
|
||||
keep_inline_images_in: list[str] = ...,
|
||||
newline_style: str = ...,
|
||||
strip: Union[list[str], None] = ...,
|
||||
strip_document: Union[str, None] = ...,
|
||||
strip_pre: str = ...,
|
||||
strong_em_symbol: str = ...,
|
||||
sub_symbol: str = ...,
|
||||
sup_symbol: str = ...,
|
||||
table_infer_header: bool = ...,
|
||||
wrap: bool = ...,
|
||||
wrap_width: int = ...,
|
||||
) -> None:
|
||||
...
|
||||
|
||||
def convert(self, html: str) -> str:
|
||||
...
|
||||
|
||||
def convert_soup(self, soup: Incomplete) -> str:
|
||||
...
|
||||
9
markdownify/main.py
Executable file → Normal file
9
markdownify/main.py
Executable file → Normal file
@@ -70,11 +70,12 @@ def main(argv=sys.argv[1:]):
|
||||
parser.add_argument('-w', '--wrap', action='store_true',
|
||||
help="Wrap all text paragraphs at --wrap-width characters.")
|
||||
parser.add_argument('--wrap-width', type=int, default=80)
|
||||
parser.add_argument('--bs4-options',
|
||||
parser.add_argument('-p', '--beautiful-soup-parser',
|
||||
dest='beautiful_soup_parser',
|
||||
default='html.parser',
|
||||
help="Specifies the parser that BeautifulSoup should use to parse "
|
||||
"the HTML markup. Examples include 'html5.parser', 'lxml', and "
|
||||
"'html5lib'.")
|
||||
help="Specify the Beautiful Soup parser to be used for interpreting HTML markup. Parsers such "
|
||||
"as html5lib, lxml or even a custom parser as long as it is installed on the execution "
|
||||
"environment.")
|
||||
|
||||
args = parser.parse_args(argv)
|
||||
print(markdownify(**vars(args)))
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
|
||||
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "markdownify"
|
||||
version = "1.2.2"
|
||||
version = "1.1.0"
|
||||
authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
|
||||
description = "Convert HTML to markdown."
|
||||
readme = "README.rst"
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
Test whitelisting/blacklisting of specific tags.
|
||||
|
||||
"""
|
||||
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
|
||||
from markdownify import markdownify, LSTRIP, RSTRIP, STRIP
|
||||
from .utils import md
|
||||
|
||||
|
||||
@@ -34,13 +34,6 @@ def test_strip_document():
|
||||
assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
|
||||
|
||||
|
||||
def test_strip_pre():
|
||||
assert markdownify("<pre> \n \n Hello \n \n </pre>") == "```\n Hello\n```"
|
||||
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=STRIP) == "```\n Hello\n```"
|
||||
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=STRIP_ONE) == "```\n \n Hello \n \n```"
|
||||
assert markdownify("<pre> \n \n Hello \n \n </pre>", strip_pre=None) == "```\n \n \n Hello \n \n \n```"
|
||||
|
||||
|
||||
def bs4_options():
|
||||
assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
|
||||
assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
|
||||
|
||||
@@ -101,9 +101,6 @@ def test_code():
|
||||
assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
|
||||
assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
|
||||
assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
|
||||
assert md('foo<code>`bar`</code>baz') == 'foo`` `bar` ``baz'
|
||||
assert md('foo<code>``bar``</code>baz') == 'foo``` ``bar`` ```baz'
|
||||
assert md('foo<code> `bar` </code>baz') == 'foo `` `bar` `` baz'
|
||||
|
||||
|
||||
def test_dl():
|
||||
@@ -373,4 +370,4 @@ def test_spaces():
|
||||
assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
|
||||
assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
|
||||
assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
|
||||
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'
|
||||
assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo \n```\n\nbar'
|
||||
|
||||
@@ -1,70 +0,0 @@
|
||||
from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
|
||||
from bs4 import BeautifulSoup
|
||||
from typing import Union
|
||||
|
||||
markdownify("<p>Hello</p>") == "Hello" # test default of STRIP
|
||||
markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
|
||||
markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
|
||||
markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
|
||||
markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
|
||||
|
||||
# default options
|
||||
MarkdownConverter(
|
||||
autolinks=True,
|
||||
bs4_options='html.parser',
|
||||
bullets='*+-',
|
||||
code_language='',
|
||||
code_language_callback=None,
|
||||
convert=None,
|
||||
default_title=False,
|
||||
escape_asterisks=True,
|
||||
escape_underscores=True,
|
||||
escape_misc=False,
|
||||
heading_style=UNDERLINED,
|
||||
keep_inline_images_in=[],
|
||||
newline_style=SPACES,
|
||||
strip=None,
|
||||
strip_document=STRIP,
|
||||
strip_pre=STRIP,
|
||||
strong_em_symbol=ASTERISK,
|
||||
sub_symbol='',
|
||||
sup_symbol='',
|
||||
table_infer_header=False,
|
||||
wrap=False,
|
||||
wrap_width=80,
|
||||
).convert("")
|
||||
|
||||
# custom options
|
||||
MarkdownConverter(
|
||||
strip_document=None,
|
||||
bullets="-",
|
||||
escape_asterisks=True,
|
||||
escape_underscores=True,
|
||||
escape_misc=True,
|
||||
autolinks=True,
|
||||
default_title=True,
|
||||
newline_style=BACKSLASH,
|
||||
sup_symbol='^',
|
||||
sub_symbol='^',
|
||||
keep_inline_images_in=['h3'],
|
||||
wrap=True,
|
||||
wrap_width=80,
|
||||
strong_em_symbol=UNDERSCORE,
|
||||
code_language='python',
|
||||
code_language_callback=None
|
||||
).convert("")
|
||||
|
||||
html = '<b>test</b>'
|
||||
soup = BeautifulSoup(html, 'html.parser')
|
||||
MarkdownConverter().convert_soup(soup) == '**test**'
|
||||
|
||||
|
||||
def callback(el: BeautifulSoup) -> Union[str, None]:
|
||||
return el['class'][0] if el.has_attr('class') else None
|
||||
|
||||
|
||||
MarkdownConverter(code_language_callback=callback).convert("")
|
||||
MarkdownConverter(code_language_callback=lambda el: None).convert("")
|
||||
|
||||
markdownify('<pre class="python">test\n foo\nbar</pre>', code_language_callback=callback)
|
||||
markdownify('<pre class="python">test\n foo\nbar</pre>', code_language_callback=lambda el: None)
|
||||
Reference in New Issue
Block a user