propagate parent tag context downward to improve runtime (#191)

This commit is contained in:
Chris Papademetrious
2025-02-18 16:35:36 -05:00
committed by GitHub
parent c52ba47166
commit 5655f27208
3 changed files with 84 additions and 73 deletions

View File

@@ -180,7 +180,7 @@ If you have a special use case that calls for a special conversion, you can
always inherit from ``MarkdownConverter`` and override the method you want to
change.
The function that handles an HTML tag named ``abc`` is called
``convert_abc(self, el, text, convert_as_inline)`` and returns a string
``convert_abc(self, el, text, parent_tags)`` and returns a string
containing the converted HTML tag.
The ``MarkdownConverter`` object will handle the conversion based on the
function names:
@@ -193,8 +193,8 @@ function names:
"""
Create a custom MarkdownConverter that adds two newlines after an image
"""
def convert_img(self, el, text, convert_as_inline):
return super().convert_img(el, text, convert_as_inline) + '\n\n'
def convert_img(self, el, text, parent_tags):
return super().convert_img(el, text, parent_tags) + '\n\n'
# Create shorthand method for conversion
def md(html, **options):
@@ -208,7 +208,7 @@ function names:
"""
Create a custom MarkdownConverter that ignores paragraphs
"""
def convert_p(self, el, text, convert_as_inline):
def convert_p(self, el, text, parent_tags):
return ''
# Create shorthand method for conversion

View File

@@ -57,13 +57,13 @@ def abstract_inline_conversion(markup_fn):
the text if it looks like an HTML tag. markup_fn is necessary to allow for
references to self.strong_em_symbol etc.
"""
def implementation(self, el, text, convert_as_inline):
def implementation(self, el, text, parent_tags):
markup_prefix = markup_fn(self)
if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
markup_suffix = '</' + markup_prefix[1:]
else:
markup_suffix = markup_prefix
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
if '_noformat' in parent_tags:
return text
prefix, suffix, text = chomp(text)
if not text:
@@ -170,24 +170,18 @@ class MarkdownConverter(object):
return self.convert_soup(soup)
def convert_soup(self, soup):
return self.process_tag(soup, convert_as_inline=False)
return self.process_tag(soup, parent_tags=set())
def process_element(self, node, convert_as_inline):
def process_element(self, node, parent_tags=None):
if isinstance(node, NavigableString):
return self.process_text(node)
return self.process_text(node, parent_tags=parent_tags)
else:
return self.process_tag(node, convert_as_inline)
return self.process_tag(node, parent_tags=parent_tags)
def process_tag(self, node, convert_as_inline):
text = ''
# For Markdown headings and table cells, convert children as inline
# (so that block element children do not produce newlines).
convert_children_as_inline = (
convert_as_inline # propagated from parent
or html_heading_re.match(node.name) is not None # headings
or node.name in ['td', 'th'] # table cells
)
def process_tag(self, node, parent_tags=None):
# For the top-level element, initialize the parent context with an empty set.
if parent_tags is None:
parent_tags = set()
# Collect child elements to process, ignoring whitespace-only text elements
# adjacent to the inner/outer boundaries of block elements.
@@ -220,8 +214,27 @@ class MarkdownConverter(object):
children_to_convert = [el for el in node.children if not _can_ignore(el)]
# Create a copy of this tag's parent context, then update it to include this tag
# to propagate down into the children.
parent_tags_for_children = set(parent_tags)
parent_tags_for_children.add(node.name)
# if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
if (
html_heading_re.match(node.name) is not None # headings
or node.name in {'td', 'th'} # table cells
):
parent_tags_for_children.add('_inline')
# if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
if node.name in {'pre', 'code', 'kbd', 'samp'}:
parent_tags_for_children.add('_noformat')
# Convert the children elements into a list of result strings.
child_strings = [self.process_element(el, convert_children_as_inline) for el in children_to_convert]
child_strings = [
self.process_element(el, parent_tags=parent_tags_for_children)
for el in children_to_convert
]
# Remove empty string values.
child_strings = [s for s in child_strings if s]
@@ -256,11 +269,11 @@ class MarkdownConverter(object):
convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
convert_fn = getattr(self, convert_fn_name, None)
if convert_fn and self.should_convert_tag(node.name):
text = convert_fn(node, text, convert_as_inline)
text = convert_fn(node, text, parent_tags=parent_tags)
return text
def convert__document_(self, el, text, convert_as_inline):
def convert__document_(self, el, text, parent_tags):
"""Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
if self.options['strip_document'] == LSTRIP:
text = text.lstrip('\n') # remove leading separation newlines
@@ -275,11 +288,15 @@ class MarkdownConverter(object):
return text
def process_text(self, el):
def process_text(self, el, parent_tags=None):
# For the top-level element, initialize the parent context with an empty set.
if parent_tags is None:
parent_tags = set()
text = six.text_type(el) or ''
# normalize whitespace if we're not inside a preformatted element
if not el.find_parent('pre'):
if 'pre' not in parent_tags:
if self.options['wrap']:
text = all_whitespace_re.sub(' ', text)
else:
@@ -287,7 +304,7 @@ class MarkdownConverter(object):
text = whitespace_re.sub(' ', text)
# escape special characters if we're not inside a preformatted or code element
if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
if '_noformat' not in parent_tags:
text = self.escape(text)
# remove leading whitespace at the start or just after a
@@ -310,8 +327,8 @@ class MarkdownConverter(object):
if m:
n = int(m.group(1))
def convert_tag(el, text, convert_as_inline):
return self._convert_hn(n, el, text, convert_as_inline)
def convert_tag(el, text, parent_tags):
return self._convert_hn(n, el, text, parent_tags)
convert_tag.__name__ = 'convert_h%s' % n
setattr(self, convert_tag.__name__, convert_tag)
@@ -358,8 +375,8 @@ class MarkdownConverter(object):
text = (text or '').rstrip()
return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
def convert_a(self, el, text, convert_as_inline):
if el.find_parent(['pre', 'code', 'kbd', 'samp']):
def convert_a(self, el, text, parent_tags):
if '_noformat' in parent_tags:
return text
prefix, suffix, text = chomp(text)
if not text:
@@ -380,10 +397,10 @@ class MarkdownConverter(object):
convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
def convert_blockquote(self, el, text, convert_as_inline):
def convert_blockquote(self, el, text, parent_tags):
# handle some early-exit scenarios
text = (text or '').strip()
if convert_as_inline:
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return "\n"
@@ -396,8 +413,8 @@ class MarkdownConverter(object):
return '\n' + text + '\n\n'
def convert_br(self, el, text, convert_as_inline):
if convert_as_inline:
def convert_br(self, el, text, parent_tags):
if '_inline' in parent_tags:
return ""
if self.options['newline_style'].lower() == BACKSLASH:
@@ -405,16 +422,16 @@ class MarkdownConverter(object):
else:
return ' \n'
def convert_code(self, el, text, convert_as_inline):
if el.parent.name == 'pre':
def convert_code(self, el, text, parent_tags):
if 'pre' in parent_tags:
return text
converter = abstract_inline_conversion(lambda self: '`')
return converter(self, el, text, convert_as_inline)
return converter(self, el, text, parent_tags)
convert_del = abstract_inline_conversion(lambda self: '~~')
def convert_div(self, el, text, convert_as_inline):
if convert_as_inline:
def convert_div(self, el, text, parent_tags):
if '_inline' in parent_tags:
return ' ' + text.strip() + ' '
text = text.strip()
return '\n\n%s\n\n' % text if text else ''
@@ -427,9 +444,9 @@ class MarkdownConverter(object):
convert_kbd = convert_code
def convert_dd(self, el, text, convert_as_inline):
def convert_dd(self, el, text, parent_tags):
text = (text or '').strip()
if convert_as_inline:
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return '\n'
@@ -445,11 +462,11 @@ class MarkdownConverter(object):
return '%s\n' % text
def convert_dt(self, el, text, convert_as_inline):
def convert_dt(self, el, text, parent_tags):
# remove newlines from term text
text = (text or '').strip()
text = all_whitespace_re.sub(' ', text)
if convert_as_inline:
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return '\n'
@@ -459,9 +476,9 @@ class MarkdownConverter(object):
return '\n%s\n' % text
def _convert_hn(self, n, el, text, convert_as_inline):
def _convert_hn(self, n, el, text, parent_tags):
""" Method name prefixed with _ to prevent <hn> to call this """
if convert_as_inline:
if '_inline' in parent_tags:
return text
# prevent MemoryErrors in case of very large n
@@ -478,46 +495,40 @@ class MarkdownConverter(object):
return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
return '\n\n%s %s\n\n' % (hashes, text)
def convert_hr(self, el, text, convert_as_inline):
def convert_hr(self, el, text, parent_tags):
return '\n\n---\n\n'
convert_i = convert_em
def convert_img(self, el, text, convert_as_inline):
def convert_img(self, el, text, parent_tags):
alt = el.attrs.get('alt', None) or ''
src = el.attrs.get('src', None) or ''
title = el.attrs.get('title', None) or ''
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
if (convert_as_inline
if ('_inline' in parent_tags
and el.parent.name not in self.options['keep_inline_images_in']):
return alt
return '![%s](%s%s)' % (alt, src, title_part)
def convert_list(self, el, text, convert_as_inline):
def convert_list(self, el, text, parent_tags):
# Converting a list to inline is undefined.
# Ignoring convert_to_inline for list.
# Ignoring inline conversion parents for list.
nested = False
before_paragraph = False
next_sibling = _next_block_content_sibling(el)
if next_sibling and next_sibling.name not in ['ul', 'ol']:
before_paragraph = True
while el:
if el.name == 'li':
nested = True
break
el = el.parent
if nested:
# remove trailing newline if nested
if 'li' in parent_tags:
# remove trailing newline if we're in a nested list
return '\n' + text.rstrip()
return '\n\n' + text + ('\n' if before_paragraph else '')
convert_ul = convert_list
convert_ol = convert_list
def convert_li(self, el, text, convert_as_inline):
def convert_li(self, el, text, parent_tags):
# handle some early-exit scenarios
text = (text or '').strip()
if not text:
@@ -554,8 +565,8 @@ class MarkdownConverter(object):
return '%s\n' % text
def convert_p(self, el, text, convert_as_inline):
if convert_as_inline:
def convert_p(self, el, text, parent_tags):
if '_inline' in parent_tags:
return ' ' + text.strip() + ' '
text = text.strip()
if self.options['wrap']:
@@ -577,7 +588,7 @@ class MarkdownConverter(object):
text = '\n'.join(new_lines)
return '\n\n%s\n\n' % text if text else ''
def convert_pre(self, el, text, convert_as_inline):
def convert_pre(self, el, text, parent_tags):
if not text:
return ''
code_language = self.options['code_language']
@@ -587,10 +598,10 @@ class MarkdownConverter(object):
return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
def convert_script(self, el, text, convert_as_inline):
def convert_script(self, el, text, parent_tags):
return ''
def convert_style(self, el, text, convert_as_inline):
def convert_style(self, el, text, parent_tags):
return ''
convert_s = convert_del
@@ -603,28 +614,28 @@ class MarkdownConverter(object):
convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])
def convert_table(self, el, text, convert_as_inline):
def convert_table(self, el, text, parent_tags):
return '\n\n' + text.strip() + '\n\n'
def convert_caption(self, el, text, convert_as_inline):
def convert_caption(self, el, text, parent_tags):
return text.strip() + '\n\n'
def convert_figcaption(self, el, text, convert_as_inline):
def convert_figcaption(self, el, text, parent_tags):
return '\n\n' + text.strip() + '\n\n'
def convert_td(self, el, text, convert_as_inline):
def convert_td(self, el, text, parent_tags):
colspan = 1
if 'colspan' in el.attrs and el['colspan'].isdigit():
colspan = int(el['colspan'])
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
def convert_th(self, el, text, convert_as_inline):
def convert_th(self, el, text, parent_tags):
colspan = 1
if 'colspan' in el.attrs and el['colspan'].isdigit():
colspan = int(el['colspan'])
return ' ' + text.strip().replace("\n", " ") + ' |' * colspan
def convert_tr(self, el, text, convert_as_inline):
def convert_tr(self, el, text, parent_tags):
cells = el.find_all(['td', 'th'])
is_first_row = el.find_previous_sibling() is None
is_headrow = (

View File

@@ -6,11 +6,11 @@ class UnitTestConverter(MarkdownConverter):
"""
Create a custom MarkdownConverter for unit tests
"""
def convert_img(self, el, text, convert_as_inline):
def convert_img(self, el, text, parent_tags):
"""Add two newlines after an image"""
return super().convert_img(el, text, convert_as_inline) + '\n\n'
return super().convert_img(el, text, parent_tags) + '\n\n'
def convert_custom_tag(self, el, text, convert_as_inline):
def convert_custom_tag(self, el, text, parent_tags):
"""Ensure conversion function is found for tags with special characters in name"""
return "FUNCTION USED: %s" % text