from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
from textwrap import fill
import re
import six


convert_heading_re = re.compile(r'convert_h(\d+)')
line_with_content_re = re.compile(r'^(.*)', flags=re.MULTILINE)
whitespace_re = re.compile(r'[\t ]+')
all_whitespace_re = re.compile(r'[\t \r\n]+')
newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
html_heading_re = re.compile(r'h[1-6]')

# extract (leading_nl, content, trailing_nl) from a string
# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
extract_newlines_re = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)


# Heading styles
ATX = 'atx'
ATX_CLOSED = 'atx_closed'
UNDERLINED = 'underlined'
SETEXT = UNDERLINED

# Newline style
SPACES = 'spaces'
BACKSLASH = 'backslash'

# Strong and emphasis style
ASTERISK = '*'
UNDERSCORE = '_'

# Document strip styles
LSTRIP = 'lstrip'
RSTRIP = 'rstrip'
STRIP = 'strip'


def chomp(text):
    """
    If the text in an inline tag like b, a, or em contains a leading or trailing
    space, strip the string and return a space as prefix or suffix, if needed.
    This function is used to prevent conversions like
        <b> foo</b> => ** foo**

    Returns a ``(prefix, suffix, text)`` tuple where ``prefix``/``suffix`` are
    either ``' '`` or ``''`` and ``text`` is the stripped input.
    """
    prefix = ' ' if text and text[0] == ' ' else ''
    suffix = ' ' if text and text[-1] == ' ' else ''
    text = text.strip()
    return (prefix, suffix, text)


def abstract_inline_conversion(markup_fn):
    """
    This abstracts all simple inline tags like b, em, del, ...
    Returns a function that wraps the chomped text in a pair of the string
    that is returned by markup_fn, with '/' inserted in the string used after
    the text if it looks like an HTML tag. markup_fn is necessary to allow for
    references to self.strong_em_symbol etc.
    """
    def implementation(self, el, text, parent_tags):
        markup_prefix = markup_fn(self)
        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
            # HTML-style markup: '<tag>' must be closed by '</tag>'.
            markup_suffix = '</' + markup_prefix[1:]
        else:
            # Symmetric Markdown markup: the same string opens and closes.
            markup_suffix = markup_prefix
        if '_noformat' in parent_tags:
            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
        return '%s%s%s%s%s' % (prefix, markup_prefix, text, markup_suffix, suffix)
    return implementation


def _todict(obj):
    """Return a dict of every attribute of obj whose name has no leading underscore."""
    return {k: getattr(obj, k) for k in dir(obj) if not k.startswith('_')}


def should_remove_whitespace_inside(el):
    """Return whether to remove whitespace immediately inside a block-level element."""
    if not el or not el.name:
        return False
    if html_heading_re.match(el.name) is not None:
        return True
    return el.name in ('p', 'blockquote',
                       'article', 'div', 'section',
                       'ol', 'ul', 'li',
                       'table', 'thead', 'tbody', 'tfoot',
                       'tr', 'td', 'th')


def should_remove_whitespace_outside(el):
    """Return whether to remove whitespace immediately outside a block-level element."""
    return should_remove_whitespace_inside(el) or (el and el.name == 'pre')


def _is_block_content_element(el):
    """
    In a block context, returns:

    - True for content elements (tags and non-whitespace text)
    - False for non-content elements (whitespace text, comments, doctypes)
    """
    if isinstance(el, Tag):
        return True
    elif isinstance(el, (Comment, Doctype)):
        return False  # (subclasses of NavigableString, must test first)
    elif isinstance(el, NavigableString):
        return el.strip() != ''
    else:
        return False


def _prev_block_content_sibling(el):
    """Returns the first previous sibling that is a content element, else None."""
    while el is not None:
        el = el.previous_sibling
        if _is_block_content_element(el):
            return el
    return None


def _next_block_content_sibling(el):
    """Returns the first next sibling that is a content element, else None."""
    while el is not None:
        el = el.next_sibling
        if _is_block_content_element(el):
            return el
    return None


class MarkdownConverter(object):
    """Convert parsed HTML into Markdown text by dispatching to ``convert_<tag>`` methods."""

    class DefaultOptions:
        autolinks = True
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
        convert = None
        default_title = False
        escape_asterisks = True
        escape_underscores = True
        escape_misc = False
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
        strip = None
        strip_document = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
        table_infer_header = False
        wrap = False
        wrap_width = 80

    class Options(DefaultOptions):
        pass

    def __init__(self, **options):
        """
        Build ``self.options`` from DefaultOptions, then Options, then keyword arguments.

        Raises ValueError if both the 'strip' and 'convert' options are set.
        """
        # Create an options dictionary. Use DefaultOptions as a base so that
        # it doesn't have to be extended.
        self.options = _todict(self.DefaultOptions)
        self.options.update(_todict(self.Options))
        self.options.update(options)
        if self.options['strip'] is not None and self.options['convert'] is not None:
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

    def convert(self, html):
        """Parse html with BeautifulSoup's 'html.parser' and return the converted text."""
        soup = BeautifulSoup(html, 'html.parser')
        return self.convert_soup(soup)

    def convert_soup(self, soup):
        """Convert an already-parsed soup, starting with an empty parent-tag context."""
        return self.process_tag(soup, parent_tags=set())

    def process_element(self, node, parent_tags=None):
        """Dispatch node to process_text (NavigableString) or process_tag (anything else)."""
        if isinstance(node, NavigableString):
            return self.process_text(node, parent_tags=parent_tags)
        else:
            return self.process_tag(node, parent_tags=parent_tags)

    def process_tag(self, node, parent_tags=None):
        """
        Convert node's children, join their results, then apply node's own
        ``convert_<name>`` function if one exists and the tag is enabled.

        parent_tags is the set of ancestor tag names (plus the '_inline' and
        '_noformat' pseudo-tags); it is copied, never mutated. Returns the
        converted text for this subtree.
        """
        # For the top-level element, initialize the parent context with an empty set.
        if parent_tags is None:
            parent_tags = set()

        # Collect child elements to process, ignoring whitespace-only text elements
        # adjacent to the inner/outer boundaries of block elements.
        should_remove_inside = should_remove_whitespace_inside(node)

        def _can_ignore(el):
            if isinstance(el, Tag):
                # Tags are always processed.
                return False
            elif isinstance(el, (Comment, Doctype)):
                # Comment and Doctype elements are always ignored.
                # (subclasses of NavigableString, must test first)
                return True
            elif isinstance(el, NavigableString):
                if six.text_type(el).strip() != '':
                    # Non-whitespace text nodes are always processed.
                    return False
                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
                    # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
                    return True
                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
                    return True
                else:
                    return False
            elif el is None:
                return True
            else:
                raise ValueError('Unexpected element type: %s' % type(el))

        children_to_convert = [el for el in node.children if not _can_ignore(el)]

        # Create a copy of this tag's parent context, then update it to include this tag
        # to propagate down into the children.
        parent_tags_for_children = set(parent_tags)
        parent_tags_for_children.add(node.name)

        # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
        if (
            html_heading_re.match(node.name) is not None  # headings
            or node.name in {'td', 'th'}  # table cells
        ):
            parent_tags_for_children.add('_inline')

        # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
        if node.name in {'pre', 'code', 'kbd', 'samp'}:
            parent_tags_for_children.add('_noformat')

        # Convert the children elements into a list of result strings.
        child_strings = [
            self.process_element(el, parent_tags=parent_tags_for_children)
            for el in children_to_convert
        ]

        # Remove empty string values.
        child_strings = [s for s in child_strings if s]

        # Collapse newlines at child element boundaries, if needed.
        if node.name == 'pre' or node.find_parent('pre'):
            # Inside <pre> blocks, do not collapse newlines.
            pass
        else:
            # Collapse newlines at child element boundaries.
            updated_child_strings = ['']  # so the first lookback works
            for child_string in child_strings:
                # Separate the leading/trailing newlines from the content.
                leading_nl, content, trailing_nl = extract_newlines_re.match(child_string).groups()

                # If the last child had trailing newlines and this child has leading newlines,
                # use the larger newline count, limited to 2.
                if updated_child_strings[-1] and leading_nl:
                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
                    leading_nl = '\n' * num_newlines

                # Add the results to the updated child string list.
                updated_child_strings.extend([leading_nl, content, trailing_nl])

            child_strings = updated_child_strings

        # Join all child text strings into a single string.
        text = ''.join(child_strings)

        # apply this tag's final conversion function
        convert_fn_name = "convert_%s" % re.sub(r"[\[\]:-]", "_", node.name)
        convert_fn = getattr(self, convert_fn_name, None)
        if convert_fn and self.should_convert_tag(node.name):
            text = convert_fn(node, text, parent_tags=parent_tags)

        return text
def convert__document_(self, el, text, parent_tags):
    """Final document-level formatting for BeautifulSoup object (node.name == "[document]").

    Trims separation newlines from the start and/or end of the converted
    text according to the 'strip_document' option. Only '\\n' characters are
    removed. Raises ValueError for an unrecognized option value.
    """
    mode = self.options['strip_document']

    if mode is None:
        # leave leading and trailing separation newlines as-is
        return text
    if mode == LSTRIP:
        # remove leading separation newlines
        return text.lstrip('\n')
    if mode == RSTRIP:
        # remove trailing separation newlines
        return text.rstrip('\n')
    if mode == STRIP:
        # remove leading and trailing separation newlines
        return text.strip('\n')

    raise ValueError('Invalid value for strip_document: %s' % mode)
def process_text(self, el, parent_tags=None):
    """Convert a single text node into Markdown-safe text.

    Whitespace is normalized unless 'pre' is among parent_tags, special
    characters are escaped unless '_noformat' is among parent_tags, and
    whitespace touching a block-level boundary is trimmed.
    """
    # For the top-level element, start from an empty parent context.
    parent_tags = set() if parent_tags is None else parent_tags

    text = six.text_type(el) or ''

    # Whitespace normalization is skipped inside preformatted elements.
    if 'pre' not in parent_tags:
        if self.options['wrap']:
            # Every whitespace run, newlines included, becomes one space.
            text = all_whitespace_re.sub(' ', text)
        else:
            # Runs containing a line break become one newline; the remaining
            # space/tab runs become one space.
            text = whitespace_re.sub(' ', newline_whitespace_re.sub('\n', text))

    # Escaping is skipped inside preformatted or code elements.
    if '_noformat' not in parent_tags:
        text = self.escape(text, parent_tags)

    before = el.previous_sibling
    after = el.next_sibling
    parent_is_block = should_remove_whitespace_inside(el.parent)

    # Drop leading whitespace at the start of a block-level parent or right
    # after a block-level sibling.
    # NOTE(review): the leading trim strips only ' \t\r\n' while the trailing
    # trim below uses the default rstrip() whitespace set -- confirm that this
    # asymmetry is intended.
    if should_remove_whitespace_outside(before) or (parent_is_block and not before):
        text = text.lstrip(' \t\r\n')

    # Drop trailing whitespace at the end of a block-level parent or right
    # before a block-level sibling.
    if should_remove_whitespace_outside(after) or (parent_is_block and not after):
        text = text.rstrip()

    return text
def __getattr__(self, attr):
    """Synthesize ``convert_h<N>`` heading converters on first access.

    Any other missing attribute raises AttributeError. The generated
    function delegates to ``self._convert_hn`` and is stored on the instance
    under ``convert_h<N>``.
    """
    heading_match = convert_heading_re.match(attr)
    if heading_match is None:
        raise AttributeError(attr)

    # Handle headings
    level = int(heading_match.group(1))

    def convert_tag(el, text, parent_tags):
        return self._convert_hn(level, el, text, parent_tags)

    method_name = 'convert_h%s' % level
    convert_tag.__name__ = method_name
    setattr(self, method_name, convert_tag)
    return convert_tag
def should_convert_tag(self, tag):
    """Return whether the (case-insensitive) tag name should be converted.

    If the 'strip' option is set, the tag is converted unless it is listed
    there. Otherwise, if 'convert' is set, only listed tags are converted.
    With neither option set, every tag is converted.
    """
    name = tag.lower()
    excluded = self.options['strip']
    included = self.options['convert']

    if excluded is not None:
        return name not in excluded
    if included is not None:
        return name in included
    return True
def escape(self, text, parent_tags):
    """Backslash-escape Markdown-significant characters in text.

    Which escapes are applied depends on the 'escape_misc',
    'escape_asterisks' and 'escape_underscores' options. Empty or None text
    yields ''. parent_tags is accepted but not consulted here.
    """
    if not text:
        return ''

    opts = self.options

    if opts['escape_misc']:
        misc_rules = (
            # Individual punctuation characters that get a backslash prefix.
            (r'([]\\&<`[>~=+|])', r'\\\1'),
            # A sequence of one or more consecutive '-', preceded and
            # followed by whitespace or start/end of fragment, might be
            # confused with an underline of a header, or with a list marker.
            (r'(\s|^)(-+(?:\s|$))', r'\1\\\2'),
            # A sequence of up to six consecutive '#', preceded and followed
            # by whitespace or start/end of fragment, might be confused with
            # an ATX heading.
            (r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2'),
            # '.' or ')' preceded by up to nine digits might be confused
            # with a list item.
            (r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2'),
        )
        for pattern, replacement in misc_rules:
            text = re.sub(pattern, replacement, text)

    if opts['escape_asterisks']:
        text = text.replace('*', r'\*')

    if opts['escape_underscores']:
        text = text.replace('_', r'\_')

    return text
def underline(self, text, pad_char):
    """Return text followed by a line of pad_char of the same length.

    Trailing whitespace is removed from text first. The result is wrapped in
    blank lines on both sides; empty or None text yields ''.
    """
    heading = (text or '').rstrip()
    if not heading:
        return ''
    rule = pad_char * len(heading)
    return '\n\n%s\n%s\n\n' % (heading, rule)
def convert_a(self, el, text, parent_tags):
    """Convert an <a> element to a Markdown link.

    Inside '_noformat' contexts the text is returned untouched. Otherwise
    the text is chomped and the single leading/trailing space that chomp
    split off is re-attached around whatever is emitted, so adjacent words
    never run together:

    - no href: the plain text
    - 'autolinks' enabled, text equal to href, no title and no
      'default_title': the ``<url>`` shortcut
    - otherwise: ``[text](href "title")``, where the title part is optional
      and falls back to href when 'default_title' is set

    Returns '' when the link text is empty after chomping.
    """
    if '_noformat' in parent_tags:
        return text
    prefix, suffix, text = chomp(text)
    if not text:
        return ''
    href = el.get('href')
    title = el.get('title')

    if not href:
        # Nothing to link to: keep the text and the spaces chomp removed.
        return '%s%s%s' % (prefix, text, suffix)

    # For the replacement see #29: text nodes underscores are escaped
    if (self.options['autolinks']
            and text.replace(r'\_', '_') == href
            and not title
            and not self.options['default_title']):
        # Shortcut syntax
        return '%s<%s>%s' % (prefix, href, suffix)

    if self.options['default_title'] and not title:
        title = href
    title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
    return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix)
# <b>: wrap the text in the 'strong_em_symbol' option repeated twice (e.g. '**').
convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])
def convert_blockquote(self, el, text, parent_tags):
    """Convert a <blockquote> by prefixing every line with a '>' marker.

    In an '_inline' context the stripped text is returned padded with one
    space on each side. An empty quote yields a single newline.
    """
    body = (text or '').strip(' \t\r\n')

    # handle some early-exit scenarios
    if '_inline' in parent_tags:
        return ' ' + body + ' '
    if not body:
        return "\n"

    def _quote_line(match):
        # Lines with content get '> '; empty lines get a bare '>'.
        content = match.group(1)
        if content:
            return '> ' + content
        return '>'

    quoted = line_with_content_re.sub(_quote_line, body)
    return '\n' + quoted + '\n\n'
def convert_br(self, el, text, parent_tags):
    """Convert a <br> into a Markdown hard line break.

    Returns '' in an '_inline' context. Otherwise the 'newline_style' option
    (compared case-insensitively) selects between a backslash followed by a
    newline and two trailing spaces followed by a newline.
    """
    if '_inline' in parent_tags:
        return ""

    if self.options['newline_style'].lower() == BACKSLASH:
        return '\\\n'
    else:
        # A hard line break needs at least two trailing spaces; a single
        # space before the newline is rendered as an ordinary soft break.
        return '  \n'
def convert_code(self, el, text, parent_tags):
    """Convert inline <code> by wrapping the text in backticks.

    When 'pre' is among parent_tags the text is returned unchanged.
    """
    if 'pre' in parent_tags:
        return text

    def _backtick(converter):
        return '`'

    wrap_in_backticks = abstract_inline_conversion(_backtick)
    return wrap_in_backticks(self, el, text, parent_tags)
# <del>: wrap the text in '~~' on both sides.
convert_del = abstract_inline_conversion(lambda self: '~~')
def convert_div(self, el, text, parent_tags):
    """Convert a <div> into a paragraph-like block.

    In an '_inline' context the stripped text is returned padded with one
    space on each side. Otherwise the stripped text is surrounded by blank
    lines, or '' is returned when nothing remains.
    """
    body = text.strip()
    if '_inline' in parent_tags:
        return ' ' + body + ' '
    if not body:
        return ''
    return '\n\n%s\n\n' % body
# <article> and <section> reuse the <div> conversion.
convert_article = convert_div
convert_section = convert_div
# <em>: wrap the text in a single 'strong_em_symbol' on both sides.
convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])
# <kbd> reuses the <code> conversion.
convert_kbd = convert_code
def convert_dd(self, el, text, parent_tags):
    """Convert a <dd> into a definition-list definition.

    In an '_inline' context the stripped text is returned padded with one
    space on each side. An empty definition yields a single newline.
    Otherwise every non-empty line is indented by four spaces and the first
    indent character is replaced with ':', giving ':   first line' followed
    by continuation lines aligned under it.
    """
    text = (text or '').strip()
    if '_inline' in parent_tags:
        return ' ' + text + ' '
    if not text:
        return '\n'

    # indent definition content lines by four spaces
    def _indent_for_dd(match):
        line_content = match.group(1)
        return '    ' + line_content if line_content else ''

    text = line_with_content_re.sub(_indent_for_dd, text)

    # insert definition marker into first-line indent whitespace
    # (the four-space indent leaves three spaces between ':' and the text)
    text = ':' + text[1:]

    return '%s\n' % text
def convert_dt(self, el, text, parent_tags):
# remove newlines from term text
text = (text or '').strip()
text = all_whitespace_re.sub(' ', text)
if '_inline' in parent_tags:
return ' ' + text + ' '
if not text:
return '\n'
# TODO - format consecutive