from bs4 import BeautifulSoup, NavigableString, Comment, Doctype from textwrap import fill import re import six convert_heading_re = re.compile(r'convert_h(\d+)') line_with_content_re = re.compile(r'^(.*)', flags=re.MULTILINE) whitespace_re = re.compile(r'[\t ]+') all_whitespace_re = re.compile(r'[\t \r\n]+') newline_whitespace_re = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*') html_heading_re = re.compile(r'h[1-6]') # Heading styles ATX = 'atx' ATX_CLOSED = 'atx_closed' UNDERLINED = 'underlined' SETEXT = UNDERLINED # Newline style SPACES = 'spaces' BACKSLASH = 'backslash' # Strong and emphasis style ASTERISK = '*' UNDERSCORE = '_' # Document strip styles LSTRIP = 'lstrip' RSTRIP = 'rstrip' STRIP = 'strip' def chomp(text): """ If the text in an inline tag like b, a, or em contains a leading or trailing space, strip the string and return a space as suffix of prefix, if needed. This function is used to prevent conversions like foo => ** foo** """ prefix = ' ' if text and text[0] == ' ' else '' suffix = ' ' if text and text[-1] == ' ' else '' text = text.strip() return (prefix, suffix, text) def abstract_inline_conversion(markup_fn): """ This abstracts all simple inline tags like b, em, del, ... Returns a function that wraps the chomped text in a pair of the string that is returned by markup_fn, with '/' inserted in the string used after the text if it looks like an HTML tag. markup_fn is necessary to allow for references to self.strong_em_symbol etc. """ def implementation(self, el, text, convert_as_inline): markup_prefix = markup_fn(self) if markup_prefix.startswith('<') and markup_prefix.endswith('>'): markup_suffix = '~=+|])', r'\\\1', text) # A sequence of one or more consecutive '-', preceded and # followed by whitespace or start/end of fragment, might # be confused with an underline of a header, or with a # list marker. text = re.sub(r'(\s|^)(-+(?:\s|$))', r'\1\\\2', text) # A sequence of up to six consecutive '#', preceded and # followed by whitespace or start/end of fragment, might # be confused with an ATX heading. text = re.sub(r'(\s|^)(#{1,6}(?:\s|$))', r'\1\\\2', text) # '.' or ')' preceded by up to nine digits might be # confused with a list item. text = re.sub(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))', r'\1\\\2', text) if self.options['escape_asterisks']: text = text.replace('*', r'\*') if self.options['escape_underscores']: text = text.replace('_', r'\_') return text def underline(self, text, pad_char): text = (text or '').rstrip() return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else '' def convert_a(self, el, text, convert_as_inline): if el.find_parent(['pre', 'code', 'kbd', 'samp']): return text prefix, suffix, text = chomp(text) if not text: return '' href = el.get('href') title = el.get('title') # For the replacement see #29: text nodes underscores are escaped if (self.options['autolinks'] and text.replace(r'\_', '_') == href and not title and not self.options['default_title']): # Shortcut syntax return '<%s>' % href if self.options['default_title'] and not title: title = href title_part = ' "%s"' % title.replace('"', r'\"') if title else '' return '%s[%s](%s%s)%s' % (prefix, text, href, title_part, suffix) if href else text convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol']) def convert_blockquote(self, el, text, convert_as_inline): # handle some early-exit scenarios text = (text or '').strip() if convert_as_inline: return ' ' + text + ' ' if not text: return "\n" # indent lines with blockquote marker def _indent_for_blockquote(match): line_content = match.group(1) return '> ' + line_content if line_content else '>' text = line_with_content_re.sub(_indent_for_blockquote, text) return '\n' + text + '\n\n' def convert_br(self, el, text, convert_as_inline): if convert_as_inline: return "" if self.options['newline_style'].lower() == BACKSLASH: return '\\\n' else: return ' \n' def convert_code(self, el, text, convert_as_inline): if el.parent.name == 'pre': return text converter = abstract_inline_conversion(lambda self: '`') return converter(self, el, text, convert_as_inline) convert_del = abstract_inline_conversion(lambda self: '~~') convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol']) convert_kbd = convert_code def convert_dd(self, el, text, convert_as_inline): text = (text or '').strip() if convert_as_inline: return ' ' + text + ' ' if not text: return '\n' # indent definition content lines by four spaces def _indent_for_dd(match): line_content = match.group(1) return ' ' + line_content if line_content else '' text = line_with_content_re.sub(_indent_for_dd, text) # insert definition marker into first-line indent whitespace text = ':' + text[1:] return '%s\n' % text def convert_dt(self, el, text, convert_as_inline): # remove newlines from term text text = (text or '').strip() text = all_whitespace_re.sub(' ', text) if convert_as_inline: return ' ' + text + ' ' if not text: return '\n' # TODO - format consecutive

elements as directly adjacent lines): # https://michelf.ca/projects/php-markdown/extra/#def-list return '\n%s\n' % text def _convert_hn(self, n, el, text, convert_as_inline): """ Method name prefixed with _ to prevent to call this """ if convert_as_inline: return text # prevent MemoryErrors in case of very large n n = max(1, min(6, n)) style = self.options['heading_style'].lower() text = text.strip() if style == UNDERLINED and n <= 2: line = '=' if n == 1 else '-' return self.underline(text, line) text = all_whitespace_re.sub(' ', text) hashes = '#' * n if style == ATX_CLOSED: return '\n\n%s %s %s\n\n' % (hashes, text, hashes) return '\n\n%s %s\n\n' % (hashes, text) def convert_hr(self, el, text, convert_as_inline): return '\n\n---\n\n' convert_i = convert_em def convert_img(self, el, text, convert_as_inline): alt = el.attrs.get('alt', None) or '' src = el.attrs.get('src', None) or '' title = el.attrs.get('title', None) or '' title_part = ' "%s"' % title.replace('"', r'\"') if title else '' if (convert_as_inline and el.parent.name not in self.options['keep_inline_images_in']): return alt return '![%s](%s%s)' % (alt, src, title_part) def convert_list(self, el, text, convert_as_inline): # Converting a list to inline is undefined. # Ignoring convert_to_inline for list. nested = False before_paragraph = False if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']: before_paragraph = True while el: if el.name == 'li': nested = True break el = el.parent if nested: # remove trailing newline if nested return '\n' + text.rstrip() return '\n\n' + text + ('\n' if before_paragraph else '') convert_ul = convert_list convert_ol = convert_list def convert_li(self, el, text, convert_as_inline): # handle some early-exit scenarios text = (text or '').strip() if not text: return "\n" # determine list item bullet character to use parent = el.parent if parent is not None and parent.name == 'ol': if parent.get("start") and str(parent.get("start")).isnumeric(): start = int(parent.get("start")) else: start = 1 bullet = '%s.' % (start + parent.index(el)) else: depth = -1 while el: if el.name == 'ul': depth += 1 el = el.parent bullets = self.options['bullets'] bullet = bullets[depth % len(bullets)] bullet = bullet + ' ' bullet_width = len(bullet) bullet_indent = ' ' * bullet_width # indent content lines by bullet width def _indent_for_li(match): line_content = match.group(1) return bullet_indent + line_content if line_content else '' text = line_with_content_re.sub(_indent_for_li, text) # insert bullet into first-line indent whitespace text = bullet + text[bullet_width:] return '%s\n' % text def convert_p(self, el, text, convert_as_inline): if convert_as_inline: return ' ' + text.strip() + ' ' text = text.strip() if self.options['wrap']: # Preserve newlines (and preceding whitespace) resulting # from
tags. Newlines in the input have already been # replaced by spaces. if self.options['wrap_width'] is not None: lines = text.split('\n') new_lines = [] for line in lines: line = line.lstrip() line_no_trailing = line.rstrip() trailing = line[len(line_no_trailing):] line = fill(line, width=self.options['wrap_width'], break_long_words=False, break_on_hyphens=False) new_lines.append(line + trailing) text = '\n'.join(new_lines) return '\n\n%s\n\n' % text if text else '' def convert_pre(self, el, text, convert_as_inline): if not text: return '' code_language = self.options['code_language'] if self.options['code_language_callback']: code_language = self.options['code_language_callback'](el) or code_language return '\n\n```%s\n%s\n```\n\n' % (code_language, text) def convert_script(self, el, text, convert_as_inline): return '' def convert_style(self, el, text, convert_as_inline): return '' convert_s = convert_del convert_strong = convert_b convert_samp = convert_code convert_sub = abstract_inline_conversion(lambda self: self.options['sub_symbol']) convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol']) def convert_table(self, el, text, convert_as_inline): return '\n\n' + text.strip() + '\n\n' def convert_caption(self, el, text, convert_as_inline): return text.strip() + '\n\n' def convert_figcaption(self, el, text, convert_as_inline): return '\n\n' + text.strip() + '\n\n' def convert_td(self, el, text, convert_as_inline): colspan = 1 if 'colspan' in el.attrs and el['colspan'].isdigit(): colspan = int(el['colspan']) return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_th(self, el, text, convert_as_inline): colspan = 1 if 'colspan' in el.attrs and el['colspan'].isdigit(): colspan = int(el['colspan']) return ' ' + text.strip().replace("\n", " ") + ' |' * colspan def convert_tr(self, el, text, convert_as_inline): cells = el.find_all(['td', 'th']) is_headrow = ( all([cell.name == 'th' for cell in cells]) or (el.parent.name == 'thead' # avoid multiple tr in thead and len(el.parent.find_all('tr')) == 1) ) is_head_row_missing = ( (not el.previous_sibling and not el.parent.name == 'tbody') or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1) ) overline = '' underline = '' if ((is_headrow or (is_head_row_missing and self.options['table_infer_header'])) and not el.previous_sibling): # first row and: # - is headline or # - headline is missing and header inference is enabled # print headline underline full_colspan = 0 for cell in cells: if 'colspan' in cell.attrs and cell['colspan'].isdigit(): full_colspan += int(cell["colspan"]) else: full_colspan += 1 underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n' elif ((is_head_row_missing and not self.options['table_infer_header']) or (not el.previous_sibling and (el.parent.name == 'table' or (el.parent.name == 'tbody' and not el.parent.previous_sibling)))): # headline is missing and header inference is disabled or: # first row, not headline, and: # - the parent is table or # - the parent is tbody at the beginning of a table. # print empty headline above this row overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n' overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n' return overline + '|' + text + '\n' + underline def markdownify(html, **options): return MarkdownConverter(**options).convert(html)