Files
python-markdownify/markdownify/__init__.py
2013-07-31 19:23:39 -04:00

140 lines
4.5 KiB
Python

from bs4 import BeautifulSoup, NavigableString
import re
convert_heading_re = re.compile(r'convert_h(\d+)')
line_beginning_re = re.compile(r'^', re.MULTILINE)
whitespace_re = re.compile(r'[\r\n\s\t ]+')
FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__'
wrapped = '<div id="%s">%%s</div>' % FRAGMENT_ID
def escape(text):
if not text:
return ''
return text.replace('_', r'\_')
class MarkdownConverter(object):
def __init__(self, tags_to_strip=None, tags_to_convert=None):
if tags_to_strip is not None and tags_to_convert is not None:
raise ValueError('You may specify either tags to strip or tags to'
' convert, but not both.')
self.tags_to_strip = tags_to_strip
self.tags_to_convert = tags_to_convert
def convert(self, html):
# We want to take advantage of the html5 parsing, but we don't actually
# want a full document. Therefore, we'll mark our fragment with an id,
# create the document, and extract the element with the id.
html = wrapped % html
soup = BeautifulSoup(html)
return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True)
def process_tag(self, node, children_only=False):
text = ''
# Convert the children first
for el in node.children:
if isinstance(el, NavigableString):
text += self.process_text(unicode(el))
else:
text += self.process_tag(el)
if not children_only:
convert_fn = getattr(self, 'convert_%s' % node.name, None)
if convert_fn and self.should_convert_tag(node.name):
text = convert_fn(node, text)
return text
def process_text(self, text):
return escape(whitespace_re.sub(' ', text or ''))
def __getattr__(self, attr):
# Handle heading levels > 2
m = convert_heading_re.match(attr)
if m:
n = int(m.group(1))
def convert_tag(el, text):
return self.convert_hn(n, el, text)
convert_tag.__name__ = 'convert_h%s' % n
setattr(self, convert_tag.__name__, convert_tag)
return convert_tag
raise AttributeError(attr)
def should_convert_tag(self, tag):
tag = tag.lower()
if self.tags_to_strip is not None:
return tag not in self.tags_to_strip
elif self.tags_to_convert is not None:
return tag in self.tags_to_convert
else:
return True
def underline(self, text, pad_char):
text = (text or '').rstrip()
return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
def convert_a(self, el, text):
href = el.get('href')
title = el.get('title')
if text == href and not title:
# Shortcut syntax
return '<%s>' % href
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
return '[%s](%s%s)' % (text or '', href, title_part) if href else text or ''
def convert_b(self, el, text):
return self.convert_strong(el, text)
def convert_blockquote(self, el, text):
return '\n' + line_beginning_re.sub('> ', text) if text else ''
def convert_br(self, el, text):
return ' \n'
def convert_em(self, el, text):
return '*%s*' % text if text else ''
def convert_h1(self, el, text):
return self.underline(text, '=')
def convert_h2(self, el, text):
return self.underline(text, '-')
def convert_hn(self, n, el, text):
return '%s %s\n\n' % ('#' * n, text.rstrip()) if text else ''
def convert_i(self, el, text):
return self.convert_em(el, text)
def convert_li(self, el, text):
parent = el.parent
if parent is not None and parent.name == 'ol':
bullet = '%s.' % (parent.index(el) + 1)
else:
bullet = '*'
return '%s %s\n' % (bullet, text or '')
def convert_p(self, el, text):
return '%s\n\n' % text if text else ''
def convert_strong(self, el, text):
return '**%s**' % text if text else ''
def convert_img(self, el, text):
alt = el.attrs.get('alt', None) or ''
src = el.attrs.get('src', None) or ''
title = el.attrs.get('title', None) or ''
title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
return '![%s](%s%s)' % (alt, src, title_part)
def markdownify(html, strip=None, convert=None):
converter = MarkdownConverter(strip, convert)
return converter.convert(html)