From a9c13a56dad161a9e21be36c3cbc62de76148054 Mon Sep 17 00:00:00 2001 From: Matthew Tretter Date: Wed, 31 Jul 2013 18:13:50 -0400 Subject: [PATCH] Identify and single out HTML fragment --- markdownify/__init__.py | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/markdownify/__init__.py b/markdownify/__init__.py index 11c6270..6e842d8 100644 --- a/markdownify/__init__.py +++ b/markdownify/__init__.py @@ -5,6 +5,8 @@ import re convert_heading_re = re.compile(r'convert_h(\d+)') line_beginning_re = re.compile(r'^', re.MULTILINE) whitespace_re = re.compile(r'[\r\n\s\t ]+') +FRAGMENT_ID = '__MARKDOWNIFY_WRAPPER__' +wrapped = '
<div id="%s">%%s</div>
' % FRAGMENT_ID def escape(text): @@ -22,10 +24,14 @@ class MarkdownConverter(object): self.tags_to_convert = tags_to_convert def convert(self, html): + # We want to take advantage of the html5 parsing, but we don't actually + # want a full document. Therefore, we'll mark our fragment with an id, + # create the document, and extract the element with the id. + html = wrapped % html soup = BeautifulSoup(html) - return self.process_tag(soup) + return self.process_tag(soup.find(id=FRAGMENT_ID), children_only=True) - def process_tag(self, node): + def process_tag(self, node, children_only=False): text = '' # Convert the children first @@ -35,9 +41,10 @@ class MarkdownConverter(object): else: text += self.process_tag(el) - convert_fn = getattr(self, 'convert_%s' % node.name, None) - if convert_fn and self.should_convert_tag(node.name): - text = convert_fn(node, text) + if not children_only: + convert_fn = getattr(self, 'convert_%s' % node.name, None) + if convert_fn and self.should_convert_tag(node.name): + text = convert_fn(node, text) return text