Text node processing

This commit is contained in:
Matthew Tretter
2012-06-29 14:51:28 -04:00
parent b7e528dd8a
commit 6a70f9f61b
2 changed files with 9 additions and 2 deletions

View File

@@ -4,6 +4,7 @@ import re
# Matches converter method names for heading tags (e.g. ``convert_h3``)
# and captures the heading level digits.
convert_heading_re = re.compile(r'convert_h(\d+)')
# Matches any run of whitespace. ``\s`` already includes \r, \n, \t and the
# plain space, so no explicit character class is needed.
whitespace_re = re.compile(r'\s+')
def escape(text):
@@ -26,14 +27,14 @@ class MarkdownConverter(object):
return soup.text
def process_tag(self, node):
text = escape(node.text)
text = self.process_text(node.text)
# Convert the children first
for el in node.findall('*'):
self.process_tag(el)
convert_fn = getattr(self, 'convert_%s' % el.tag, None)
tail = escape(el.tail)
tail = self.process_text(el.tail)
el.tail = ''
if convert_fn:
@@ -48,6 +49,9 @@ class MarkdownConverter(object):
node.text = text
def process_text(self, text):
    """Normalise whitespace in ``text`` and pass the result through ``escape``.

    A falsy ``text`` (such as ``None``) is treated as the empty string.
    Every match of ``whitespace_re`` is replaced by a single space.
    """
    collapsed = whitespace_re.sub(' ', text or '')
    return escape(collapsed)
def __getattr__(self, attr):
# Handle heading levels > 2
m = convert_heading_re.match(attr)

View File

@@ -10,6 +10,9 @@ class BasicTests(unittest.TestCase):
def test_soup(self):
    # Mis-nested closing tags should still yield just the inner text.
    html = '<div><span>Hello</div></span>'
    self.assertEqual(md(html), 'Hello')
def test_whitespace(self):
    # A run of spaces and newlines becomes one space; the single leading
    # and trailing spaces are expected to survive.
    source = ' a b \n\n c '
    expected = ' a b c '
    self.assertEqual(md(source), expected)
class EscapeTests(unittest.TestCase):