Text node processing

2012-06-29 14:51:28 -04:00
parent b7e528dd8a
commit 6a70f9f61b
2 changed files with 9 additions and 2 deletions
--- a/markdownify/__init__.py
+++ b/markdownify/__init__.py
@@ -4,6 +4,7 @@ import re


 convert_heading_re = re.compile(r'convert_h(\d+)')
+whitespace_re = re.compile(r'[\r\n\s\t ]+')


 def escape(text):
@@ -26,14 +27,14 @@ class MarkdownConverter(object):
        return soup.text

    def process_tag(self, node):
-        text = escape(node.text)
+        text = self.process_text(node.text)

        # Convert the children first
        for el in node.findall('*'):
            self.process_tag(el)

            convert_fn = getattr(self, 'convert_%s' % el.tag, None)
-            tail = escape(el.tail)
+            tail = self.process_text(el.tail)
            el.tail = ''

            if convert_fn:
@@ -48,6 +49,9 @@ class MarkdownConverter(object):

        node.text = text

+    def process_text(self, text):
+        return escape(whitespace_re.sub(' ', text or ''))
+
    def __getattr__(self, attr):
        # Handle heading levels > 2
        m = convert_heading_re.match(attr)
--- a/tests.py
+++ b/tests.py
@@ -10,6 +10,9 @@ class BasicTests(unittest.TestCase):
    def test_soup(self):
        self.assertEqual(md('<div><span>Hello</div></span>'), 'Hello')

+    def test_whitespace(self):
+        self.assertEqual(md(' a  b \n\n c '), ' a b c ')
+

 class EscapeTests(unittest.TestCase):