Merge branch 'develop'

bump to version v1.2.2
2025-11-16 20:19:50 +01:00 · 2025-11-16 20:19:33 +01:00 · 2025-11-16 20:15:01 +01:00 · 2025-11-16 20:07:11 +01:00 · 2025-08-09 19:41:10 +02:00 · 2025-08-09 19:40:43 +02:00
19 changed files with 1142 additions and 339 deletions
--- a/.github/workflows/python-app.yml
+++ b/.github/workflows/python-app.yml
@@ -14,6 +14,27 @@ jobs:

    runs-on: ubuntu-latest

+    steps:
+    - uses: actions/checkout@v4
+    - name: Set up Python 3.8
+      uses: actions/setup-python@v2
+      with:
+        python-version: 3.8
+    - name: Install dependencies
+      run: |
+        python -m pip install --upgrade pip
+        pip install --upgrade setuptools setuptools_scm wheel build tox
+    - name: Lint and test
+      run: |
+        tox
+    - name: Build
+      run: |
+        python -m build -nwsx .
+
+  types:
+
+    runs-on: ubuntu-latest
+
    steps:
    - uses: actions/checkout@v2
    - name: Set up Python 3.8
@@ -23,7 +44,8 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install tox
-    - name: Lint and test
+        pip install --upgrade setuptools setuptools_scm wheel build tox mypy types-beautifulsoup4
+    - name: Check types
      run: |
-        tox
+        mypy .
+        mypy --strict tests/types.py
--- a/.github/workflows/python-publish.yml
+++ b/.github/workflows/python-publish.yml
@@ -13,7 +13,7 @@ jobs:
    runs-on: ubuntu-latest

    steps:
-    - uses: actions/checkout@v2
+    - uses: actions/checkout@v4
    - name: Set up Python
      uses: actions/setup-python@v2
      with:
@@ -21,11 +21,11 @@ jobs:
    - name: Install dependencies
      run: |
        python -m pip install --upgrade pip
-        pip install setuptools wheel twine
+        pip install --upgrade setuptools setuptools_scm wheel build twine
    - name: Build and publish
      env:
        TWINE_USERNAME: ${{ secrets.PYPI_USERNAME }}
        TWINE_PASSWORD: ${{ secrets.PYPI_PASSWORD }}
      run: |
-        python setup.py sdist bdist_wheel
+        python -m build -nwsx .
        twine upload dist/*
--- a/README.rst
+++ b/README.rst
@@ -110,7 +110,7 @@ code_language_callback
  When the HTML code contains ``pre`` tags that in some way provide the code
  language, for example as class, this callback can be used to extract the
  language from the tag and prefix it to the converted ``pre`` tag.
-  The callback gets one single argument, an BeautifylSoup object, and returns
+  The callback gets one single argument, a BeautifulSoup object, and returns
  a string containing the code language, or ``None``.
  An example to use the class name as code language could be::

@@ -128,9 +128,9 @@ escape_underscores
  Defaults to ``True``.

 escape_misc
-  If set to ``False``, do not escape miscellaneous punctuation characters
+  If set to ``True``, escape miscellaneous punctuation characters
  that sometimes have Markdown significance in text.
-  Defaults to ``True``.
+  Defaults to ``False``.

 keep_inline_images_in
  Images are converted to their alt-text when the images are located inside
@@ -139,10 +139,40 @@ keep_inline_images_in
  that should be allowed to contain inline images, for example ``['td']``.
  Defaults to an empty list.

+table_infer_header
+  Controls handling of tables with no header row (as indicated by ``<thead>``
+  or ``<th>``). When set to ``True``, the first body row is used as the header row.
+  Defaults to ``False``, which leaves the header row empty.
+
 wrap, wrap_width
  If ``wrap`` is set to ``True``, all text paragraphs are wrapped at
  ``wrap_width`` characters. Defaults to ``False`` and ``80``.
  Use with ``newline_style=BACKSLASH`` to keep line breaks in paragraphs.
+  A ``wrap_width`` value of ``None`` reflows lines to unlimited line length.
+
+strip_document
+  Controls whether leading and/or trailing separation newlines are removed from
+  the final converted document. Supported values are ``LSTRIP`` (leading),
+  ``RSTRIP`` (trailing), ``STRIP`` (both), and ``None`` (neither). Newlines
+  within the document are unaffected.
+  Defaults to ``STRIP``.
+
+strip_pre
+  Controls whether leading/trailing blank lines are removed from ``<pre>``
+  tags. Supported values are ``STRIP`` (all leading/trailing blank lines),
+  ``STRIP_ONE`` (one leading/trailing blank line), and ``None`` (neither).
+  Defaults to ``STRIP``.
+
+bs4_options
+  Specify additional configuration options for the ``BeautifulSoup`` object
+  used to interpret the HTML markup. String and list values (such as ``lxml``
+  or ``html5lib``) are treated as ``features`` arguments to control parser
+  selection. Dictionary values (such as ``{"from_encoding": "iso-8859-8"}``)
+  are treated as full kwargs to be used for the BeautifulSoup constructor,
+  allowing specification of any parameter. For parameter details, see the
+  BeautifulSoup_ documentation.
+
+.. _BeautifulSoup: https://www.crummy.com/software/BeautifulSoup/bs4/doc/

 Options may be specified as kwargs to the ``markdownify`` function, or as a
 nested ``Options`` class in ``MarkdownConverter`` subclasses.
@@ -167,7 +197,7 @@ If you have a special usecase that calls for a special conversion, you can
 always inherit from ``MarkdownConverter`` and override the method you want to
 change.
 The function that handles a HTML tag named ``abc`` is called
-``convert_abc(self, el, text, convert_as_inline)`` and returns a string
+``convert_abc(self, el, text, parent_tags)`` and returns a string
 containing the converted HTML tag.
 The ``MarkdownConverter`` object will handle the conversion based on the
 function names:
@@ -180,8 +210,8 @@ function names:
        """
        Create a custom MarkdownConverter that adds two newlines after an image
        """
-        def convert_img(self, el, text, convert_as_inline):
-            return super().convert_img(el, text, convert_as_inline) + '\n\n'
+        def convert_img(self, el, text, parent_tags):
+            return super().convert_img(el, text, parent_tags) + '\n\n'

    # Create shorthand method for conversion
    def md(html, **options):
@@ -195,7 +225,7 @@ function names:
        """
        Create a custom MarkdownConverter that ignores paragraphs
        """
-        def convert_p(self, el, text, convert_as_inline):
+        def convert_p(self, el, text, parent_tags):
            return ''

    # Create shorthand method for conversion
--- a/markdownify/init.py
+++ b/markdownify/init.py
@@ -1,15 +1,48 @@
-from bs4 import BeautifulSoup, NavigableString, Comment, Doctype
+from bs4 import BeautifulSoup, Comment, Doctype, NavigableString, Tag
 from textwrap import fill
 import re
 import six


-convert_heading_re = re.compile(r'convert_h(\d+)')
-line_beginning_re = re.compile(r'^', re.MULTILINE)
-whitespace_re = re.compile(r'[\t ]+')
-all_whitespace_re = re.compile(r'[\s]+')
-html_heading_re = re.compile(r'h[1-6]')
+# General-purpose regex patterns
+re_convert_heading = re.compile(r'convert_h(\d+)')
+re_line_with_content = re.compile(r'^(.*)', flags=re.MULTILINE)
+re_whitespace = re.compile(r'[\t ]+')
+re_all_whitespace = re.compile(r'[\t \r\n]+')
+re_newline_whitespace = re.compile(r'[\t \r\n]*[\r\n][\t \r\n]*')
+re_html_heading = re.compile(r'h(\d+)')
+re_pre_lstrip1 = re.compile(r'^ *\n')
+re_pre_rstrip1 = re.compile(r'\n *$')
+re_pre_lstrip = re.compile(r'^[ \n]*\n')
+re_pre_rstrip = re.compile(r'[ \n]*$')

+# Pattern for creating convert_<tag> function names from tag names
+re_make_convert_fn_name = re.compile(r'[\[\]:-]')
+
+# Extract (leading_nl, content, trailing_nl) from a string
+# (functionally equivalent to r'^(\n*)(.*?)(\n*)$', but greedy is faster than reluctant here)
+re_extract_newlines = re.compile(r'^(\n*)((?:.*[^\n])?)(\n*)$', flags=re.DOTALL)
+
+# Escape miscellaneous special Markdown characters
+re_escape_misc_chars = re.compile(r'([]\\&<`[>~=+|])')
+
+# Escape sequence of one or more consecutive '-', preceded
+# and followed by whitespace or start/end of fragment, as it
+# might be confused with an underline of a header, or with a
+# list marker
+re_escape_misc_dash_sequences = re.compile(r'(\s|^)(-+(?:\s|$))')
+
+# Escape sequence of up to six consecutive '#', preceded
+# and followed by whitespace or start/end of fragment, as
+# it might be confused with an ATX heading
+re_escape_misc_hashes = re.compile(r'(\s|^)(#{1,6}(?:\s|$))')
+
+# Escape '.' or ')' preceded by up to nine digits, as it might be
+# confused with a list item
+re_escape_misc_list_items = re.compile(r'((?:\s|^)[0-9]{1,9})([.)](?:\s|$))')
+
+# Find consecutive backtick sequences in a string
+re_backtick_runs = re.compile(r'`+')

 # Heading styles
 ATX = 'atx'
@@ -25,6 +58,26 @@ BACKSLASH = 'backslash'
 ASTERISK = '*'
 UNDERSCORE = '_'

+# Document/pre strip styles
+LSTRIP = 'lstrip'
+RSTRIP = 'rstrip'
+STRIP = 'strip'
+STRIP_ONE = 'strip_one'
+
+
+def strip1_pre(text):
+    """Strip one leading and trailing newline from a <pre> string."""
+    text = re_pre_lstrip1.sub('', text)
+    text = re_pre_rstrip1.sub('', text)
+    return text
+
+
+def strip_pre(text):
+    """Strip all leading and trailing newlines from a <pre> string."""
+    text = re_pre_lstrip.sub('', text)
+    text = re_pre_rstrip.sub('', text)
+    return text
+

 def chomp(text):
    """
@@ -47,13 +100,13 @@ def abstract_inline_conversion(markup_fn):
    the text if it looks like an HTML tag. markup_fn is necessary to allow for
    references to self.strong_em_symbol etc.
    """
-    def implementation(self, el, text, convert_as_inline):
+    def implementation(self, el, text, parent_tags):
        markup_prefix = markup_fn(self)
        if markup_prefix.startswith('<') and markup_prefix.endswith('>'):
            markup_suffix = '</' + markup_prefix[1:]
        else:
            markup_suffix = markup_prefix
-        if el.find_parent(['pre', 'code', 'kbd', 'samp']):
+        if '_noformat' in parent_tags:
            return text
        prefix, suffix, text = chomp(text)
        if not text:
@@ -66,9 +119,64 @@ def _todict(obj):
    return dict((k, getattr(obj, k)) for k in dir(obj) if not k.startswith('_'))


+def should_remove_whitespace_inside(el):
+    """Return whether to remove whitespace immediately inside a block-level element."""
+    if not el or not el.name:
+        return False
+    if re_html_heading.match(el.name) is not None:
+        return True
+    return el.name in ('p', 'blockquote',
+                       'article', 'div', 'section',
+                       'ol', 'ul', 'li',
+                       'dl', 'dt', 'dd',
+                       'table', 'thead', 'tbody', 'tfoot',
+                       'tr', 'td', 'th')
+
+
+def should_remove_whitespace_outside(el):
+    """Return whether to remove whitespace immediately outside a block-level element."""
+    return should_remove_whitespace_inside(el) or (el and el.name == 'pre')
+
+
+def _is_block_content_element(el):
+    """
+    In a block context, returns:
+
+    - True for content elements (tags and non-whitespace text)
+    - False for non-content elements (whitespace text, comments, doctypes)
+    """
+    if isinstance(el, Tag):
+        return True
+    elif isinstance(el, (Comment, Doctype)):
+        return False  # (subclasses of NavigableString, must test first)
+    elif isinstance(el, NavigableString):
+        return el.strip() != ''
+    else:
+        return False
+
+
+def _prev_block_content_sibling(el):
+    """Returns the first previous sibling that is a content element, else None."""
+    while el is not None:
+        el = el.previous_sibling
+        if _is_block_content_element(el):
+            return el
+    return None
+
+
+def _next_block_content_sibling(el):
+    """Returns the first next sibling that is a content element, else None."""
+    while el is not None:
+        el = el.next_sibling
+        if _is_block_content_element(el):
+            return el
+    return None
+
+
 class MarkdownConverter(object):
    class DefaultOptions:
        autolinks = True
+        bs4_options = 'html.parser'
        bullets = '*+-'  # An iterable of bullet types.
        code_language = ''
        code_language_callback = None
@@ -76,14 +184,17 @@ class MarkdownConverter(object):
        default_title = False
        escape_asterisks = True
        escape_underscores = True
-        escape_misc = True
+        escape_misc = False
        heading_style = UNDERLINED
        keep_inline_images_in = []
        newline_style = SPACES
        strip = None
+        strip_document = STRIP
+        strip_pre = STRIP
        strong_em_symbol = ASTERISK
        sub_symbol = ''
        sup_symbol = ''
+        table_infer_header = False
        wrap = False
        wrap_width = 80

@@ -100,101 +211,202 @@ class MarkdownConverter(object):
            raise ValueError('You may specify either tags to strip or tags to'
                             ' convert, but not both.')

+        # If a string or list is passed to bs4_options, assume it is a 'features' specification
+        if not isinstance(self.options['bs4_options'], dict):
+            self.options['bs4_options'] = {'features': self.options['bs4_options']}
+
+        # Initialize the conversion function cache
+        self.convert_fn_cache = {}
+
    def convert(self, html):
-        soup = BeautifulSoup(html, 'html.parser')
+        soup = BeautifulSoup(html, **self.options['bs4_options'])
        return self.convert_soup(soup)

    def convert_soup(self, soup):
-        return self.process_tag(soup, convert_as_inline=False, children_only=True)
+        return self.process_tag(soup, parent_tags=set())

-    def process_tag(self, node, convert_as_inline, children_only=False):
-        text = ''
+    def process_element(self, node, parent_tags=None):
+        if isinstance(node, NavigableString):
+            return self.process_text(node, parent_tags=parent_tags)
+        else:
+            return self.process_tag(node, parent_tags=parent_tags)

-        # markdown headings or cells can't include
-        # block elements (elements w/newlines)
-        isHeading = html_heading_re.match(node.name) is not None
-        isCell = node.name in ['td', 'th']
-        convert_children_as_inline = convert_as_inline
+    def process_tag(self, node, parent_tags=None):
+        # For the top-level element, initialize the parent context with an empty set.
+        if parent_tags is None:
+            parent_tags = set()

-        if not children_only and (isHeading or isCell):
-            convert_children_as_inline = True
+        # Collect child elements to process, ignoring whitespace-only text elements
+        # adjacent to the inner/outer boundaries of block elements.
+        should_remove_inside = should_remove_whitespace_inside(node)

-        # Remove whitespace-only textnodes in purely nested nodes
-        def is_nested_node(el):
-            return el and el.name in ['ol', 'ul', 'li',
-                                      'table', 'thead', 'tbody', 'tfoot',
-                                      'tr', 'td', 'th']
-
-        if is_nested_node(node):
-            for el in node.children:
-                # Only extract (remove) whitespace-only text node if any of the
-                # conditions is true:
-                # - el is the first element in its parent
-                # - el is the last element in its parent
-                # - el is adjacent to an nested node
-                can_extract = (not el.previous_sibling
-                               or not el.next_sibling
-                               or is_nested_node(el.previous_sibling)
-                               or is_nested_node(el.next_sibling))
-                if (isinstance(el, NavigableString)
-                        and six.text_type(el).strip() == ''
-                        and can_extract):
-                    el.extract()
-
-        # Convert the children first
-        for el in node.children:
-            if isinstance(el, Comment) or isinstance(el, Doctype):
-                continue
+        def _can_ignore(el):
+            if isinstance(el, Tag):
+                # Tags are always processed.
+                return False
+            elif isinstance(el, (Comment, Doctype)):
+                # Comment and Doctype elements are always ignored.
+                # (subclasses of NavigableString, must test first)
+                return True
            elif isinstance(el, NavigableString):
-                text += self.process_text(el)
+                if six.text_type(el).strip() != '':
+                    # Non-whitespace text nodes are always processed.
+                    return False
+                elif should_remove_inside and (not el.previous_sibling or not el.next_sibling):
+                    # Inside block elements (excluding <pre>), ignore adjacent whitespace elements.
+                    return True
+                elif should_remove_whitespace_outside(el.previous_sibling) or should_remove_whitespace_outside(el.next_sibling):
+                    # Outside block elements (including <pre>), ignore adjacent whitespace elements.
+                    return True
+                else:
+                    return False
+            elif el is None:
+                return True
            else:
-                text += self.process_tag(el, convert_children_as_inline)
+                raise ValueError('Unexpected element type: %s' % type(el))

-        if not children_only:
-            convert_fn = getattr(self, 'convert_%s' % node.name, None)
-            if convert_fn and self.should_convert_tag(node.name):
-                text = convert_fn(node, text, convert_as_inline)
+        children_to_convert = [el for el in node.children if not _can_ignore(el)]
+
+        # Create a copy of this tag's parent context, then update it to include this tag
+        # to propagate down into the children.
+        parent_tags_for_children = set(parent_tags)
+        parent_tags_for_children.add(node.name)
+
+        # if this tag is a heading or table cell, add an '_inline' parent pseudo-tag
+        if (
+            re_html_heading.match(node.name) is not None  # headings
+            or node.name in {'td', 'th'}  # table cells
+        ):
+            parent_tags_for_children.add('_inline')
+
+        # if this tag is a preformatted element, add a '_noformat' parent pseudo-tag
+        if node.name in {'pre', 'code', 'kbd', 'samp'}:
+            parent_tags_for_children.add('_noformat')
+
+        # Convert the children elements into a list of result strings.
+        child_strings = [
+            self.process_element(el, parent_tags=parent_tags_for_children)
+            for el in children_to_convert
+        ]
+
+        # Remove empty string values.
+        child_strings = [s for s in child_strings if s]
+
+        # Collapse newlines at child element boundaries, if needed.
+        if node.name == 'pre' or node.find_parent('pre'):
+            # Inside <pre> blocks, do not collapse newlines.
+            pass
+        else:
+            # Collapse newlines at child element boundaries.
+            updated_child_strings = ['']  # so the first lookback works
+            for child_string in child_strings:
+                # Separate the leading/trailing newlines from the content.
+                leading_nl, content, trailing_nl = re_extract_newlines.match(child_string).groups()
+
+                # If the last child had trailing newlines and this child has leading newlines,
+                # use the larger newline count, limited to 2.
+                if updated_child_strings[-1] and leading_nl:
+                    prev_trailing_nl = updated_child_strings.pop()  # will be replaced by the collapsed value
+                    num_newlines = min(2, max(len(prev_trailing_nl), len(leading_nl)))
+                    leading_nl = '\n' * num_newlines
+
+                # Add the results to the updated child string list.
+                updated_child_strings.extend([leading_nl, content, trailing_nl])
+
+            child_strings = updated_child_strings
+
+        # Join all child text strings into a single string.
+        text = ''.join(child_strings)
+
+        # apply this tag's final conversion function
+        convert_fn = self.get_conv_fn_cached(node.name)
+        if convert_fn is not None:
+            text = convert_fn(node, text, parent_tags=parent_tags)

        return text

-    def process_text(self, el):
+    def convert__document_(self, el, text, parent_tags):
+        """Final document-level formatting for BeautifulSoup object (node.name == "[document]")"""
+        if self.options['strip_document'] == LSTRIP:
+            text = text.lstrip('\n')  # remove leading separation newlines
+        elif self.options['strip_document'] == RSTRIP:
+            text = text.rstrip('\n')  # remove trailing separation newlines
+        elif self.options['strip_document'] == STRIP:
+            text = text.strip('\n')  # remove leading and trailing separation newlines
+        elif self.options['strip_document'] is None:
+            pass  # leave leading and trailing separation newlines as-is
+        else:
+            raise ValueError('Invalid value for strip_document: %s' % self.options['strip_document'])
+
+        return text
+
+    def process_text(self, el, parent_tags=None):
+        # For the top-level element, initialize the parent context with an empty set.
+        if parent_tags is None:
+            parent_tags = set()
+
        text = six.text_type(el) or ''

        # normalize whitespace if we're not inside a preformatted element
-        if not el.find_parent('pre'):
-            text = whitespace_re.sub(' ', text)
+        if 'pre' not in parent_tags:
+            if self.options['wrap']:
+                text = re_all_whitespace.sub(' ', text)
+            else:
+                text = re_newline_whitespace.sub('\n', text)
+                text = re_whitespace.sub(' ', text)

        # escape special characters if we're not inside a preformatted or code element
-        if not el.find_parent(['pre', 'code', 'kbd', 'samp']):
-            text = self.escape(text)
+        if '_noformat' not in parent_tags:
+            text = self.escape(text, parent_tags)

-        # remove trailing whitespaces if any of the following condition is true:
-        # - current text node is the last node in li
-        # - current text node is followed by an embedded list
-        if (el.parent.name == 'li'
-                and (not el.next_sibling
-                     or el.next_sibling.name in ['ul', 'ol'])):
+        # remove leading whitespace at the start or just after a
+        # block-level element; remove trailing whitespace at the end
+        # or just before a block-level element.
+        if (should_remove_whitespace_outside(el.previous_sibling)
+                or (should_remove_whitespace_inside(el.parent)
+                    and not el.previous_sibling)):
+            text = text.lstrip(' \t\r\n')
+        if (should_remove_whitespace_outside(el.next_sibling)
+                or (should_remove_whitespace_inside(el.parent)
+                    and not el.next_sibling)):
            text = text.rstrip()

        return text

-    def __getattr__(self, attr):
-        # Handle headings
-        m = convert_heading_re.match(attr)
-        if m:
-            n = int(m.group(1))
+    def get_conv_fn_cached(self, tag_name):
+        """Given a tag name, return the conversion function using the cache."""
+        # If conversion function is not in cache, add it
+        if tag_name not in self.convert_fn_cache:
+            self.convert_fn_cache[tag_name] = self.get_conv_fn(tag_name)

-            def convert_tag(el, text, convert_as_inline):
-                return self.convert_hn(n, el, text, convert_as_inline)
+        # Return the cached entry
+        return self.convert_fn_cache[tag_name]

-            convert_tag.__name__ = 'convert_h%s' % n
-            setattr(self, convert_tag.__name__, convert_tag)
-            return convert_tag
+    def get_conv_fn(self, tag_name):
+        """Given a tag name, find and return the conversion function."""
+        tag_name = tag_name.lower()

-        raise AttributeError(attr)
+        # Handle strip/convert exclusion options
+        if not self.should_convert_tag(tag_name):
+            return None
+
+        # Look for an explicitly defined conversion function by tag name first
+        convert_fn_name = "convert_%s" % re_make_convert_fn_name.sub("_", tag_name)
+        convert_fn = getattr(self, convert_fn_name, None)
+        if convert_fn:
+            return convert_fn
+
+        # If tag is any heading, handle with convert_hN() function
+        match = re_html_heading.match(tag_name)
+        if match:
+            n = int(match.group(1))  # get value of N from <hN>
+            return lambda el, text, parent_tags: self.convert_hN(n, el, text, parent_tags)
+
+        # No conversion function was found
+        return None

    def should_convert_tag(self, tag):
-        tag = tag.lower()
+        """Given a tag name, return whether to convert based on strip/convert options."""
        strip = self.options['strip']
        convert = self.options['convert']
        if strip is not None:
@@ -204,26 +416,28 @@ class MarkdownConverter(object):
        else:
            return True

-    def escape(self, text):
+    def escape(self, text, parent_tags):
        if not text:
            return ''
        if self.options['escape_misc']:
-            text = re.sub(r'([\\&<`[>~#=+|-])', r'\\\1', text)
-            text = re.sub(r'([0-9])([.)])', r'\1\\\2', text)
+            text = re_escape_misc_chars.sub(r'\\\1', text)
+            text = re_escape_misc_dash_sequences.sub(r'\1\\\2', text)
+            text = re_escape_misc_hashes.sub(r'\1\\\2', text)
+            text = re_escape_misc_list_items.sub(r'\1\\\2', text)
+
        if self.options['escape_asterisks']:
            text = text.replace('*', r'\*')
        if self.options['escape_underscores']:
            text = text.replace('_', r'\_')
        return text

-    def indent(self, text, level):
-        return line_beginning_re.sub('\t' * level, text) if text else ''
-
    def underline(self, text, pad_char):
        text = (text or '').rstrip()
-        return '%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''
+        return '\n\n%s\n%s\n\n' % (text, pad_char * len(text)) if text else ''

-    def convert_a(self, el, text, convert_as_inline):
+    def convert_a(self, el, text, parent_tags):
+        if '_noformat' in parent_tags:
+            return text
        prefix, suffix, text = chomp(text)
        if not text:
            return ''
@@ -243,94 +457,188 @@ class MarkdownConverter(object):

    convert_b = abstract_inline_conversion(lambda self: 2 * self.options['strong_em_symbol'])

-    def convert_blockquote(self, el, text, convert_as_inline):
+    def convert_blockquote(self, el, text, parent_tags):
+        # handle some early-exit scenarios
+        text = (text or '').strip(' \t\r\n')
+        if '_inline' in parent_tags:
+            return ' ' + text + ' '
+        if not text:
+            return "\n"

-        if convert_as_inline:
-            return text
+        # indent lines with blockquote marker
+        def _indent_for_blockquote(match):
+            line_content = match.group(1)
+            return '> ' + line_content if line_content else '>'
+        text = re_line_with_content.sub(_indent_for_blockquote, text)

-        return '\n' + (line_beginning_re.sub('> ', text.strip()) + '\n\n') if text else ''
+        return '\n' + text + '\n\n'

-    def convert_br(self, el, text, convert_as_inline):
-        if convert_as_inline:
-            return ""
+    def convert_br(self, el, text, parent_tags):
+        if '_inline' in parent_tags:
+            return ' '

        if self.options['newline_style'].lower() == BACKSLASH:
            return '\\\n'
        else:
            return '  \n'

-    def convert_code(self, el, text, convert_as_inline):
-        if el.parent.name == 'pre':
+    def convert_code(self, el, text, parent_tags):
+        if '_noformat' in parent_tags:
            return text
-        converter = abstract_inline_conversion(lambda self: '`')
-        return converter(self, el, text, convert_as_inline)
+
+        prefix, suffix, text = chomp(text)
+        if not text:
+            return ''
+
+        # Find the maximum number of consecutive backticks in the text, then
+        # delimit the code span with one more backtick than that
+        max_backticks = max((len(match) for match in re.findall(re_backtick_runs, text)), default=0)
+        markup_delimiter = '`' * (max_backticks + 1)
+
+        # If the maximum number of backticks is greater than zero, add a space
+        # to avoid interpretation of inside backticks as literals
+        if max_backticks > 0:
+            text = " " + text + " "
+
+        return '%s%s%s%s%s' % (prefix, markup_delimiter, text, markup_delimiter, suffix)

    convert_del = abstract_inline_conversion(lambda self: '~~')

+    def convert_div(self, el, text, parent_tags):
+        if '_inline' in parent_tags:
+            return ' ' + text.strip() + ' '
+        text = text.strip()
+        return '\n\n%s\n\n' % text if text else ''
+
+    convert_article = convert_div
+
+    convert_section = convert_div
+
    convert_em = abstract_inline_conversion(lambda self: self.options['strong_em_symbol'])

    convert_kbd = convert_code

-    def convert_hn(self, n, el, text, convert_as_inline):
-        if convert_as_inline:
+    def convert_dd(self, el, text, parent_tags):
+        text = (text or '').strip()
+        if '_inline' in parent_tags:
+            return ' ' + text + ' '
+        if not text:
+            return '\n'
+
+        # indent definition content lines by four spaces
+        def _indent_for_dd(match):
+            line_content = match.group(1)
+            return '    ' + line_content if line_content else ''
+        text = re_line_with_content.sub(_indent_for_dd, text)
+
+        # insert definition marker into first-line indent whitespace
+        text = ':' + text[1:]
+
+        return '%s\n' % text
+
+    # definition lists are formatted as follows:
+    #   https://pandoc.org/MANUAL.html#definition-lists
+    #   https://michelf.ca/projects/php-markdown/extra/#def-list
+    convert_dl = convert_div
+
+    def convert_dt(self, el, text, parent_tags):
+        # remove newlines from term text
+        text = (text or '').strip()
+        text = re_all_whitespace.sub(' ', text)
+        if '_inline' in parent_tags:
+            return ' ' + text + ' '
+        if not text:
+            return '\n'
+
+        # TODO - format consecutive <dt> elements as directly adjacent lines:
+        #   https://michelf.ca/projects/php-markdown/extra/#def-list
+
+        return '\n\n%s\n' % text
+
+    def convert_hN(self, n, el, text, parent_tags):
+        # convert_hN() converts <hN> tags, where N is any integer
+        if '_inline' in parent_tags:
            return text

+        # Markdown does not support heading depths of n > 6
+        n = max(1, min(6, n))
+
        style = self.options['heading_style'].lower()
        text = text.strip()
        if style == UNDERLINED and n <= 2:
            line = '=' if n == 1 else '-'
            return self.underline(text, line)
+        text = re_all_whitespace.sub(' ', text)
        hashes = '#' * n
        if style == ATX_CLOSED:
-            return '%s %s %s\n\n' % (hashes, text, hashes)
-        return '%s %s\n\n' % (hashes, text)
+            return '\n\n%s %s %s\n\n' % (hashes, text, hashes)
+        return '\n\n%s %s\n\n' % (hashes, text)

-    def convert_hr(self, el, text, convert_as_inline):
+    def convert_hr(self, el, text, parent_tags):
        return '\n\n---\n\n'

    convert_i = convert_em

-    def convert_img(self, el, text, convert_as_inline):
+    def convert_img(self, el, text, parent_tags):
        alt = el.attrs.get('alt', None) or ''
        src = el.attrs.get('src', None) or ''
        title = el.attrs.get('title', None) or ''
        title_part = ' "%s"' % title.replace('"', r'\"') if title else ''
-        if (convert_as_inline
+        if ('_inline' in parent_tags
                and el.parent.name not in self.options['keep_inline_images_in']):
            return alt

        return '![%s](%s%s)' % (alt, src, title_part)

-    def convert_list(self, el, text, convert_as_inline):
+    def convert_video(self, el, text, parent_tags):
+        if ('_inline' in parent_tags
+                and el.parent.name not in self.options['keep_inline_images_in']):
+            return text
+        src = el.attrs.get('src', None) or ''
+        if not src:
+            sources = el.find_all('source', attrs={'src': True})
+            if sources:
+                src = sources[0].attrs.get('src', None) or ''
+        poster = el.attrs.get('poster', None) or ''
+        if src and poster:
+            return '[![%s](%s)](%s)' % (text, poster, src)
+        if src:
+            return '[%s](%s)' % (text, src)
+        if poster:
+            return '![%s](%s)' % (text, poster)
+        return text
+
+    def convert_list(self, el, text, parent_tags):

        # Converting a list to inline is undefined.
-        # Ignoring convert_to_inline for list.
+        # Ignoring inline conversion parents for list.

-        nested = False
        before_paragraph = False
-        if el.next_sibling and el.next_sibling.name not in ['ul', 'ol']:
+        next_sibling = _next_block_content_sibling(el)
+        if next_sibling and next_sibling.name not in ['ul', 'ol']:
            before_paragraph = True
-        while el:
-            if el.name == 'li':
-                nested = True
-                break
-            el = el.parent
-        if nested:
-            # remove trailing newline if nested
-            return '\n' + self.indent(text, 1).rstrip()
-        return text + ('\n' if before_paragraph else '')
+        if 'li' in parent_tags:
+            # remove trailing newline if we're in a nested list
+            return '\n' + text.rstrip()
+        return '\n\n' + text + ('\n' if before_paragraph else '')

    convert_ul = convert_list
    convert_ol = convert_list

-    def convert_li(self, el, text, convert_as_inline):
+    def convert_li(self, el, text, parent_tags):
+        # handle some early-exit scenarios
+        text = (text or '').strip()
+        if not text:
+            return "\n"
+
+        # determine list item bullet character to use
        parent = el.parent
        if parent is not None and parent.name == 'ol':
            if parent.get("start") and str(parent.get("start")).isnumeric():
                start = int(parent.get("start"))
            else:
                start = 1
-            bullet = '%s.' % (start + parent.index(el))
+            bullet = '%s.' % (start + len(el.find_previous_siblings('li')))
        else:
            depth = -1
            while el:
@@ -339,19 +647,45 @@ class MarkdownConverter(object):
                el = el.parent
            bullets = self.options['bullets']
            bullet = bullets[depth % len(bullets)]
-        return '%s %s\n' % (bullet, (text or '').strip())
+        bullet = bullet + ' '
+        bullet_width = len(bullet)
+        bullet_indent = ' ' * bullet_width

-    def convert_p(self, el, text, convert_as_inline):
-        if convert_as_inline:
-            return text
+        # indent content lines by bullet width
+        def _indent_for_li(match):
+            line_content = match.group(1)
+            return bullet_indent + line_content if line_content else ''
+        text = re_line_with_content.sub(_indent_for_li, text)
+
+        # insert bullet into first-line indent whitespace
+        text = bullet + text[bullet_width:]
+
+        return '%s\n' % text
+
+    def convert_p(self, el, text, parent_tags):
+        if '_inline' in parent_tags:
+            return ' ' + text.strip(' \t\r\n') + ' '
+        text = text.strip(' \t\r\n')
        if self.options['wrap']:
-            text = fill(text,
-                        width=self.options['wrap_width'],
-                        break_long_words=False,
-                        break_on_hyphens=False)
-        return '%s\n\n' % text if text else ''
+            # Preserve newlines (and preceding whitespace) resulting
+            # from <br> tags.  Newlines in the input have already been
+            # replaced by spaces.
+            if self.options['wrap_width'] is not None:
+                lines = text.split('\n')
+                new_lines = []
+                for line in lines:
+                    line = line.lstrip(' \t\r\n')
+                    line_no_trailing = line.rstrip()
+                    trailing = line[len(line_no_trailing):]
+                    line = fill(line,
+                                width=self.options['wrap_width'],
+                                break_long_words=False,
+                                break_on_hyphens=False)
+                    new_lines.append(line + trailing)
+                text = '\n'.join(new_lines)
+        return '\n\n%s\n\n' % text if text else ''

-    def convert_pre(self, el, text, convert_as_inline):
+    def convert_pre(self, el, text, parent_tags):
        if not text:
            return ''
        code_language = self.options['code_language']
@@ -359,12 +693,24 @@ class MarkdownConverter(object):
        if self.options['code_language_callback']:
            code_language = self.options['code_language_callback'](el) or code_language

-        return '\n```%s\n%s\n```\n' % (code_language, text)
+        if self.options['strip_pre'] == STRIP:
+            text = strip_pre(text)  # remove all leading/trailing newlines
+        elif self.options['strip_pre'] == STRIP_ONE:
+            text = strip1_pre(text)  # remove one leading/trailing newline
+        elif self.options['strip_pre'] is None:
+            pass  # leave leading and trailing newlines as-is
+        else:
+            raise ValueError('Invalid value for strip_pre: %s' % self.options['strip_pre'])

-    def convert_script(self, el, text, convert_as_inline):
+        return '\n\n```%s\n%s\n```\n\n' % (code_language, text)
+
+    def convert_q(self, el, text, parent_tags):
+        return '"' + text + '"'
+
+    def convert_script(self, el, text, parent_tags):
        return ''

-    def convert_style(self, el, text, convert_as_inline):
+    def convert_style(self, el, text, parent_tags):
        return ''

    convert_s = convert_del
@@ -377,55 +723,70 @@ class MarkdownConverter(object):

    convert_sup = abstract_inline_conversion(lambda self: self.options['sup_symbol'])

-    def convert_table(self, el, text, convert_as_inline):
-        return '\n\n' + text + '\n'
+    def convert_table(self, el, text, parent_tags):
+        return '\n\n' + text.strip() + '\n\n'

-    def convert_caption(self, el, text, convert_as_inline):
-        return text + '\n'
+    def convert_caption(self, el, text, parent_tags):
+        return text.strip() + '\n\n'

-    def convert_figcaption(self, el, text, convert_as_inline):
-        return '\n\n' + text + '\n\n'
+    def convert_figcaption(self, el, text, parent_tags):
+        return '\n\n' + text.strip() + '\n\n'

-    def convert_td(self, el, text, convert_as_inline):
+    def convert_td(self, el, text, parent_tags):
        colspan = 1
        if 'colspan' in el.attrs and el['colspan'].isdigit():
-            colspan = int(el['colspan'])
+            colspan = max(1, min(1000, int(el['colspan'])))
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

-    def convert_th(self, el, text, convert_as_inline):
+    def convert_th(self, el, text, parent_tags):
        colspan = 1
        if 'colspan' in el.attrs and el['colspan'].isdigit():
-            colspan = int(el['colspan'])
+            colspan = max(1, min(1000, int(el['colspan'])))
        return ' ' + text.strip().replace("\n", " ") + ' |' * colspan

-    def convert_tr(self, el, text, convert_as_inline):
+    def convert_tr(self, el, text, parent_tags):
        cells = el.find_all(['td', 'th'])
+        is_first_row = el.find_previous_sibling() is None
        is_headrow = (
            all([cell.name == 'th' for cell in cells])
-            or (not el.previous_sibling and not el.parent.name == 'tbody')
-            or (not el.previous_sibling and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
+            or (el.parent.name == 'thead'
+                # avoid multiple tr in thead
+                and len(el.parent.find_all('tr')) == 1)
+        )
+        is_head_row_missing = (
+            (is_first_row and not el.parent.name == 'tbody')
+            or (is_first_row and el.parent.name == 'tbody' and len(el.parent.parent.find_all(['thead'])) < 1)
        )
        overline = ''
        underline = ''
-        if is_headrow and not el.previous_sibling:
-            # first row and is headline: print headline underline
-            full_colspan = 0
-            for cell in cells:
-                if 'colspan' in cell.attrs and cell['colspan'].isdigit():
-                    full_colspan += int(cell["colspan"])
-                else:
-                    full_colspan += 1
+        full_colspan = 0
+        for cell in cells:
+            if 'colspan' in cell.attrs and cell['colspan'].isdigit():
+                full_colspan += max(1, min(1000, int(cell['colspan'])))
+            else:
+                full_colspan += 1
+        if ((is_headrow
+             or (is_head_row_missing
+                 and self.options['table_infer_header']))
+                and is_first_row):
+            # first row and:
+            # - is headline or
+            # - headline is missing and header inference is enabled
+            # print headline underline
            underline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
-        elif (not el.previous_sibling
-              and (el.parent.name == 'table'
-                   or (el.parent.name == 'tbody'
-                       and not el.parent.previous_sibling))):
+        elif ((is_head_row_missing
+               and not self.options['table_infer_header'])
+              or (is_first_row
+                  and (el.parent.name == 'table'
+                       or (el.parent.name == 'tbody'
+                           and not el.parent.find_previous_sibling())))):
+            # headline is missing and header inference is disabled or:
            # first row, not headline, and:
-            # - the parent is table or
-            # - the parent is tbody at the beginning of a table.
+            #  - the parent is table or
+            #  - the parent is tbody at the beginning of a table.
            # print empty headline above this row
-            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
-            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
+            overline += '| ' + ' | '.join([''] * full_colspan) + ' |' + '\n'
+            overline += '| ' + ' | '.join(['---'] * full_colspan) + ' |' + '\n'
        return overline + '|' + text + '\n' + underline


--- a/markdownify/init.pyi
+++ b/markdownify/init.pyi
@@ -0,0 +1,77 @@
+from _typeshed import Incomplete
+from typing import Callable, Union
+
+ATX: str
+ATX_CLOSED: str
+UNDERLINED: str
+SETEXT = UNDERLINED
+SPACES: str
+BACKSLASH: str
+ASTERISK: str
+UNDERSCORE: str
+LSTRIP: str
+RSTRIP: str
+STRIP: str
+STRIP_ONE: str
+
+
+def markdownify(
+    html: str,
+    autolinks: bool = ...,
+    bs4_options: Union[str, list[str], dict[str, Incomplete]] = ...,  # parser name(s) or BeautifulSoup kwargs
+    bullets: str = ...,
+    code_language: str = ...,
+    code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
+    convert: Union[list[str], None] = ...,
+    default_title: bool = ...,
+    escape_asterisks: bool = ...,
+    escape_underscores: bool = ...,
+    escape_misc: bool = ...,
+    heading_style: str = ...,
+    keep_inline_images_in: list[str] = ...,
+    newline_style: str = ...,
+    strip: Union[list[str], None] = ...,
+    strip_document: Union[str, None] = ...,
+    strip_pre: Union[str, None] = ...,  # None leaves leading/trailing <pre> newlines as-is
+    strong_em_symbol: str = ...,
+    sub_symbol: str = ...,
+    sup_symbol: str = ...,
+    table_infer_header: bool = ...,
+    wrap: bool = ...,
+    wrap_width: Union[int, None] = ...,  # None skips re-filling of paragraph lines
+) -> str: ...
+
+
+class MarkdownConverter:
+    def __init__(
+        self,
+        autolinks: bool = ...,
+        bs4_options: Union[str, list[str], dict[str, Incomplete]] = ...,  # parser name(s) or BeautifulSoup kwargs
+        bullets: str = ...,
+        code_language: str = ...,
+        code_language_callback: Union[Callable[[Incomplete], Union[str, None]], None] = ...,
+        convert: Union[list[str], None] = ...,
+        default_title: bool = ...,
+        escape_asterisks: bool = ...,
+        escape_underscores: bool = ...,
+        escape_misc: bool = ...,
+        heading_style: str = ...,
+        keep_inline_images_in: list[str] = ...,
+        newline_style: str = ...,
+        strip: Union[list[str], None] = ...,
+        strip_document: Union[str, None] = ...,
+        strip_pre: Union[str, None] = ...,  # None leaves leading/trailing <pre> newlines as-is
+        strong_em_symbol: str = ...,
+        sub_symbol: str = ...,
+        sup_symbol: str = ...,
+        table_infer_header: bool = ...,
+        wrap: bool = ...,
+        wrap_width: Union[int, None] = ...,  # None skips re-filling of paragraph lines
+    ) -> None:
+        ...
+
+    def convert(self, html: str) -> str:
+        ...
+
+    def convert_soup(self, soup: Incomplete) -> str:
+        ...
--- a/markdownify/main.py
+++ b/markdownify/main.py
@@ -55,15 +55,26 @@ def main(argv=sys.argv[1:]):
    parser.add_argument('--no-escape-underscores', dest='escape_underscores',
                        action='store_false',
                        help="Do not escape '_' to '\\_' in text.")
-    parser.add_argument('-i', '--keep-inline-images-in', nargs='*',
+    parser.add_argument('-i', '--keep-inline-images-in',
+                        default=[],
+                        nargs='*',
                        help="Images are converted to their alt-text when the images are "
                        "located inside headlines or table cells. If some inline images "
                        "should be converted to markdown images instead, this option can "
                        "be set to a list of parent tags that should be allowed to "
                        "contain inline images.")
+    parser.add_argument('--table-infer-header', dest='table_infer_header',
+                        action='store_true',
+                        help="When a table has no header row (as indicated by '<thead>' "
+                        "or '<th>'), use the first body row as the header row.")
    parser.add_argument('-w', '--wrap', action='store_true',
                        help="Wrap all text paragraphs at --wrap-width characters.")
    parser.add_argument('--wrap-width', type=int, default=80)
+    parser.add_argument('--bs4-options',
+                        default='html.parser',
+                        help="Specifies the parser that BeautifulSoup should use to parse "
+                             "the HTML markup. Examples include 'html5.parser', 'lxml', and "
+                             "'html5lib'.")

    args = parser.parse_args(argv)
    print(markdownify(**vars(args)))
--- a/markdownify/py.typed
+++ b/markdownify/py.typed
@@ -0,0 +1 @@
+
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -0,0 +1,45 @@
+[build-system]
+requires = ["setuptools>=61.2", "setuptools_scm[toml]>=3.4.3"]
+build-backend = "setuptools.build_meta"
+
+[project]
+name = "markdownify"
+version = "1.2.2"
+authors = [{name = "Matthew Tretter", email = "m@tthewwithanm.com"}]
+description = "Convert HTML to markdown."
+readme = "README.rst"
+classifiers = [
+    "Environment :: Web Environment",
+    "Framework :: Django",
+    "Intended Audience :: Developers",
+    "License :: OSI Approved :: MIT License",
+    "Operating System :: OS Independent",
+    "Programming Language :: Python :: 2.5",
+    "Programming Language :: Python :: 2.6",
+    "Programming Language :: Python :: 2.7",
+    "Programming Language :: Python :: 3.6",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
+    "Topic :: Utilities",
+]
+dependencies = [
+    "beautifulsoup4>=4.9,<5",
+    "six>=1.15,<2"
+]
+
+[project.urls]
+Homepage = "https://github.com/matthewwithanm/python-markdownify"
+Download = "https://github.com/matthewwithanm/python-markdownify/tarball/master"
+
+[project.scripts]
+markdownify = "markdownify.main:main"
+
+[tool.setuptools]
+zip-safe = false
+include-package-data = true
+
+[tool.setuptools.packages.find]
+include = ["markdownify", "markdownify.*"]
+namespaces = false
+
+[tool.setuptools_scm]
--- a/setup.py
+++ b/setup.py
@@ -1,52 +0,0 @@
-#/usr/bin/env python
-import codecs
-import os
-from setuptools import setup, find_packages
-
-
-read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
-
-pkgmeta = {
-    '__title__': 'markdownify',
-    '__author__': 'Matthew Tretter',
-    '__version__': '0.13.0',
-}
-
-read = lambda filepath: codecs.open(filepath, 'r', 'utf-8').read()
-
-setup(
-    name='markdownify',
-    description='Convert HTML to markdown.',
-    long_description=read(os.path.join(os.path.dirname(__file__), 'README.rst')),
-    version=pkgmeta['__version__'],
-    author=pkgmeta['__author__'],
-    author_email='m@tthewwithanm.com',
-    url='http://github.com/matthewwithanm/python-markdownify',
-    download_url='http://github.com/matthewwithanm/python-markdownify/tarball/master',
-    packages=find_packages(),
-    zip_safe=False,
-    include_package_data=True,
-    install_requires=[
-        'beautifulsoup4>=4.9,<5',
-        'six>=1.15,<2',
-    ],
-    classifiers=[
-        'Environment :: Web Environment',
-        'Framework :: Django',
-        'Intended Audience :: Developers',
-        'License :: OSI Approved :: MIT License',
-        'Operating System :: OS Independent',
-        'Programming Language :: Python :: 2.5',
-        'Programming Language :: Python :: 2.6',
-        'Programming Language :: Python :: 2.7',
-        'Programming Language :: Python :: 3.6',
-        'Programming Language :: Python :: 3.7',
-        'Programming Language :: Python :: 3.8',
-        'Topic :: Utilities'
-    ],
-    entry_points={
-        'console_scripts': [
-            'markdownify = markdownify.main:main'
-        ]
-    }
-)
--- a/tests/test_advanced.py
+++ b/tests/test_advanced.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md


 def test_chomp():
@@ -14,7 +14,7 @@ def test_chomp():

 def test_nested():
    text = md('<p>This is an <a href="http://example.com/">example link</a>.</p>')
-    assert text == 'This is an [example link](http://example.com/).\n\n'
+    assert text == '\n\nThis is an [example link](http://example.com/).\n\n'


 def test_ignore_comments():
--- a/tests/test_args.py
+++ b/tests/test_args.py
@@ -2,7 +2,8 @@
 Test whitelisting/blacklisting of specific tags.

 """
-from markdownify import markdownify as md
+from markdownify import markdownify, LSTRIP, RSTRIP, STRIP, STRIP_ONE
+from .utils import md


 def test_strip():
@@ -23,3 +24,24 @@ def test_convert():
 def test_do_not_convert():
    text = md('<a href="https://github.com/matthewwithanm">Some Text</a>', convert=[])
    assert text == 'Some Text'
+
+
+def test_strip_document():
+    assert markdownify("<p>Hello</p>") == "Hello"  # test default of STRIP
+    assert markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
+    assert markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
+    assert markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
+    assert markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
+
+
+def test_strip_pre():
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>") == "```\n  Hello\n```"
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=STRIP) == "```\n  Hello\n```"
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=STRIP_ONE) == "```\n  \n  Hello  \n  \n```"
+    assert markdownify("<pre>  \n  \n  Hello  \n  \n  </pre>", strip_pre=None) == "```\n  \n  \n  Hello  \n  \n  \n```"
+
+
+def test_bs4_options():  # needs the test_ prefix to be collected by pytest; covers str, list and dict forms
+    assert markdownify("<p>Hello</p>", bs4_options="html.parser") == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options=["html.parser"]) == "Hello"
+    assert markdownify("<p>Hello</p>", bs4_options={"features": "html.parser"}) == "Hello"
--- a/tests/test_basic.py
+++ b/tests/test_basic.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md


 def test_single_tag():
@@ -6,8 +6,9 @@ def test_single_tag():


 def test_soup():
-    assert md('<div><span>Hello</div></span>') == 'Hello'
+    assert md('<div><span>Hello</div></span>') == '\n\nHello\n\n'


 def test_whitespace():
    assert md(' a  b \t\t c ') == ' a b c '
+    assert md(' a  b \n\n c ') == ' a b\nc '
--- a/tests/test_conversions.py
+++ b/tests/test_conversions.py
@@ -1,4 +1,5 @@
-from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
+from markdownify import ATX, ATX_CLOSED, BACKSLASH, SPACES, UNDERSCORE
+from .utils import md


 def inline_tests(tag, markup):
@@ -39,6 +40,11 @@ def test_a_no_autolinks():
    assert md('<a href="https://google.com">https://google.com</a>', autolinks=False) == '[https://google.com](https://google.com)'


+def test_a_in_code():
+    assert md('<code><a href="https://google.com">Google</a></code>') == '`Google`'
+    assert md('<pre><a href="https://google.com">Google</a></pre>') == '\n\n```\nGoogle\n```\n\n'
+
+
 def test_b():
    assert md('<b>Hello</b>') == '**Hello**'

@@ -53,11 +59,12 @@ def test_b_spaces():
 def test_blockquote():
    assert md('<blockquote>Hello</blockquote>') == '\n> Hello\n\n'
    assert md('<blockquote>\nHello\n</blockquote>') == '\n> Hello\n\n'
+    assert md('<blockquote>&nbsp;Hello</blockquote>') == '\n> \u00a0Hello\n\n'


 def test_blockquote_with_nested_paragraph():
    assert md('<blockquote><p>Hello</p></blockquote>') == '\n> Hello\n\n'
-    assert md('<blockquote><p>Hello</p><p>Hello again</p></blockquote>') == '\n> Hello\n> \n> Hello again\n\n'
+    assert md('<blockquote><p>Hello</p><p>Hello again</p></blockquote>') == '\n> Hello\n>\n> Hello again\n\n'


 def test_blockquote_with_paragraph():
@@ -66,17 +73,14 @@ def test_blockquote_with_paragraph():

 def test_blockquote_nested():
    text = md('<blockquote>And she was like <blockquote>Hello</blockquote></blockquote>')
-    assert text == '\n> And she was like \n> > Hello\n\n'
+    assert text == '\n> And she was like\n> > Hello\n\n'


 def test_br():
    assert md('a<br />b<br />c') == 'a  \nb  \nc'
    assert md('a<br />b<br />c', newline_style=BACKSLASH) == 'a\\\nb\\\nc'
-
-
-def test_caption():
-    assert md('TEXT<figure><figcaption>Caption</figcaption><span>SPAN</span></figure>') == 'TEXT\n\nCaption\n\nSPAN'
-    assert md('<figure><span>SPAN</span><figcaption>Caption</figcaption></figure>TEXT') == 'SPAN\n\nCaption\n\nTEXT'
+    assert md('<h1>foo<br />bar</h1>', heading_style=ATX) == '\n\n# foo bar\n\n'
+    assert md('<td>foo<br />bar</td>', heading_style=ATX) == ' foo bar |'


 def test_code():
@@ -97,51 +101,86 @@ def test_code():
    assert md('<code>foo<s> bar </s>baz</code>') == '`foo bar baz`'
    assert md('<code>foo<sup>bar</sup>baz</code>', sup_symbol='^') == '`foobarbaz`'
    assert md('<code>foo<sub>bar</sub>baz</code>', sub_symbol='^') == '`foobarbaz`'
+    assert md('foo<code>`bar`</code>baz') == 'foo`` `bar` ``baz'
+    assert md('foo<code>``bar``</code>baz') == 'foo``` ``bar`` ```baz'
+    assert md('foo<code> `bar` </code>baz') == 'foo `` `bar` `` baz'
+
+
+def test_dl():
+    assert md('<dl><dt>term</dt><dd>definition</dd></dl>') == '\n\nterm\n:   definition\n\n'
+    assert md('<dl><dt><p>te</p><p>rm</p></dt><dd>definition</dd></dl>') == '\n\nte rm\n:   definition\n\n'
+    assert md('<dl><dt>term</dt><dd><p>definition-p1</p><p>definition-p2</p></dd></dl>') == '\n\nterm\n:   definition-p1\n\n    definition-p2\n\n'
+    assert md('<dl><dt>term</dt><dd><p>definition 1</p></dd><dd><p>definition 2</p></dd></dl>') == '\n\nterm\n:   definition 1\n:   definition 2\n\n'
+    assert md('<dl><dt>term 1</dt><dd>definition 1</dd><dt>term 2</dt><dd>definition 2</dd></dl>') == '\n\nterm 1\n:   definition 1\n\nterm 2\n:   definition 2\n\n'
+    assert md('<dl><dt>term</dt><dd><blockquote><p>line 1</p><p>line 2</p></blockquote></dd></dl>') == '\n\nterm\n:   > line 1\n    >\n    > line 2\n\n'
+    assert md('<dl><dt>term</dt><dd><ol><li><p>1</p><ul><li>2a</li><li>2b</li></ul></li><li><p>3</p></li></ol></dd></dl>') == '\n\nterm\n:   1. 1\n\n       * 2a\n       * 2b\n    2. 3\n\n'


 def test_del():
    inline_tests('del', '~~')


-def test_div():
-    assert md('Hello</div> World') == 'Hello World'
+def test_div_section_article():
+    for tag in ['div', 'section', 'article']:  # the same expectations are asserted for each tag
+        assert md(f'<{tag}>456</{tag}>') == '\n\n456\n\n'
+        assert md(f'123<{tag}>456</{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}>\n 456 \n</{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}><p>456</p></{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}>\n<p>456</p>\n</{tag}>789') == '123\n\n456\n\n789'
+        assert md(f'123<{tag}><pre>4 5 6</pre></{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
+        assert md(f'123<{tag}>\n<pre>4 5 6</pre>\n</{tag}>789') == '123\n\n```\n4 5 6\n```\n\n789'
+        assert md(f'123<{tag}>4\n5\n6</{tag}>789') == '123\n\n4\n5\n6\n\n789'
+        assert md(f'123<{tag}>\n4\n5\n6\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
+        assert md(f'123<{tag}>\n<p>\n4\n5\n6\n</p>\n</{tag}>789') == '123\n\n4\n5\n6\n\n789'
+        assert md(f'<{tag}><h1>title</h1>body</{tag}>', heading_style=ATX) == '\n\n# title\n\nbody\n\n'


 def test_em():
    inline_tests('em', '*')


+def test_figcaption():
+    assert (md("TEXT<figure><figcaption>\nCaption\n</figcaption><span>SPAN</span></figure>") == "TEXT\n\nCaption\n\nSPAN")
+    assert (md("<figure><span>SPAN</span><figcaption>\nCaption\n</figcaption></figure>TEXT") == "SPAN\n\nCaption\n\nTEXT")
+
+
 def test_header_with_space():
-    assert md('<h3>\n\nHello</h3>') == '### Hello\n\n'
-    assert md('<h4>\n\nHello</h4>') == '#### Hello\n\n'
-    assert md('<h5>\n\nHello</h5>') == '##### Hello\n\n'
-    assert md('<h5>\n\nHello\n\n</h5>') == '##### Hello\n\n'
-    assert md('<h5>\n\nHello   \n\n</h5>') == '##### Hello\n\n'
+    assert md('<h3>\n\nHello</h3>') == '\n\n### Hello\n\n'
+    assert md('<h3>Hello\n\n\nWorld</h3>') == '\n\n### Hello World\n\n'
+    assert md('<h4>\n\nHello</h4>') == '\n\n#### Hello\n\n'
+    assert md('<h5>\n\nHello</h5>') == '\n\n##### Hello\n\n'
+    assert md('<h5>\n\nHello\n\n</h5>') == '\n\n##### Hello\n\n'
+    assert md('<h5>\n\nHello   \n\n</h5>') == '\n\n##### Hello\n\n'


 def test_h1():
-    assert md('<h1>Hello</h1>') == 'Hello\n=====\n\n'
+    assert md('<h1>Hello</h1>') == '\n\nHello\n=====\n\n'


 def test_h2():
-    assert md('<h2>Hello</h2>') == 'Hello\n-----\n\n'
+    assert md('<h2>Hello</h2>') == '\n\nHello\n-----\n\n'


 def test_hn():
-    assert md('<h3>Hello</h3>') == '### Hello\n\n'
-    assert md('<h4>Hello</h4>') == '#### Hello\n\n'
-    assert md('<h5>Hello</h5>') == '##### Hello\n\n'
-    assert md('<h6>Hello</h6>') == '###### Hello\n\n'
+    assert md('<h3>Hello</h3>') == '\n\n### Hello\n\n'
+    assert md('<h4>Hello</h4>') == '\n\n#### Hello\n\n'
+    assert md('<h5>Hello</h5>') == '\n\n##### Hello\n\n'
+    assert md('<h6>Hello</h6>') == '\n\n###### Hello\n\n'
+    assert md('<h10>Hello</h10>') == md('<h6>Hello</h6>')
+    assert md('<h0>Hello</h0>') == md('<h1>Hello</h1>')
+    assert md('<hx>Hello</hx>') == md('Hello')


 def test_hn_chained():
-    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '# First\n\n\n## Second\n\n\n### Third\n\n'
-    assert md('X<h1>First</h1>', heading_style=ATX) == 'X# First\n\n'
+    assert md('<h1>First</h1>\n<h2>Second</h2>\n<h3>Third</h3>', heading_style=ATX) == '\n\n# First\n\n## Second\n\n### Third\n\n'
+    assert md('X<h1>First</h1>', heading_style=ATX) == 'X\n\n# First\n\n'
+    assert md('X<h1>First</h1>', heading_style=ATX_CLOSED) == 'X\n\n# First #\n\n'
+    assert md('X<h1>First</h1>') == 'X\n\nFirst\n=====\n\n'


 def test_hn_nested_tag_heading_style():
-    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '# A P C #\n\n'
-    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '# A P C\n\n'
+    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX_CLOSED) == '\n\n# A P C #\n\n'
+    assert md('<h1>A <p>P</p> C </h1>', heading_style=ATX) == '\n\n# A P C\n\n'


 def test_hn_nested_simple_tag():
@@ -157,12 +196,12 @@ def test_hn_nested_simple_tag():
    ]

    for tag, markdown in tag_to_markdown:
-        assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '### A ' + markdown + ' B\n\n'
+        assert md('<h3>A <' + tag + '>' + tag + '</' + tag + '> B</h3>') == '\n\n### A ' + markdown + ' B\n\n'

-    assert md('<h3>A <br>B</h3>', heading_style=ATX) == '### A B\n\n'
+    assert md('<h3>A <br>B</h3>', heading_style=ATX) == '\n\n### A B\n\n'

    # Nested lists not supported
-    # assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '### A li1 li2 B\n\n'
+    # assert md('<h3>A <ul><li>li1</i><li>l2</li></ul></h3>', heading_style=ATX) == '\n### A li1 li2 B\n\n'


 def test_hn_nested_img():
@@ -172,18 +211,23 @@ def test_hn_nested_img():
        ("alt='Alt Text' title='Optional title'", "Alt Text", " \"Optional title\""),
    ]
    for image_attributes, markdown, title in image_attributes_to_markdown:
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '### A ' + markdown + ' B\n\n'
-        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>') == '\n\n### A' + (' ' + markdown + ' ' if markdown else ' ') + 'B\n\n'
+        assert md('<h3>A <img src="/path/to/img.jpg" ' + image_attributes + '/> B</h3>', keep_inline_images_in=['h3']) == '\n\n### A ![' + markdown + '](/path/to/img.jpg' + title + ') B\n\n'


 def test_hn_atx_headings():
-    assert md('<h1>Hello</h1>', heading_style=ATX) == '# Hello\n\n'
-    assert md('<h2>Hello</h2>', heading_style=ATX) == '## Hello\n\n'
+    assert md('<h1>Hello</h1>', heading_style=ATX) == '\n\n# Hello\n\n'
+    assert md('<h2>Hello</h2>', heading_style=ATX) == '\n\n## Hello\n\n'


 def test_hn_atx_closed_headings():
-    assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '# Hello #\n\n'
-    assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '## Hello ##\n\n'
+    assert md('<h1>Hello</h1>', heading_style=ATX_CLOSED) == '\n\n# Hello #\n\n'
+    assert md('<h2>Hello</h2>', heading_style=ATX_CLOSED) == '\n\n## Hello ##\n\n'
+
+
+def test_hn_newlines():
+    assert md("<h1>H1-1</h1>TEXT<h2>H2-2</h2>TEXT<h1>H1-2</h1>TEXT", heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT'
+    assert md('<h1>H1-1</h1>\n<p>TEXT</p>\n<h2>H2-2</h2>\n<p>TEXT</p>\n<h1>H1-2</h1>\n<p>TEXT</p>', heading_style=ATX) == '\n\n# H1-1\n\nTEXT\n\n## H2-2\n\nTEXT\n\n# H1-2\n\nTEXT\n\n'


 def test_head():
@@ -193,7 +237,7 @@ def test_head():
 def test_hr():
    assert md('Hello<hr>World') == 'Hello\n\n---\n\nWorld'
    assert md('Hello<hr />World') == 'Hello\n\n---\n\nWorld'
-    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == 'Hello\n\n\n\n\n---\n\n\nWorld\n\n'
+    assert md('<p>Hello</p>\n<hr>\n<p>World</p>') == '\n\nHello\n\n---\n\nWorld\n\n'


 def test_i():
@@ -205,37 +249,68 @@ def test_img():
    assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)'


+def test_video():
+    assert md('<video src="/path/to/video.mp4" poster="/path/to/img.jpg">text</video>') == '[![text](/path/to/img.jpg)](/path/to/video.mp4)'
+    assert md('<video src="/path/to/video.mp4">text</video>') == '[text](/path/to/video.mp4)'
+    assert md('<video><source src="/path/to/video.mp4"/>text</video>') == '[text](/path/to/video.mp4)'
+    assert md('<video poster="/path/to/img.jpg">text</video>') == '![text](/path/to/img.jpg)'
+    assert md('<video>text</video>') == 'text'
+
+
 def test_kbd():
    inline_tests('kbd', '`')


 def test_p():
-    assert md('<p>hello</p>') == 'hello\n\n'
-    assert md('<p>123456789 123456789</p>') == '123456789 123456789\n\n'
-    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '123456789\n123456789\n\n'
-    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '[Some long\nlink](https://example.com)\n\n'
-    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345\\\n67890\n\n'
-    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '12345678901\\\n12345\n\n'
+    assert md('<p>hello</p>') == '\n\nhello\n\n'
+    assert md("<p><p>hello</p></p>") == "\n\nhello\n\n"
+    assert md('<p>123456789 123456789</p>') == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>') == '\n\n123456789\n123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=80) == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789\n\n\n123456789</p>', wrap=True, wrap_width=None) == '\n\n123456789 123456789\n\n'
+    assert md('<p>123456789 123456789</p>', wrap=True, wrap_width=10) == '\n\n123456789\n123456789\n\n'
+    assert md('<p><a href="https://example.com">Some long link</a></p>', wrap=True, wrap_width=10) == '\n\n[Some long\nlink](https://example.com)\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345\\\n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345  \n67890\n\n'
+    assert md('<p>12345<br />67890</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345  \n67890\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=BACKSLASH) == '\n\n12345678901\\\n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n12345678901  \n12345\n\n'
+    assert md('<p>12345678901<br />12345</p>', wrap=True, wrap_width=50, newline_style=SPACES) == '\n\n12345678901  \n12345\n\n'
+    assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=BACKSLASH) == '\n\n1234 5678\n9012\\\n67890\n\n'
+    assert md('<p>1234 5678 9012<br />67890</p>', wrap=True, wrap_width=10, newline_style=SPACES) == '\n\n1234 5678\n9012  \n67890\n\n'
+    assert md('First<p>Second</p><p>Third</p>Fourth') == 'First\n\nSecond\n\nThird\n\nFourth'
+    assert md('<p>&nbsp;x y</p>', wrap=True, wrap_width=80) == '\n\n\u00a0x y\n\n'


 def test_pre():
-    assert md('<pre>test\n    foo\nbar</pre>') == '\n```\ntest\n    foo\nbar\n```\n'
-    assert md('<pre><code>test\n    foo\nbar</code></pre>') == '\n```\ntest\n    foo\nbar\n```\n'
-    assert md('<pre>*this_should_not_escape*</pre>') == '\n```\n*this_should_not_escape*\n```\n'
-    assert md('<pre><span>*this_should_not_escape*</span></pre>') == '\n```\n*this_should_not_escape*\n```\n'
-    assert md('<pre>\t\tthis  should\t\tnot  normalize</pre>') == '\n```\n\t\tthis  should\t\tnot  normalize\n```\n'
-    assert md('<pre><span>\t\tthis  should\t\tnot  normalize</span></pre>') == '\n```\n\t\tthis  should\t\tnot  normalize\n```\n'
-    assert md('<pre>foo<b>\nbar\n</b>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<i>\nbar\n</i>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo\n<i>bar</i>\nbaz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<i>\n</i>baz</pre>') == '\n```\nfoo\nbaz\n```\n'
-    assert md('<pre>foo<del>\nbar\n</del>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<em>\nbar\n</em>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<code>\nbar\n</code>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<strong>\nbar\n</strong>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<s>\nbar\n</s>baz</pre>') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<sup>\nbar\n</sup>baz</pre>', sup_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
-    assert md('<pre>foo<sub>\nbar\n</sub>baz</pre>', sub_symbol='^') == '\n```\nfoo\nbar\nbaz\n```\n'
+    assert md('<pre>test\n    foo\nbar</pre>') == '\n\n```\ntest\n    foo\nbar\n```\n\n'
+    assert md('<pre><code>test\n    foo\nbar</code></pre>') == '\n\n```\ntest\n    foo\nbar\n```\n\n'
+    assert md('<pre>*this_should_not_escape*</pre>') == '\n\n```\n*this_should_not_escape*\n```\n\n'
+    assert md('<pre><span>*this_should_not_escape*</span></pre>') == '\n\n```\n*this_should_not_escape*\n```\n\n'
+    assert md('<pre>\t\tthis  should\t\tnot  normalize</pre>') == '\n\n```\n\t\tthis  should\t\tnot  normalize\n```\n\n'
+    assert md('<pre><span>\t\tthis  should\t\tnot  normalize</span></pre>') == '\n\n```\n\t\tthis  should\t\tnot  normalize\n```\n\n'
+    assert md('<pre>foo<b>\nbar\n</b>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<i>\nbar\n</i>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo\n<i>bar</i>\nbaz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<i>\n</i>baz</pre>') == '\n\n```\nfoo\nbaz\n```\n\n'
+    assert md('<pre>foo<del>\nbar\n</del>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<em>\nbar\n</em>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<code>\nbar\n</code>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<strong>\nbar\n</strong>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<s>\nbar\n</s>baz</pre>') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<sup>\nbar\n</sup>baz</pre>', sup_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<sub>\nbar\n</sub>baz</pre>', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+    assert md('<pre>foo<sub>\nbar\n</sub>baz</pre>', sub_symbol='^') == '\n\n```\nfoo\nbar\nbaz\n```\n\n'
+
+    assert md('foo<pre>bar</pre>baz', sub_symbol='^') == 'foo\n\n```\nbar\n```\n\nbaz'
+    assert md("<p>foo</p>\n<pre>bar</pre>\n</p>baz</p>", sub_symbol="^") == "\n\nfoo\n\n```\nbar\n```\n\nbaz"
+
+
+def test_q():
+    assert md('foo <q>quote</q> bar') == 'foo "quote" bar'
+    assert md('foo <q cite="https://example.com">quote</q> bar') == 'foo "quote" bar'


 def test_script():
@@ -278,14 +353,24 @@ def test_sup():


 def test_lang():
-    assert md('<pre>test\n    foo\nbar</pre>', code_language='python') == '\n```python\ntest\n    foo\nbar\n```\n'
-    assert md('<pre><code>test\n    foo\nbar</code></pre>', code_language='javascript') == '\n```javascript\ntest\n    foo\nbar\n```\n'
+    assert md('<pre>test\n    foo\nbar</pre>', code_language='python') == '\n\n```python\ntest\n    foo\nbar\n```\n\n'
+    assert md('<pre><code>test\n    foo\nbar</code></pre>', code_language='javascript') == '\n\n```javascript\ntest\n    foo\nbar\n```\n\n'


 def test_lang_callback():
    def callback(el):
        return el['class'][0] if el.has_attr('class') else None

-    assert md('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback) == '\n```python\ntest\n    foo\nbar\n```\n'
-    assert md('<pre class="javascript"><code>test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
-    assert md('<pre class="javascript"><code class="javascript">test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n```javascript\ntest\n    foo\nbar\n```\n'
+    assert md('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback) == '\n\n```python\ntest\n    foo\nbar\n```\n\n'
+    assert md('<pre class="javascript"><code>test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n\n```javascript\ntest\n    foo\nbar\n```\n\n'
+    assert md('<pre class="javascript"><code class="javascript">test\n    foo\nbar</code></pre>', code_language_callback=callback) == '\n\n```javascript\ntest\n    foo\nbar\n```\n\n'
+
+
+def test_spaces():
+    assert md('<p> a b </p> <p> c d </p>') == '\n\na b\n\nc d\n\n'
+    assert md('<p> <i>a</i> </p>') == '\n\n*a*\n\n'
+    assert md('test <p> again </p>') == 'test\n\nagain\n\n'
+    assert md('test <blockquote> text </blockquote> after') == 'test\n> text\n\nafter'
+    assert md(' <ol> <li> x </li> <li> y </li> </ol> ') == '\n\n1. x\n2. y\n'
+    assert md(' <ul> <li> x </li> <li> y </li> </ol> ') == '\n\n* x\n* y\n'
+    assert md('test <pre> foo </pre> bar') == 'test\n\n```\n foo\n```\n\nbar'
--- a/tests/test_custom_converter.py
+++ b/tests/test_custom_converter.py
@@ -2,21 +2,40 @@ from markdownify import MarkdownConverter
 from bs4 import BeautifulSoup


-class ImageBlockConverter(MarkdownConverter):
+class UnitTestConverter(MarkdownConverter):
    """
-    Create a custom MarkdownConverter that adds two newlines after an image
+    Create a custom MarkdownConverter for unit tests
    """
-    def convert_img(self, el, text, convert_as_inline):
-        return super().convert_img(el, text, convert_as_inline) + '\n\n'
+    def convert_img(self, el, text, parent_tags):
+        """Add two newlines after an image"""
+        return super().convert_img(el, text, parent_tags) + '\n\n'
+
+    def convert_custom_tag(self, el, text, parent_tags):
+        """Ensure conversion function is found for tags with special characters in name"""
+        return "convert_custom_tag(): %s" % text
+
+    def convert_h1(self, el, text, parent_tags):
+        """Ensure explicit heading conversion function is used"""
+        return "convert_h1: %s" % (text)
+
+    def convert_hN(self, n, el, text, parent_tags):
+        """Ensure general heading conversion function is used"""
+        return "convert_hN(%d): %s" % (n, text)


-def test_img():
+def test_custom_conversion_functions():
    # Create shorthand method for conversion
    def md(html, **options):
-        return ImageBlockConverter(**options).convert(html)
+        return UnitTestConverter(**options).convert(html)

-    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />') == '![Alt text](/path/to/img.jpg "Optional title")\n\n'
-    assert md('<img src="/path/to/img.jpg" alt="Alt text" />') == '![Alt text](/path/to/img.jpg)\n\n'
+    assert md('<img src="/path/to/img.jpg" alt="Alt text" title="Optional title" />text') == '![Alt text](/path/to/img.jpg "Optional title")\n\ntext'
+    assert md('<img src="/path/to/img.jpg" alt="Alt text" />text') == '![Alt text](/path/to/img.jpg)\n\ntext'
+
+    assert md("<custom-tag>text</custom-tag>") == "convert_custom_tag(): text"
+
+    assert md("<h1>text</h1>") == "convert_h1: text"
+
+    assert md("<h3>text</h3>") == "convert_hN(3): text"


 def test_soup():
--- a/tests/test_escaping.py
+++ b/tests/test_escaping.py
@@ -1,4 +1,6 @@
-from markdownify import markdownify as md
+import warnings
+from bs4 import MarkupResemblesLocatorWarning
+from .utils import md


 def test_asterisks():
@@ -12,7 +14,7 @@ def test_underscore():


 def test_xml_entities():
-    assert md('&amp;') == r'\&'
+    assert md('&amp;', escape_misc=True) == r'\&'


 def test_named_entities():
@@ -25,23 +27,51 @@ def test_hexadecimal_entities():


 def test_single_escaping_entities():
-    assert md('&amp;amp;') == r'\&amp;'
+    assert md('&amp;amp;', escape_misc=True) == r'\&amp;'


-def text_misc():
-    assert md('\\*') == r'\\\*'
-    assert md('<foo>') == r'\<foo\>'
-    assert md('# foo') == r'\# foo'
-    assert md('> foo') == r'\> foo'
-    assert md('~~foo~~') == r'\~\~foo\~\~'
-    assert md('foo\n===\n') == 'foo\n\\=\\=\\=\n'
-    assert md('---\n') == '\\-\\-\\-\n'
-    assert md('+ x\n+ y\n') == '\\+ x\n\\+ y\n'
-    assert md('`x`') == r'\`x\`'
-    assert md('[text](link)') == r'\[text](link)'
-    assert md('1. x') == r'1\. x'
-    assert md('not a number. x') == r'not a number. x'
-    assert md('1) x') == r'1\) x'
-    assert md('not a number) x') == r'not a number) x'
-    assert md('|not table|') == r'\|not table\|'
-    assert md(r'\ <foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
+def test_misc():
+    # ignore the bs4 warning that "1.2" or "*" looks like a filename
+    warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)
+
+    assert md('\\*', escape_misc=True) == r'\\\*'
+    assert md('&lt;foo>', escape_misc=True) == r'\<foo\>'
+    assert md('# foo', escape_misc=True) == r'\# foo'
+    assert md('#5', escape_misc=True) == r'#5'
+    assert md('5#', escape_misc=True) == '5#'
+    assert md('####### foo', escape_misc=True) == r'####### foo'
+    assert md('> foo', escape_misc=True) == r'\> foo'
+    assert md('~~foo~~', escape_misc=True) == r'\~\~foo\~\~'
+    assert md('foo\n===\n', escape_misc=True) == 'foo\n\\=\\=\\=\n'
+    assert md('---\n', escape_misc=True) == '\\---\n'
+    assert md('- test', escape_misc=True) == r'\- test'
+    assert md('x - y', escape_misc=True) == r'x \- y'
+    assert md('test-case', escape_misc=True) == 'test-case'
+    assert md('x-', escape_misc=True) == 'x-'
+    assert md('-y', escape_misc=True) == '-y'
+    assert md('+ x\n+ y\n', escape_misc=True) == '\\+ x\n\\+ y\n'
+    assert md('`x`', escape_misc=True) == r'\`x\`'
+    assert md('[text](notalink)', escape_misc=True) == r'\[text\](notalink)'
+    assert md('<a href="link">text]</a>', escape_misc=True) == r'[text\]](link)'
+    assert md('<a href="link">[text]</a>', escape_misc=True) == r'[\[text\]](link)'
+    assert md('1. x', escape_misc=True) == r'1\. x'
+    # assert md('1<span>.</span> x', escape_misc=True) == r'1\. x'
+    assert md('<span>1.</span> x', escape_misc=True) == r'1\. x'
+    assert md(' 1. x', escape_misc=True) == r' 1\. x'
+    assert md('123456789. x', escape_misc=True) == r'123456789\. x'
+    assert md('1234567890. x', escape_misc=True) == r'1234567890. x'
+    assert md('A1. x', escape_misc=True) == r'A1. x'
+    assert md('1.2', escape_misc=True) == r'1.2'
+    assert md('not a number. x', escape_misc=True) == r'not a number. x'
+    assert md('1) x', escape_misc=True) == r'1\) x'
+    # assert md('1<span>)</span> x', escape_misc=True) == r'1\) x'
+    assert md('<span>1)</span> x', escape_misc=True) == r'1\) x'
+    assert md(' 1) x', escape_misc=True) == r' 1\) x'
+    assert md('123456789) x', escape_misc=True) == r'123456789\) x'
+    assert md('1234567890) x', escape_misc=True) == r'1234567890) x'
+    assert md('(1) x', escape_misc=True) == r'(1) x'
+    assert md('A1) x', escape_misc=True) == r'A1) x'
+    assert md('1)x', escape_misc=True) == r'1)x'
+    assert md('not a number) x', escape_misc=True) == r'not a number) x'
+    assert md('|not table|', escape_misc=True) == r'\|not table\|'
+    assert md(r'\ &lt;foo> &amp;amp; | ` `', escape_misc=False) == r'\ <foo> &amp; | ` `'
--- a/tests/test_lists.py
+++ b/tests/test_lists.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md


 nested_uls = """
@@ -41,19 +41,22 @@ nested_ols = """


 def test_ol():
-    assert md('<ol><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '3. a\n4. b\n'
-    assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
-    assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '1. a\n2. b\n'
+    assert md('<ol><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol><!--comment--><li>a</li><span/><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="3"><li>a</li><li>b</li></ol>') == '\n\n3. a\n4. b\n'
+    assert md('foo<ol start="3"><li>a</li><li>b</li></ol>bar') == 'foo\n\n3. a\n4. b\n\nbar'
+    assert md('<ol start="-1"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="foo"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="1.5"><li>a</li><li>b</li></ol>') == '\n\n1. a\n2. b\n'
+    assert md('<ol start="1234"><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ol>') == '\n\n1234. first para\n\n      second para\n1235. third para\n\n      fourth para\n'


 def test_nested_ols():
-    assert md(nested_ols) == '\n1. 1\n\t1. a\n\t\t1. I\n\t\t2. II\n\t\t3. III\n\t2. b\n\t3. c\n2. 2\n3. 3\n'
+    assert md(nested_ols) == '\n\n1. 1\n   1. a\n      1. I\n      2. II\n      3. III\n   2. b\n   3. c\n2. 2\n3. 3\n'


 def test_ul():
-    assert md('<ul><li>a</li><li>b</li></ul>') == '* a\n* b\n'
+    assert md('<ul><li>a</li><li>b</li></ul>') == '\n\n* a\n* b\n'
    assert md("""<ul>
     <li>
             a
@@ -61,11 +64,13 @@ def test_ul():
     <li> b </li>
     <li>   c
     </li>
- </ul>""") == '* a\n* b\n* c\n'
+ </ul>""") == '\n\n* a\n* b\n* c\n'
+    assert md('<ul><li><p>first para</p><p>second para</p></li><li><p>third para</p><p>fourth para</p></li></ul>') == '\n\n* first para\n\n  second para\n* third para\n\n  fourth para\n'


 def test_inline_ul():
-    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == 'foo\n\n* a\n* b\n\nbar\n\n'
+    assert md('<p>foo</p><ul><li>a</li><li>b</li></ul><p>bar</p>') == '\n\nfoo\n\n* a\n* b\n\nbar\n\n'
+    assert md('foo<ul><li>bar</li></ul>baz') == 'foo\n\n* bar\n\nbaz'


 def test_nested_uls():
@@ -73,12 +78,12 @@ def test_nested_uls():
    Nested ULs should alternate bullet characters.

    """
-    assert md(nested_uls) == '\n* 1\n\t+ a\n\t\t- I\n\t\t- II\n\t\t- III\n\t+ b\n\t+ c\n* 2\n* 3\n'
+    assert md(nested_uls) == '\n\n* 1\n  + a\n    - I\n    - II\n    - III\n  + b\n  + c\n* 2\n* 3\n'


 def test_bullets():
-    assert md(nested_uls, bullets='-') == '\n- 1\n\t- a\n\t\t- I\n\t\t- II\n\t\t- III\n\t- b\n\t- c\n- 2\n- 3\n'
+    assert md(nested_uls, bullets='-') == '\n\n- 1\n  - a\n    - I\n    - II\n    - III\n  - b\n  - c\n- 2\n- 3\n'


 def test_li_text():
-    assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar  </li><li>foo <b>bar</b>   <i>space</i>.</ul>') == '* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
+    assert md('<ul><li>foo <a href="#">bar</a></li><li>foo bar  </li><li>foo <b>bar</b>   <i>space</i>.</ul>') == '\n\n* foo [bar](#)\n* foo bar\n* foo **bar** *space*.\n'
--- a/tests/test_tables.py
+++ b/tests/test_tables.py
@@ -1,4 +1,4 @@
-from markdownify import markdownify as md
+from .utils import md


 table = """<table>
@@ -141,6 +141,33 @@ table_head_body_missing_head = """<table>
    </tbody>
 </table>"""

+table_head_body_multiple_head = """<table>
+    <thead>
+        <tr>
+            <td>Creator</td>
+            <td>Editor</td>
+            <td>Server</td>
+        </tr>
+        <tr>
+            <td>Operator</td>
+            <td>Manager</td>
+            <td>Engineer</td>
+        </tr>
+    </thead>
+    <tbody>
+        <tr>
+            <td>Bob</td>
+            <td>Oliver</td>
+            <td>Tom</td>
+        </tr>
+        <tr>
+            <td>Thomas</td>
+            <td>Lucas</td>
+            <td>Ethan</td>
+        </tr>
+    </tbody>
+</table>"""
+
 table_missing_text = """<table>
    <thead>
        <tr>
@@ -201,7 +228,10 @@ table_body = """<table>
    </tbody>
 </table>"""

-table_with_caption = """TEXT<table><caption>Caption</caption>
+table_with_caption = """TEXT<table>
+    <caption>
+        Caption
+    </caption>
    <tbody><tr><td>Firstname</td>
            <td>Lastname</td>
            <td>Age</td>
@@ -237,18 +267,55 @@ table_with_undefined_colspan = """<table>
    </tr>
 </table>"""

+table_with_colspan_missing_head = """<table>
+    <tr>
+        <td colspan="2">Name</td>
+        <td>Age</td>
+    </tr>
+    <tr>
+        <td>Jill</td>
+        <td>Smith</td>
+        <td>50</td>
+    </tr>
+    <tr>
+        <td>Eve</td>
+        <td>Jackson</td>
+        <td>94</td>
+    </tr>
+</table>"""
+

 def test_table():
    assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_paragraphs) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
-    assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith  Jackson | 50 |\n| Eve | Jackson  Smith | 94 |\n\n'
+    assert md(table_with_linebreaks) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
    assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_head_body_multiple_head) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Creator | Editor | Server |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n'
    assert md(table_head_body_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_missing_text) == '\n\n|  | Lastname | Age |\n| --- | --- | --- |\n| Jill |  | 50 |\n| Eve | Jackson | 94 |\n\n'
-    assert md(table_missing_head) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
-    assert md(table_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
-    assert md(table_with_caption) == 'TEXT\n\nCaption\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
+    assert md(table_missing_head) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_body) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_caption) == 'TEXT\n\nCaption\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n\n'
    assert md(table_with_colspan) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
    assert md(table_with_undefined_colspan) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
+    assert md(table_with_colspan_missing_head) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Name | | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+
+
+def test_table_infer_header():
+    assert md(table, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_html_content, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_paragraphs, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_linebreaks, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith Jackson | 50 |\n| Eve | Jackson Smith | 94 |\n\n'
+    assert md(table_with_header_column, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_head_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_head_body_multiple_head, table_infer_header=True) == '\n\n| Creator | Editor | Server |\n| --- | --- | --- |\n| Operator | Manager | Engineer |\n| Bob | Oliver | Tom |\n| Thomas | Lucas | Ethan |\n\n'
+    assert md(table_head_body_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_missing_text, table_infer_header=True) == '\n\n|  | Lastname | Age |\n| --- | --- | --- |\n| Jill |  | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_missing_head, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_body, table_infer_header=True) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_caption, table_infer_header=True) == 'TEXT\n\nCaption\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n\n'
+    assert md(table_with_colspan, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_undefined_colspan, table_infer_header=True) == '\n\n| Name | Age |\n| --- | --- |\n| Jill | Smith |\n\n'
+    assert md(table_with_colspan_missing_head, table_infer_header=True) == '\n\n| Name | | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
--- a/tests/types.py
+++ b/tests/types.py
@@ -0,0 +1,70 @@
+from markdownify import markdownify, ASTERISK, BACKSLASH, LSTRIP, RSTRIP, SPACES, STRIP, UNDERLINED, UNDERSCORE, MarkdownConverter
+from bs4 import BeautifulSoup
+from typing import Union
+
+markdownify("<p>Hello</p>") == "Hello"  # test default of STRIP
+markdownify("<p>Hello</p>", strip_document=LSTRIP) == "Hello\n\n"
+markdownify("<p>Hello</p>", strip_document=RSTRIP) == "\n\nHello"
+markdownify("<p>Hello</p>", strip_document=STRIP) == "Hello"
+markdownify("<p>Hello</p>", strip_document=None) == "\n\nHello\n\n"
+
+# default options
+MarkdownConverter(
+    autolinks=True,
+    bs4_options='html.parser',
+    bullets='*+-',
+    code_language='',
+    code_language_callback=None,
+    convert=None,
+    default_title=False,
+    escape_asterisks=True,
+    escape_underscores=True,
+    escape_misc=False,
+    heading_style=UNDERLINED,
+    keep_inline_images_in=[],
+    newline_style=SPACES,
+    strip=None,
+    strip_document=STRIP,
+    strip_pre=STRIP,
+    strong_em_symbol=ASTERISK,
+    sub_symbol='',
+    sup_symbol='',
+    table_infer_header=False,
+    wrap=False,
+    wrap_width=80,
+).convert("")
+
+# custom options
+MarkdownConverter(
+    strip_document=None,
+    bullets="-",
+    escape_asterisks=True,
+    escape_underscores=True,
+    escape_misc=True,
+    autolinks=True,
+    default_title=True,
+    newline_style=BACKSLASH,
+    sup_symbol='^',
+    sub_symbol='^',
+    keep_inline_images_in=['h3'],
+    wrap=True,
+    wrap_width=80,
+    strong_em_symbol=UNDERSCORE,
+    code_language='python',
+    code_language_callback=None
+).convert("")
+
+html = '<b>test</b>'
+soup = BeautifulSoup(html, 'html.parser')
+MarkdownConverter().convert_soup(soup) == '**test**'
+
+
+def callback(el: BeautifulSoup) -> Union[str, None]:
+    return el['class'][0] if el.has_attr('class') else None
+
+
+MarkdownConverter(code_language_callback=callback).convert("")
+MarkdownConverter(code_language_callback=lambda el: None).convert("")
+
+markdownify('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=callback)
+markdownify('<pre class="python">test\n    foo\nbar</pre>', code_language_callback=lambda el: None)
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -0,0 +1,9 @@
+from markdownify import MarkdownConverter
+
+
+# Unit-test helper: convert `html` with document-level stripping disabled by
+# default (strip_document=None) so that separating newlines stay in the output.
+def md(html, **options):
+    options = {"strip_document": None, **options}  # caller-supplied options override the default
+
+    return MarkdownConverter(**options).convert(html)
Author	SHA1	Message	Date
AlexVonB	241ed02bc1	Merge branch 'develop'	2025-11-16 20:19:50 +01:00
AlexVonB	add391a623	bump to version v1.2.2	2025-11-16 20:19:33 +01:00
AlexVonB	e89cc2a1f8	Merge branch 'develop'	2025-11-16 20:15:01 +01:00
Gareth Jones	aafa4c3b16	fix: include `py.typed` file (#235 )	2025-11-16 20:07:11 +01:00
AlexVonB	c47709c21c	Merge branch 'develop'	2025-08-09 19:41:10 +02:00
AlexVonB	fbc1353593	bump to version v1.2.0	2025-08-09 19:40:43 +02:00
Gareth Jones	85ef82e083	Add basic type stubs (#221 ) (#215 ) * feat: add basic type stubs * feat: add types for constants * feat: add type for `MarkdownConverter` class * ci: add basic job for checking types * feat: add new constant * ci: install types as required * ci: install types package manually * test: add strict coverage for types * fix: allow `strip_document` to be `None` * feat: expand types for MarkdownConverter * fix: do not use `Unpack` as it requires Python 3.12 * feat: define `MarkdownConverter#convert_soup` * feat: improve type for `code_language_callback` * chore: add end-of-file newline * refactor: use `Union` for now	2025-08-03 06:35:46 -04:00
Gareth Jones	f7053e46ab	docs: fix typo (#234 )	2025-08-03 06:24:28 -04:00
Gareth Jones	7edbc5a22b	ci: update `actions/checkout` to v4 (#233 ) * ci: update `actions/checkout` to v4	2025-07-14 21:52:04 +02:00
alheiveea	76e5edb357	limit colspan values to range [1, 1000] (#232 )	2025-07-09 22:08:47 +02:00
Chris Papademetrious	48724e7002	support backticks in <code> spans (#226 ) (#230 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-06-29 14:56:21 -04:00
Chris Papademetrious	9b1412aa5b	implement a strip_pre configuration option (#218 ) (#222 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-06-14 16:37:47 -04:00
Chris Papademetrious	75ab3064dd	allow BeautifulSoup configuration kwargs to be specified (#224 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-06-14 09:06:22 -04:00
Chris Papademetrious	016251e915	ensure that explicitly provided heading conversion functions are used (#212 ) (#214 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-05-03 10:57:09 -04:00
Colin	0e1a849346	Add conversion support for <q> tags (#217 )	2025-04-28 06:37:33 -04:00
Chris Papademetrious	e29de4e753	make convert_hn() public instead of internal (#213 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-04-20 06:20:01 -04:00
Vincent Kelleher	2d654a6b7e	Add beautiful_soup_parser option (#206 ) * add beautiful_soup_parser option * add Beautiful Soup parser argument to command line --------- Co-authored-by: Vincent Kelleher <vincent.kelleher-ext@francetravail.fr> Co-authored-by: AlexVonB <AlexVonB@users.noreply.github.com>	2025-03-29 11:29:29 +01:00
chrispy	26566891a7	Merge branch 'develop'	2025-03-05 06:48:47 -05:00
chrispy	13183f9925	bump to version v1.1.0 Signed-off-by: chrispy <chrispy@synopsys.com>	2025-03-05 06:47:28 -05:00
Stephen V. Brown	7908f1492a	Generalize handling of colspan in case where colspan is in first row but header row is missing (#203 )	2025-03-04 20:01:16 -05:00
Chris Papademetrious	618747c18c	in inline contexts, resolve <br/> to a space instead of an empty string (#202 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-03-04 07:37:22 -05:00
Chris Papademetrious	5122c973c1	add missing newlines for definition lists (#200 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-03-02 06:42:56 -05:00
itmammoth	ac5736f0a3	Support `video` tag with `poster` attribute (#189 )	2025-02-28 10:51:42 +01:00
chrispy	47856cd429	Merge branch 'develop'	2025-02-24 16:20:32 -05:00
chrispy	daa9e28287	bump to version v1.0.0 Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-24 16:18:23 -05:00
Chris Papademetrious	ba5e222b45	use compiled regex for escaping patterns (#194 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-24 12:29:09 -05:00
Chris Papademetrious	6984dca7ab	use a conversion function cache to improve runtime (#196 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-24 11:48:40 -05:00
Chris Papademetrious	24977fd192	rename regex pattern variables (#195 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-19 20:01:12 -05:00
Joseph Myers	c7329ac1ef	Escape right square brackets (#187 )	2025-02-19 10:04:29 -05:00
Joseph Myers	3311f4d896	Avoid stripping nonbreaking spaces (#188 )	2025-02-19 07:40:53 -05:00
Chris Papademetrious	5655f27208	propagate parent tag context downward to improve runtime (#191 )	2025-02-18 16:35:36 -05:00
Chris Papademetrious	c52ba47166	use list-based processing (inspired by AlextheYounga) (#186 )	2025-02-17 05:47:19 -08:00
Chris Papademetrious	3026602686	make conversion non-destructive to soup; improve div/article/section handling (#184 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-04 18:09:24 -05:00
Chris Papademetrious	c52a50e66a	when computing <ol><li> numbering, ignore non-<li> previous siblings (#183 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-04 15:39:32 -05:00
Chris Papademetrious	d0c4b85fd5	simplify computation of convert_children_as_inline variable (#182 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-02-04 15:20:42 -05:00
Chris Papademetrious	ae0597d80c	remove superfluous leading/trailing whitespace (#181 )	2025-01-27 11:55:32 -05:00
Chris Papademetrious	dbb5988802	add blank line before/after preformatted block (#179 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-01-21 11:01:11 -05:00
Chris Papademetrious	f24ec9e83c	add blank line before ATX-style headings to avoid ambiguity (#178 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-01-21 11:00:51 -05:00
Chris Papademetrious	7fec8a2080	code simplification to remove need for children_only parameter (#174 ) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-01-19 10:23:58 -05:00
Fess-AKA-DeadMonk	1b3333073a	for convert_* functions, allow for tags with special characters in their name (like "subtag-name") (#136 ) support custom conversion functions for tags with `:` and `-` characters in their names by mapping them to underscores in the function name	2025-01-19 09:48:08 -05:00
SomeBottle	3bf0b527a4	Add a new configuration option to control table header row inference (#161 ) Add option to infer first table row as table header (defaults to false)	2025-01-19 08:13:24 -05:00
Chris Papademetrious	1783995cb2	Merge pull request #173 from chrispy-snps/chrispy/support-definition-lists support HTML definition lists (`<dl>`, `<dt>`, and `<dd>`)	2025-01-18 19:45:03 -05:00
chrispy	0fb855676d	support HTML definition lists (<dl>, <dt>, and <dd>) Signed-off-by: chrispy <chrispy@synopsys.com>	2025-01-18 19:43:28 -05:00
Chris Papademetrious	f73a435315	Merge pull request #171 from chrispy-snps/chrispy/optimize-li-blockquote-empty-lines optimize empty-line handling for li and blockquote content	2025-01-18 19:30:06 -05:00
chrispy	17c3678d0e	optimize empty-line handling for li and blockquote content Signed-off-by: chrispy <chrispy@synopsys.com>	2025-01-18 19:25:03 -05:00
Chris Papademetrious	600f77d244	allow a wrap_width value of None for unlimited line lengths (#169 ) allow a wrap_width value of None to reflow text to unlimited line length	2025-01-18 19:20:22 -05:00
Chris Papademetrious	9339571ae9	Merge pull request #167 from chrispy-snps/chrispy/table-caption-blank-line insert a blank line between table caption, table content	2025-01-18 19:09:24 -05:00
Chris Papademetrious	5bc3059abf	Merge pull request #165 from chrispy-snps/chrispy/fix-a-in-code do not construct Markdown links in code spans and code blocks	2025-01-18 19:06:51 -05:00
chrispy	1009087d41	insert a blank line between table caption, table content Signed-off-by: chrispy <chrispy@synopsys.com>	2024-12-29 13:52:32 -05:00
chrispy	71e1471e18	do not construct Markdown links in code spans and code blocks Signed-off-by: chrispy <chrispy@synopsys.com>	2024-12-29 12:33:46 -05:00
AlexVonB	8f70e3952f	Merge branch 'develop'	2024-11-24 23:05:17 +01:00
AlexVonB	6258f5c38b	bump to version v0.14.1	2024-11-24 23:05:02 +01:00
AlexVonB	3466061ca9	prevent `<hn>` to call convert_hn and crash fixes #142	2024-11-24 21:20:57 +01:00
AlexVonB	9595618796	prevent very large headline prefixes for example: `<h9999999>` could crash the conversion. fixes #143	2024-11-24 21:11:42 +01:00
AlexVonB	e935ce819e	Merge branch 'develop'	2024-11-24 12:26:53 +01:00
AlexVonB	fe8a821a20	bump to version v0.14.0	2024-11-24 12:26:29 +01:00
AlexVonB	54c7ca9937	renamed functions that return boolean	2024-11-24 12:10:57 +01:00
AlexVonB	19780834af	Merge branch 'alfonsrv-fix-pr-118' into jsm28-list-indentation	2024-11-24 12:07:59 +01:00
AlexVonB	9202027e26	ignore bs4 warnings in tests	2024-11-24 12:00:27 +01:00
AlexVonB	9bf4ff14b9	Merge branch 'jsm28-selective-escaping' into jsm28-list-indentation	2024-11-20 14:16:06 +01:00
alfonsrv	7ff4d835ae	Set escape_misc to False by default to improve backwards compatibility	2024-10-09 18:55:50 +02:00
Joseph Myers	c13bdd5c14	Fix logic for indentation inside list items This fixes problems with the markdownify logic for indentation inside list items. This PR uses a branch building on that for #120, #150 and #151, so those three PRs should be merged first before merging this one. There is limited logic in markdownify for handling indentation in the case of nested lists. There are two major problems with this logic: * As it's in `convert_list`, causing a list to be indented when inside another list, it does not add indentation for any other elements such as paragraphs that may be found inside list items (or `<pre>`, `<blockquote>`, etc.), so such elements are wrongly not indented and terminate the list in the output. * It uses fixed indentation of one tab. Following CommonMark, a tab in Markdown is considered equivalent to four spaces, which is not sufficient indentation in ordered list items with a number of three or more digits. Fix both of these issues by making `convert_li` handle indentation for the contents of `<li>`, based on the length of the list item marker, rather than doing it in `convert_list` at all.	2024-10-03 21:04:40 +00:00
Joseph Myers	340aecbe98	More thorough cleanup of input whitespace This improves the markdownify logic for cleaning up input whitespace that has no semantic significance in HTML. This PR uses a branch based on that for #150 (which in turn is based on that for #120) to avoid conflicts with those fixes. The suggested order of merging is just first to merge #120, then the rest of #150, then the rest of this PR. Whitespace in HTML input isn't generally significant before or after block-level elements, or at the start or end of such an element other than `<pre>`. There is some limited logic in markdownify for removing it, (a) for whitespace-only nodes in conjunction with a limited list of elements (and with questionable logic that only removes whitespace adjacent to such an element when also inside such an element) and (b) only for trailing whitespace, in certain places in relation to lists. Replace both those places with more thorough logic using a common list of block-level elements (which could be expanded more). In general, this reduces the number of unnecessary blank lines in output from markdownify (sometimes lines with just a newline, sometimes lines containing a space as well as that newline). There are open issues about cases where propagating such input whitespace to the output actually results in badly formed Markdown output (wrongly indented output), but #120 (which this builds on) fixes those issues, sometimes leaving unnecessary lines with just a space on them in the output, which are dealt with fully by the present PR. There are a few testcases that are affected because they were relying on such whitespace for good output from bad HTML input that used `<p>` or `<blockquote>` inside header tags. 
To keep reasonable output in those cases of bad input now input whitespace adjacent to those two tags is ignored, make the `<p>` and `<blockquote>` output explicitly include leading and trailing spaces if `convert_as_inline`; such explicit spaces seem the best that can be done for such bad input. Given those fixes, all the remaining changes needed to the expectations of existing tests seem like improvements (removing useless spaces or newlines from the output).	2024-10-03 20:16:23 +00:00
Joseph Myers	c2ffe46e85	Fix whitespace issues around wrapping This fixes various issues relating to how input whitespace is handled and how wrapping handles whitespace resulting from hard line breaks. This PR uses a branch based on that for #120 to avoid conflicts with the fixes and associated test changes there. My suggestion is thus first to merge #120 (which fixes two open issues), then to merge the remaining changes from this PR. Wrapping paragraphs has the effect of losing all newlines including those from `<br>` tags, contrary to HTML semantics (wrapping should be a matter of pretty-printing the output; input whitespace from the HTML input should be normalized, but `<br>` should remain as a hard line break). To fix this, we need to wrap the portions of a paragraph between hard line breaks separately. For this to work, ensure that when wrapping, all input whitespace is normalized at an early stage, including turning newlines into spaces. (Only ASCII whitespace is handled this way; `\s` is not used as it's not clear Unicode whitespace should get such normalization.) When not wrapping, there is still too much input whitespace preservation. If the input contains a blank line, that ends up as a paragraph break in the output, or breaks the header formatting when appearing in a header tag, though in terms of HTML semantics such a blank line is no different from a space. In the case of an ATX header, even a single newline appearing in the output breaks the Markdown. Thus, when not wrapping, arrange for input whitespace containing at least one `\r` or `\n` to be normalized to a single newline, and in the ATX header case, normalize to a space. Fixes #130 (probably, not sure exactly what the HTML input there is) Fixes #88 (a related case, anyway; the actual input in #88 has already been fixed)	2024-10-03 00:30:50 +00:00
Joseph Myers	a369e07211	More selective escaping of `-#.)` (alternative approach) This is a partial alternative to #122 (open since April) for more selective escaping of some special characters. Here, we fix the test function naming (as noted in that PR) so the tests are actually run (and fix some incorrect test assertions so they pass). We also make escaping of `-#.)` (the most common cases of unnecessary escaping in my use case) more selective, while still being conservatively safe in escaping all cases of those characters that might have Markdown significance (including in the presence of wrapping, unlike in #122). (Being conservatively safe doesn't include the cases where `.` or `)` start a fragment, where the existing code already was not conservatively safe.) There are certainly more cases where the code could also be made more selective while remaining conservatively safe (including in the presence of wrapping), so this is not a complete replacement for #122, but by fixing some of the most common cases in a safe way, and getting the tests actually running, I hope this allows progress to be made where the previous attempt appears to have stalled, while still allowing further incremental progress with appropriately safe logic for other characters where useful.	2024-10-02 21:59:39 +00:00
Joseph Myers	4399ee75db	Merge branch 'develop' into para-newlines-92-98	2024-09-30 18:05:32 +00:00
AlexVonB	b5c724ab33	Merge branch 'develop'	2024-07-14 22:40:15 +02:00
AlexVonB	964d89fa8a	bump to version v0.13.1	2024-07-14 22:40:02 +02:00
AlexVonB	46dc1a002d	Migrated the metadata into PEP 621-compliant pyproject.toml (#138 ) * Move the metadata from `setup.py` into `setup.cfg`. Added `pyproject.toml`. Removed `setup.py` - it is no longer needed. Got rid of tests erroneously finding their way into the wheel. * Started populating version automatically from git tags using `setuptools_scm`. * Migrated the metadata into `PEP 621`-compliant `pyproject.toml`, got rid of `setup.cfg`. * test build in develop and pull requests * use static version instead of dynamic git tag info --------- Co-authored-by: KOLANICH <kolan_n@mail.ru>	2024-07-14 22:38:29 +02:00
AlexVonB	8c810eb8a8	Merge branch 'develop'	2024-07-14 21:20:04 +02:00
Joseph Myers	60d86663d7	More carefully separate inline text from block content There are various cases in which inline text fails to be separated by (sufficiently many) newlines from adjacent block content. A paragraph needs a blank line (two newlines) separating it from prior text, as does an underlined header; an ATX header needs a single newline separating it from prior text. A list needs at least one newline separating it from prior text, but in general two newlines (for an ordered list starting other than at 1, which will only be recognized given a blank line before). To avoid accumulation of more newlines than necessary, take care when concatenating the results of converting consecutive tags to remove redundant newlines (keeping the greater of the number ending the prior text and the number starting the subsequent text). This is thus an alternative to #108 that tries to avoid the excess newline accumulation that was a concern there, as well as fixing more cases than just paragraphs, and updating tests. Fixes #92 Fixes #98	2024-04-09 16:54:33 +00:00
AlexVonB	383847ee86	Merge branch 'develop'	2024-03-26 21:56:09 +01:00
AlexVonB	be3a7f4672	Merge branch 'develop'	2024-03-26 21:52:16 +01:00
AlexVonB	8219d2a673	Merge branch 'develop'	2022-09-02 10:11:08 +02:00
AlexVonB	0c8ac578c9	Merge branch 'develop'	2022-08-31 21:45:38 +02:00
AlexVonB	8f047753ae	Merge branch 'develop'	2022-08-28 22:03:22 +02:00
AlexVonB	194c646a20	Merge branch 'develop'	2022-08-28 21:43:12 +02:00
AlexVonB	2c533339cf	Merge branch 'develop'	2022-04-24 11:01:54 +02:00
AlexVonB	2b8cf444f1	Merge branch 'develop'	2022-04-14 10:25:35 +02:00
AlexVonB	d375116807	Merge branch 'develop'	2022-04-13 20:47:52 +02:00
AlexVonB	eb0330bfc6	Merge branch 'develop'	2022-01-23 11:01:45 +01:00
AlexVonB	28793ac0b3	Merge branch 'develop'	2022-01-18 08:56:33 +01:00
AlexVonB	9231704988	Merge branch 'develop'	2021-12-11 14:44:58 +01:00
AlexVonB	1613c302bc	Merge branch 'develop'	2021-11-17 17:11:01 +01:00
AlexVonB	55c9e84f38	Merge branch 'develop'	2021-09-04 21:50:34 +02:00
AlexVonB	99875683ac	Merge branch 'develop'	2021-08-25 08:53:38 +02:00
AlexVonB	eaeb0603eb	Merge branch 'develop'	2021-07-11 13:21:20 +02:00
AlexVonB	cb73590623	Merge branch 'develop'	2021-07-11 13:14:29 +02:00
AlexVonB	59417ab115	Merge branch 'develop'	2021-05-30 19:10:49 +02:00
AlexVonB	917b01e548	Merge branch 'develop'	2021-05-30 11:20:32 +02:00
AlexVonB	652714859d	Merge branch 'develop'	2021-05-21 14:18:14 +02:00
AlexVonB	ea5b22824b	Merge branch 'develop'	2021-05-18 10:42:27 +02:00
AlexVonB	ec5858e42f	Merge branch 'develop'	2021-05-16 18:41:24 +02:00
AlexVonB	02bb914ef3	Merge branch 'develop'	2021-05-02 13:49:30 +02:00
AlexVonB	21c0d034d0	Merge branch 'develop'	2021-05-02 10:51:00 +02:00
AlexVonB	e3ddc789a2	Merge branch 'develop'	2021-04-22 12:43:27 +02:00
AlexVonB	2d0cd97323	Merge branch 'develop'	2021-04-22 12:13:03 +02:00
AlexVonB	ec185e2e9c	Merge branch 'develop'	2021-02-21 23:09:55 +01:00
AlexVonB	079d1721aa	Merge branch 'develop'	2021-02-21 20:58:34 +01:00
AlexVonB	bf24df3e2e	bump to v0.6.3	2021-01-12 22:43:18 +01:00
AlexVonB	15329588b1	Merge branch 'develop'	2021-01-12 22:42:58 +01:00
AlexVonB	34ad8485fa	bump to v0.6.2	2021-01-12 22:40:03 +01:00
AlexVonB	f0ce934bf8	Merge branch 'develop'	2021-01-12 22:39:47 +01:00
AlexVonB	99cd237f27	Merge branch 'develop'	2021-01-04 10:22:02 +01:00
AlexVonB	2bde8d3e8e	Merge branch 'develop'	2021-01-02 16:49:28 +01:00
AlexVonB	8c9b029756	Merge branch 'develop'	2020-09-01 18:10:07 +02:00
AlexVonB	ae50065872	Merge branch 'develop'	2020-08-18 18:53:10 +02:00