rofl0r · June 8, 2017 20:43
diff --git a/python3_fixes_with_bs4_support.patch b/python3_fixes_with_bs4_support.patch
 diff --git a/python/gumbo/bs4_adapter.py b/python/gumbo/bs4_adapter.py
 new file mode 100644
 index 0000000..5a8d273
 --- /dev/null
 +++ b/python/gumbo/bs4_adapter.py
 @@ -0,0 +1,183 @@
 +# -*- coding: utf-8 -*-
 +# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
 +
 +from __future__ import unicode_literals, print_function
 +
 +# Copyright 2012 Google Inc. All Rights Reserved.
 +# Modifications to use BeautifulSoup4 Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, Canada
 +#
 +# Licensed under the Apache License, Version 2.0 (the "License");
 +# you may not use this file except in compliance with the License.
 +# You may obtain a copy of the License at
 +#
 +#     http://www.apache.org/licenses/LICENSE-2.0
 +#
 +# Unless required by applicable law or agreed to in writing, software
 +# distributed under the License is distributed on an "AS IS" BASIS,
 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# See the License for the specific language governing permissions and
 +# limitations under the License.
 +#
 +
 +# Should this be reworked to be a bs4 treebuilder?
 +
 +"""
 +  Adapter between Gumbo and BeautifulSoup4.
 +  This parses an HTML document and gives back a BeautifulSoup4 object, which you
 +  can then manipulate like a normal BeautifulSoup4 parse tree.
 +
 +  Groks namespaces on elements and attributes
 +"""
 +
 +__author__ = '[email protected] (Jonathan Tang)'
 +
 +import sys
 +import gumboc
 +
 +import bs4
 +# uses bs4.element classes:
 +#      Comment, Doctype, NavigableString, CData, Tag, NamespacedAttribute, whitespace_re
 +
 +# These should be indexed by the enum
 +# values of gumboc.Namespace
 +
 +_NAMESPACES = [
 +    'http://www.w3.org/1999/xhtml',
 +    'http://www.w3.org/2000/svg',
 +    'http://www.w3.org/1998/Math/MathML',
 +    ]
 +
 +
 +def _fromutf8(text):  # UTF-8 bytes -> text; undecodable byte sequences become U+FFFD
 +    return text.decode('utf-8', 'replace')
 +
 +
 +def _add_source_info(obj, original_text, start_pos, end_pos):
 +    """Record the raw source text and its start (and optional end) position on obj."""
 +    obj.original = _fromutf8(bytes(original_text))
 +    obj.line, obj.col, obj.offset = (
 +        start_pos.line, start_pos.column, start_pos.offset)
 +    if not end_pos:
 +        return
 +    obj.end_line, obj.end_col, obj.end_offset = (
 +        end_pos.line, end_pos.column, end_pos.offset)
 +
 +
 +def _convert_attrs(element_attrs):
 +    """Map each attribute name to its value; values containing a space become lists."""
 +    def maybe_namespace(attr):
 +        if attr.namespace != gumboc.AttributeNamespace.NONE:
 +            name = _fromutf8(attr.name)
 +            prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
 +            nsurl = attr.namespace.to_url()
 +            return bs4.element.NamespacedAttribute(prefix, name, nsurl)  # key keeps prefix + URL
 +        return _fromutf8(attr.name)
 +    def maybe_value_list(attr):
 +        value = _fromutf8(attr.value)
 +        if " " in value:
 +            value = bs4.element.whitespace_re.split(value)
 +        return value
 +    return dict((maybe_namespace(attr), maybe_value_list(attr)) for attr in element_attrs)
 +
 +
 +def _add_document(soup, element):
 +    """Hand a Doctype node to soup when the parsed document declared one."""
 +    # Mimic html5lib behavior: if no doctype token, no doctype node.
 +    if element.has_doctype:
 +        ids = [_fromutf8(part) for part in (element.name,
 +                                            element.public_identifier,
 +                                            element.system_identifier)]
 +        soup.object_was_parsed(bs4.element.Doctype.for_name_and_ids(*ids))
 +
 +
 +def _add_element(soup, element):
 +    """Convert a Gumbo element and, via _add_node, its children into a bs4 Tag."""
 +    result = bs4.element.Tag(parser=soup, name=_fromutf8(element.tag_name),
 +                             namespace=_NAMESPACES[element.tag_namespace.value],
 +                             attrs=_convert_attrs(element.attributes))
 +    for gumbo_child in element.children:
 +        result.append(_add_node(soup, gumbo_child))
 +    result.original_end_tag = _fromutf8(bytes(element.original_end_tag))
 +    _add_source_info(result, element.original_tag, element.start_pos, element.end_pos)
 +    return result
 +
 +
 +def _add_text(cls):  # factory: returns a handler that builds `cls` nodes from Gumbo text
 +    def add_text_internal(soup, element):
 +        node = cls(_fromutf8(element.text))
 +        _add_source_info(node, element.original_text, element.start_pos, None)  # no end position
 +        return node
 +    return add_text_internal
 +
 +
 +_HANDLERS = [  # looked up by node.type.value in _add_node, so the order is significant
 +    _add_document,                              # DOCUMENT
 +    _add_element,                               # ELEMENT
 +    _add_text(bs4.element.NavigableString),     # TEXT
 +    _add_text(bs4.element.CData),               # CDATA
 +    _add_text(bs4.element.Comment),             # COMMENT
 +    _add_text(bs4.element.NavigableString),     # WHITESPACE
 +    _add_element,                               # TEMPLATE
 +    ]
 +
 +
 +def _add_node(soup, node):  # dispatch on node.type.value to the matching _HANDLERS entry
 +    return _HANDLERS[node.type.value](soup, node.contents)
 +
 +
 +def _add_next_prev_pointers(soup):
 +    """Chain soup and its descendants via next_element/previous_element, ordered by offset."""
 +    def _traverse(node):
 +        # .findAll needs the .next_element pointers this function is still building,
 +        # so walk the tree by hand and yield the nodes in DOM order.
 +        yield node
 +        try:
 +            for child in node.contents:
 +                for descendant in _traverse(child):
 +                    yield descendant
 +        except AttributeError:
 +            # Not an element.
 +            return
 +    nodes = sorted(_traverse(soup), key=lambda node: node.offset)
 +    if nodes:
 +        nodes[0].previous_element = None
 +        nodes[-1].next_element = None
 +    for before, after in zip(nodes, nodes[1:]):  # every adjacent pair, the last one included
 +        before.next_element = after
 +        after.previous_element = before
 +
 +
 +def parse(text, **kwargs):  # HTML text -> BeautifulSoup object; kwargs are passed to gumboc.parse
 +    with gumboc.parse(text, **kwargs) as output:
 +        soup = bs4.BeautifulSoup('', "html.parser")  # empty soup, filled from Gumbo's tree below
 +        _add_document(soup, output.contents.document.contents)
 +        for node in output.contents.document.contents.children:
 +            soup.append(_add_node(soup, node))
 +        _add_next_prev_pointers(soup.html)  # relinks the <html> subtree only
 +        return soup
 +
 +
 +def main():  # demo: parse a sample XHTML page, print it, then print two lookups
 +    samp = """
 +<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
 +  "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
 +<html xmlns="http://www.w3.org/1999/xhtml/" xml:lang="en" lang="en-US">
 +<head><title>testing & entities</title></head>
 +<body>
 +  <p class="first second">this&nbsp;is&#160;the&#xa0;<i><b>copyright</i></b> symbol "&copy;"</p>
 +  <p xmlns:xlink="http://www.w3.org/xlink" class="second" xlink:href="http://www.ggogle.com">
 +     this used to test atribute namespaces
 + </p>
 +</body>
 +</html>
 +"""
 +    soup = parse(samp)
 +    print(soup.decode())
 +    for node in soup.findAll("head"):
 +        print(node)
 +    for node in soup.find_all(attrs={'class':'second'}):
 +        print(node)
 +    return 0  # used as the process exit status by the __main__ guard
 +
 +if __name__ == '__main__':
 +    sys.exit(main())
 diff --git a/python/gumbo/bs4_adapter_test.py b/python/gumbo/bs4_adapter_test.py
 new file mode 100644
 index 0000000..aa25d4b
 --- /dev/null
 +++ b/python/gumbo/bs4_adapter_test.py
 @@ -0,0 +1,66 @@
 +from __future__ import unicode_literals, print_function
 +
 +# Copyright 2012 Google Inc. All Rights Reserved.
 +#
 +# Licensed under the Apache License, Version 2.0 (the "License");
 +# you may not use this file except in compliance with the License.
 +# You may obtain a copy of the License at
 +#
 +#     http://www.apache.org/licenses/LICENSE-2.0
 +#
 +# Unless required by applicable law or agreed to in writing, software
 +# distributed under the License is distributed on an "AS IS" BASIS,
 +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 +# See the License for the specific language governing permissions and
 +# limitations under the License.
 +#
 +"""Tests for the Gumbo's BeautifulSoup Python adapter."""
 +
 +__author__ = '[email protected] (Jonathan Tang)'
 +
 +import unittest
 +
 +import bs4_adapter
 +
 +
 +class SoupAdapterTest(unittest.TestCase):  # end-to-end check of bs4_adapter.parse
 +
 +  def testSimpleParse(self):
 +    soup = bs4_adapter.parse(
 +        """
 +        <ul>
 +          <li class=odd><a href="one.html">One</a>
 +          <li class="even"><a href="two.html">Two</a>
 +          <li class='odd'><a href="three.html">Three</a>
 +          <li class="even"><a href="four.html">Four</a>
 +        </ul>
 +        """)
 +    # The input has no <head>; the test expects an empty one under the root element.
 +    head = soup.head
 +    self.assertEqual(soup, head.parent.parent)
 +    self.assertEqual(u'head', head.name)
 +    self.assertEqual(0, len(head))
 +    # <body> is expected right after <head> and to hold the list.
 +    body = soup.body
 +    self.assertEqual(head, body.previousSibling)
 +    self.assertEqual(2, len(body))  # <ul> + trailing whitespace
 +    self.assertEqual(u'ul', body.contents[0].name)
 +    # All four <li> elements must be found although none is explicitly closed.
 +    list_items = body.findAll('li')
 +    self.assertEqual(4, len(list_items))
 +    # Calling a tag is bs4 shorthand for find_all; the string 'even' filters on class.
 +    evens = body('li', 'even')
 +    self.assertEqual(2, len(evens))
 +    # Keyword arguments filter on attribute values.
 +    a2 = body.find('a', href='two.html')
 +    self.assertEqual(u'a', a2.name)
 +    self.assertEqual(u'Two', a2.contents[0])
 +    # Walking up from the link must reach the same <li> found by the searches above.
 +    li2 = a2.parent
 +    self.assertEqual(u'li', li2.name)
 +    self.assertEqual(u'even', li2['class'])
 +    self.assertEqual(list_items[1], li2)
 +    self.assertEqual(evens[0], li2)
 +
 +if __name__ == '__main__':
 +  unittest.main()
 diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
 index 04da319..887f8e2 100644
 --- a/python/gumbo/gumboc.py
 +++ b/python/gumbo/gumboc.py
 @@ -13,6 +13,8 @@
 # limitations under the License.
 #
 
 +from __future__ import unicode_literals, print_function
 +
 """CTypes bindings for the Gumbo HTML5 parser.
 
 This exports the raw interface of the library as a set of very thin ctypes
 @@ -23,6 +25,19 @@ Pythonic API.
 __author__ = '[email protected] (Jonathan Tang)'
 
 import sys
 +
 +PY3 = sys.version_info[0] == 3
 +if PY3:
 +  text_type = str
 +else:
 +  text_type = unicode
 +
 +# When supporting both python 2 and 3 using one code base, using str(obj) is confusing
 +# at best since its return type is python version specific
 +# Notes:
 +#   - The unicode(obj) operator does not exist in PY3
 +#   - bytes(obj) exists and works on Python >= 2.6 (in Python 2.x it is an alias of str)
 +
 import contextlib
 import ctypes
 import os.path
 @@ -113,6 +128,13 @@ class StringPiece(ctypes.Structure):
     return self.length
 
   def __str__(self):
 +    # Warning: in Python 3 the str() operator method may **never** return bytes
 +    #  to write code that employs gumboc.py that will work under both Python 2 and 3 use bytes() instead
 +    if PY3:
 +      return ctypes.string_at(self.data, self.length).decode('utf-8')
 +    return ctypes.string_at(self.data, self.length)
 +
 +  def __bytes__(self):
     return ctypes.string_at(self.data, self.length)
 
 
 @@ -273,11 +295,11 @@ class Element(ctypes.Structure):
     if self.tag_namespace == Namespace.SVG:
       svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
       if svg_tagname is not None:
 -        return str(svg_tagname)
 +        return bytes(svg_tagname)
     if self.tag == Tag.UNKNOWN:
       if original_tag.data is None:
         return ''
 -      return str(original_tag).lower()
 +      return (bytes(original_tag).decode('utf-8').lower()).encode('utf-8')
     return _tagname(self.tag)
 
   def __repr__(self):
 @@ -384,7 +406,9 @@ def parse(text, **kwargs):
   # outlives the parse output.  If we let ctypes do it automatically on function
   # call, it creates a temporary buffer which is destroyed when the call
   # completes, and then the original_text pointers point into invalid memory.
 -  text_ptr = ctypes.c_char_p(text.encode('utf-8'))
 +  if isinstance(text, text_type):
 +    text = text.encode('utf-8')
 +  text_ptr = ctypes.c_char_p(text)
   output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
   try:
     yield output
 diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py
 index 3e9c41f..ed170a1 100644
 --- a/python/gumbo/gumboc_tags.py
 +++ b/python/gumbo/gumboc_tags.py
 @@ -1,3 +1,5 @@
 +from __future__ import unicode_literals
 +
 # Generated via `gentags.py src/tag.in`.
 # Do not edit; edit src/tag.in instead.
 # clang-format off
 diff --git a/python/gumbo/gumboc_test.py b/python/gumbo/gumboc_test.py
 index 1f30d38..b510ca8 100644
 --- a/python/gumbo/gumboc_test.py
 +++ b/python/gumbo/gumboc_test.py
 @@ -13,11 +13,16 @@
 # limitations under the License.
 #
 
 +from __future__ import unicode_literals, print_function
 +
 """Tests for Gumbo CTypes bindings."""
 
 __author__ = '[email protected] (Jonathan Tang)'
 
 -import StringIO
 +try:
 +  import StringIO as io
 +except ImportError:
 +  import io
 
 import unittest
 
 @@ -28,67 +33,67 @@ class CtypesTest(unittest.TestCase):
   def testWordParse(self):
     with gumboc.parse('Test') as output:
       doctype_node = output.contents.document.contents
 -      self.assertEquals(gumboc.NodeType.DOCUMENT, doctype_node.type)
 +      self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type)
       document = doctype_node.v.document
 -      self.assertEquals('', document.name)
 -      self.assertEquals('', document.public_identifier)
 -      self.assertEquals('', document.system_identifier)
 +      self.assertEqual(b'', document.name)
 +      self.assertEqual(b'', document.public_identifier)
 +      self.assertEqual(b'', document.system_identifier)
 
       root = output.contents.root.contents
 -      self.assertEquals(gumboc.NodeType.ELEMENT, root.type)
 -      self.assertEquals(gumboc.Tag.HTML, root.tag)
 -      self.assertEquals(gumboc.Namespace.HTML, root.tag_namespace)
 -      self.assertEquals(2, len(root.children))
 +      self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
 +      self.assertEqual(gumboc.Tag.HTML, root.tag)
 +      self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
 +      self.assertEqual(2, len(root.children))
 
       head = root.children[0]
 -      self.assertEquals(gumboc.NodeType.ELEMENT, head.type)
 -      self.assertEquals(gumboc.Tag.HEAD, head.tag)
 -      self.assertEquals('head', head.tag_name)
 -      self.assertEquals(gumboc.Namespace.HTML, head.tag_namespace)
 -      self.assertEquals(0, len(head.original_tag))
 -      self.assertEquals('', str(head.original_end_tag))
 -      self.assertEquals(0, head.children.length)
 +      self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
 +      self.assertEqual(gumboc.Tag.HEAD, head.tag)
 +      self.assertEqual(b'head', head.tag_name)
 +      self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
 +      self.assertEqual(0, len(head.original_tag))
 +      self.assertEqual(b'', bytes(head.original_end_tag))
 +      self.assertEqual(0, head.children.length)
 
       body = root.children[1]
 -      self.assertNotEquals(body, doctype_node)
 -      self.assertEquals(gumboc.NodeType.ELEMENT, body.type)
 -      self.assertEquals(gumboc.Tag.BODY, body.tag)
 -      self.assertEquals('body', body.tag_name)
 -      self.assertEquals(1, len(body.children))
 +      self.assertNotEqual(body, doctype_node)
 +      self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
 +      self.assertEqual(gumboc.Tag.BODY, body.tag)
 +      self.assertEqual(b'body', body.tag_name)
 +      self.assertEqual(1, len(body.children))
 
       text_node = body.children[0]
 -      self.assertEquals(gumboc.NodeType.TEXT, text_node.type)
 -      self.assertEquals('Test', text_node.text)
 +      self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
 +      self.assertEqual(b'Test', text_node.text)
 
   def testBufferThatGoesAway(self):
     for i in range(10):
 -      source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
 +      source = io.StringIO('<foo bar=quux>1<p>2</foo>')
       parse_tree = gumboc.parse(source.read())
       source.close()
     with parse_tree as output:
       root = output.contents.root.contents
       body = root.children[1]
       foo = body.children[0]
 -      self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
 -      self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
 -      self.assertEquals('<foo bar=quux>', str(foo.original_tag))
 -      self.assertEquals('', str(foo.original_end_tag))
 -      self.assertEquals('foo', foo.tag_name.decode('utf-8'))
 -      self.assertEquals('bar', foo.attributes[0].name)
 -      self.assertEquals('quux', foo.attributes[0].value)
 +      self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
 +      self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
 +      self.assertEqual('<foo bar=quux>', str(foo.original_tag))
 +      self.assertEqual(b'', bytes(foo.original_end_tag))
 +      self.assertEqual(b'foo', foo.tag_name)
 +      self.assertEqual(b'bar', foo.attributes[0].name)
 +      self.assertEqual(b'quux', foo.attributes[0].value)
 
   def testUnknownTag(self):
     with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
       root = output.contents.root.contents
       body = root.children[1]
       foo = body.children[0]
 -      self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
 -      self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
 -      self.assertEquals('<foo bar=quux>', str(foo.original_tag))
 -      self.assertEquals('', str(foo.original_end_tag))
 -      self.assertEquals('foo', foo.tag_name.decode('utf-8'))
 -      self.assertEquals('bar', foo.attributes[0].name)
 -      self.assertEquals('quux', foo.attributes[0].value)
 +      self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
 +      self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
 +      self.assertEqual('<foo bar=quux>', str(foo.original_tag))
 +      self.assertEqual(b'', bytes(foo.original_end_tag))
 +      self.assertEqual(b'foo', foo.tag_name)
 +      self.assertEqual(b'bar', foo.attributes[0].name)
 +      self.assertEqual(b'quux', foo.attributes[0].value)
 
   def testSarcasm(self):
     with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
 @@ -96,15 +101,15 @@ class CtypesTest(unittest.TestCase):
       body = root.children[1]
       div = body.children[0]
       sarcasm = div.children[0]
 -      self.assertEquals(gumboc.NodeType.ELEMENT, sarcasm.type)
 -      self.assertEquals(gumboc.Tag.UNKNOWN, sarcasm.tag)
 -      self.assertEquals('<sarcasm>', str(sarcasm.original_tag))
 -      self.assertEquals('</sarcasm>', str(sarcasm.original_end_tag))
 -      self.assertEquals('sarcasm', sarcasm.tag_name.decode('utf-8'))
 +      self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
 +      self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
 +      self.assertEqual(b'<sarcasm>', bytes(sarcasm.original_tag))
 +      self.assertEqual(b'</sarcasm>', bytes(sarcasm.original_end_tag))
 +      self.assertEqual(b'sarcasm', sarcasm.tag_name)
 
   def testEnums(self):
 -    self.assertEquals(gumboc.Tag.A, gumboc.Tag.A)
 -    self.assertEquals(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
 +    self.assertEqual(gumboc.Tag.A, gumboc.Tag.A)
 +    self.assertEqual(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
 
   def testFragment(self):
     with gumboc.parse(
 @@ -112,11 +117,11 @@ class CtypesTest(unittest.TestCase):
         fragment_context=gumboc.Tag.TITLE,
         fragment_namespace=gumboc.Namespace.SVG) as output:
       root = output.contents.root.contents
 -      self.assertEquals(1, len(root.children))
 +      self.assertEqual(1, len(root.children))
       div = root.children[0]
 -      self.assertEquals(gumboc.NodeType.ELEMENT, div.type)
 -      self.assertEquals(gumboc.Tag.DIV, div.tag)
 -      self.assertEquals(gumboc.Namespace.HTML, div.tag_namespace)
 +      self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
 +      self.assertEqual(gumboc.Tag.DIV, div.tag)
 +      self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)
	diff --git a/python/gumbo/bs4_adapter.py b/python/gumbo/bs4_adapter.py
	new file mode 100644
	index 0000000..5a8d273
	--- /dev/null
	+++ b/python/gumbo/bs4_adapter.py
	@@ -0,0 +1,183 @@
	+# -*- coding: utf-8 -*-
	+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab
	+
	+from __future__ import unicode_literals, print_function
	+
	+# Copyright 2012 Google Inc. All Rights Reserved.
	+# Modifications to use BeautifulSoup4 Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, Canada
	+#
	+# Licensed under the Apache License, Version 2.0 (the "License");
	+# you may not use this file except in compliance with the License.
	+# You may obtain a copy of the License at
	+#
	+# http://www.apache.org/licenses/LICENSE-2.0
	+#
	+# Unless required by applicable law or agreed to in writing, software
	+# distributed under the License is distributed on an "AS IS" BASIS,
	+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# See the License for the specific language governing permissions and
	+# limitations under the License.
	+#
	+
	+# Should this be reworked to be a bs4 treebuilder?
	+
	+"""
	+ Adapter between Gumbo and BeautifulSoup4.
	+ This parses an HTML document and gives back a BeautifulSoup4 object, which you
	+ can then manipulate like a normal BeautifulSoup4 parse tree.
	+
	+ Groks namespaces on elements and attributes
	+"""
	+
	+__author__ = '[email protected] (Jonathan Tang)'
	+
	+import sys
	+import gumboc
	+
	+import bs4
	+# uses bs4.element classes:
	+# Comment, DocType, NavigableString, CData, Tag, NamespacedAttribute, whitespace_re
	+
	+# These should be indexed by the enum
	+# values of gumboc.Namespace
	+
	+_NAMESPACES = [
	+ 'http://www.w3.org/1999/xhtml',
	+ 'http://www.w3.org/2000/svg',
	+ 'http://www.w3.org/1998/Math/MathML',
	+ ]
	+
	+
	+def _fromutf8(text):
	+ return text.decode('utf-8', 'replace')
	+
	+
	+def _add_source_info(obj, original_text, start_pos, end_pos):
	+ obj.original = _fromutf8(bytes(original_text))
	+ obj.line = start_pos.line
	+ obj.col = start_pos.column
	+ obj.offset = start_pos.offset
	+ if end_pos:
	+ obj.end_line = end_pos.line
	+ obj.end_col = end_pos.column
	+ obj.end_offset = end_pos.offset
	+
	+
	+ def _convert_attrs(element_attrs):
	+     """Map each attribute name to its value; values containing a space become lists."""
	+     def maybe_namespace(attr):
	+         if attr.namespace != gumboc.AttributeNamespace.NONE:
	+             name = _fromutf8(attr.name)
	+             prefix = repr(attr.namespace).lower() if name != 'xmlns' else None
	+             nsurl = attr.namespace.to_url()
	+             return bs4.element.NamespacedAttribute(prefix, name, nsurl)  # key keeps prefix + URL
	+         return _fromutf8(attr.name)
	+     def maybe_value_list(attr):
	+         value = _fromutf8(attr.value)
	+         if " " in value:
	+             value = bs4.element.whitespace_re.split(value)
	+         return value
	+     return dict((maybe_namespace(attr), maybe_value_list(attr)) for attr in element_attrs)
	+
	+
	+def _add_document(soup, element):
	+ if not element.has_doctype:
	+ # Mimic html5lib behavior: if no doctype token, no doctype node.
	+ return
	+ doctype = bs4.element.Doctype.for_name_and_ids(_fromutf8(element.name),
	+ _fromutf8(element.public_identifier),
	+ _fromutf8(element.system_identifier))
	+ soup.object_was_parsed(doctype)
	+
	+
	+def _add_element(soup, element):
	+ tag = bs4.element.Tag(parser=soup,
	+ name=_fromutf8(element.tag_name),
	+ namespace=_NAMESPACES[element.tag_namespace.value],
	+ attrs=_convert_attrs(element.attributes))
	+ for child in element.children:
	+ tag.append(_add_node(soup, child))
	+ _add_source_info(tag, element.original_tag, element.start_pos, element.end_pos)
	+ tag.original_end_tag = _fromutf8(bytes(element.original_end_tag))
	+ return tag
	+
	+
	+def _add_text(cls):
	+ def add_text_internal(soup, element):
	+ text = cls(_fromutf8(element.text))
	+ _add_source_info(text, element.original_text, element.start_pos, None)
	+ return text
	+ return add_text_internal
	+
	+
	+_HANDLERS = [
	+ _add_document, # DOCUMENT
	+ _add_element, # ELEMENT
	+ _add_text(bs4.element.NavigableString), # TEXT
	+ _add_text(bs4.element.CData), # CDATA
	+ _add_text(bs4.element.Comment), # COMMENT
	+ _add_text(bs4.element.NavigableString), # WHITESPACE
	+ _add_element, # TEMPLATE
	+ ]
	+
	+
	+def _add_node(soup, node):
	+ return _HANDLERS[node.type.value](soup, node.contents)
	+
	+
	+ def _add_next_prev_pointers(soup):
	+     """Chain soup and its descendants via next_element/previous_element, ordered by offset."""
	+     def _traverse(node):
	+         # .findAll needs the .next_element pointers this function is still building,
	+         # so walk the tree by hand and yield the nodes in DOM order.
	+         yield node
	+         try:
	+             for child in node.contents:
	+                 for descendant in _traverse(child):
	+                     yield descendant
	+         except AttributeError:
	+             # Not an element.
	+             return
	+     nodes = sorted(_traverse(soup), key=lambda node: node.offset)
	+     if nodes:
	+         nodes[0].previous_element = None
	+         nodes[-1].next_element = None
	+     for before, after in zip(nodes, nodes[1:]):  # every adjacent pair, the last one included
	+         before.next_element = after
	+         after.previous_element = before
	+
	+
	+def parse(text, **kwargs):
	+ with gumboc.parse(text, **kwargs) as output:
	+ soup = bs4.BeautifulSoup('', "html.parser")
	+ _add_document(soup, output.contents.document.contents)
	+ for node in output.contents.document.contents.children:
	+ soup.append(_add_node(soup, node))
	+ _add_next_prev_pointers(soup.html)
	+ return soup
	+
	+
	+def main():
	+ samp = """
	+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"
	+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">
	+<html xmlns="http://www.w3.org/1999/xhtml/" xml:lang="en" lang="en-US">
	+<head><title>testing & entities</title></head>
	+<body>
	+ <p class="first second">this is the <i><b>copyright</i></b> symbol "©"</p>
	+ <p xmlns:xlink="http://www.w3.org/xlink" class="second" xlink:href="http://www.ggogle.com">
	+ this used to test atribute namespaces
	+ </p>
	+</body>
	+</html>
	+"""
	+ soup = parse(samp)
	+ print(soup.decode())
	+ for node in soup.findAll("head"):
	+ print(node)
	+ for node in soup.find_all(attrs={'class':'second'}):
	+ print(node)
	+ return 0
	+
	+if __name__ == '__main__':
	+ sys.exit(main())
	diff --git a/python/gumbo/bs4_adapter_test.py b/python/gumbo/bs4_adapter_test.py
	new file mode 100644
	index 0000000..aa25d4b
	--- /dev/null
	+++ b/python/gumbo/bs4_adapter_test.py
	@@ -0,0 +1,66 @@
	+from __future__ import unicode_literals, print_function
	+
	+# Copyright 2012 Google Inc. All Rights Reserved.
	+#
	+# Licensed under the Apache License, Version 2.0 (the "License");
	+# you may not use this file except in compliance with the License.
	+# You may obtain a copy of the License at
	+#
	+# http://www.apache.org/licenses/LICENSE-2.0
	+#
	+# Unless required by applicable law or agreed to in writing, software
	+# distributed under the License is distributed on an "AS IS" BASIS,
	+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	+# See the License for the specific language governing permissions and
	+# limitations under the License.
	+#
	+"""Tests for the Gumbo's BeautifulSoup Python adapter."""
	+
	+__author__ = '[email protected] (Jonathan Tang)'
	+
	+import unittest
	+
	+import bs4_adapter
	+
	+
	+class SoupAdapterTest(unittest.TestCase):
	+
	+ def testSimpleParse(self):
	+ soup = bs4_adapter.parse(
	+ """
	+ <ul>
	+ <li class=odd><a href="one.html">One</a>
	+ <li class="even"><a href="two.html">Two</a>
	+ <li class='odd'><a href="three.html">Three</a>
	+ <li class="even"><a href="four.html">Four</a>
	+ </ul>
	+ """)
	+
	+ head = soup.head
	+ self.assertEqual(soup, head.parent.parent)
	+ self.assertEqual(u'head', head.name)
	+ self.assertEqual(0, len(head))
	+
	+ body = soup.body
	+ self.assertEqual(head, body.previousSibling)
	+ self.assertEqual(2, len(body)) # <ul> + trailing whitespace
	+ self.assertEqual(u'ul', body.contents[0].name)
	+
	+ list_items = body.findAll('li')
	+ self.assertEqual(4, len(list_items))
	+
	+ evens = body('li', 'even')
	+ self.assertEqual(2, len(evens))
	+
	+ a2 = body.find('a', href='two.html')
	+ self.assertEqual(u'a', a2.name)
	+ self.assertEqual(u'Two', a2.contents[0])
	+
	+ li2 = a2.parent
	+ self.assertEqual(u'li', li2.name)
	+ self.assertEqual(u'even', li2['class'])
	+ self.assertEqual(list_items[1], li2)
	+ self.assertEqual(evens[0], li2)
	+
	+if __name__ == '__main__':
	+ unittest.main()
	diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py
	index 04da319..887f8e2 100644
	--- a/python/gumbo/gumboc.py
	+++ b/python/gumbo/gumboc.py
	@@ -13,6 +13,8 @@
	# limitations under the License.
	#

	+from __future__ import unicode_literals, print_function
	+
	"""CTypes bindings for the Gumbo HTML5 parser.

	This exports the raw interface of the library as a set of very thin ctypes
	@@ -23,6 +25,19 @@ Pythonic API.
	__author__ = '[email protected] (Jonathan Tang)'

	import sys
	+
	+PY3 = sys.version_info[0] == 3
	+if PY3:
	+ text_type = str
	+else:
	+ text_type = unicode
	+
	+# When supporting both python 2 and 3 using one code base, using str(obj) is confusing
	+# at best since its return type is python version specific
	+# Notes:
	+# - The unicode(obj) operator does not exist in PY3
	+# - The bytes(obj) exists and works on python >= 2.6 (as it aliased to str in python 2.X)
	+
	import contextlib
	import ctypes
	import os.path
	@@ -113,6 +128,13 @@ class StringPiece(ctypes.Structure):
	return self.length

	def __str__(self):
	+ # Warning: in Python 3 the str() operator method may never return bytes
	+ # to write code that employs gumboc.py that will work under both Python 2 and 3 use bytes() instead
	+ if PY3:
	+ return ctypes.string_at(self.data, self.length).decode('utf-8')
	+ return ctypes.string_at(self.data, self.length)
	+
	+ def __bytes__(self):
	return ctypes.string_at(self.data, self.length)


	@@ -273,11 +295,11 @@ class Element(ctypes.Structure):
	if self.tag_namespace == Namespace.SVG:
	svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag))
	if svg_tagname is not None:
	- return str(svg_tagname)
	+ return bytes(svg_tagname)
	if self.tag == Tag.UNKNOWN:
	if original_tag.data is None:
	return ''
	- return str(original_tag).lower()
	+ return (bytes(original_tag).decode('utf-8').lower()).encode('utf-8')
	return _tagname(self.tag)

	def __repr__(self):
	@@ -384,7 +406,9 @@ def parse(text, **kwargs):
	# outlives the parse output. If we let ctypes do it automatically on function
	# call, it creates a temporary buffer which is destroyed when the call
	# completes, and then the original_text pointers point into invalid memory.
	- text_ptr = ctypes.c_char_p(text.encode('utf-8'))
	+ if isinstance(text, text_type):
	+ text = text.encode('utf-8')
	+ text_ptr = ctypes.c_char_p(text)
	output = _parse_with_options(ctypes.byref(options), text_ptr, len(text))
	try:
	yield output
	diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py
	index 3e9c41f..ed170a1 100644
	--- a/python/gumbo/gumboc_tags.py
	+++ b/python/gumbo/gumboc_tags.py
	@@ -1,3 +1,5 @@
	+from __future__ import unicode_literals
	+
	# Generated via `gentags.py src/tag.in`.
	# Do not edit; edit src/tag.in instead.
	# clang-format off
	diff --git a/python/gumbo/gumboc_test.py b/python/gumbo/gumboc_test.py
	index 1f30d38..b510ca8 100644
	--- a/python/gumbo/gumboc_test.py
	+++ b/python/gumbo/gumboc_test.py
	@@ -13,11 +13,16 @@
	# limitations under the License.
	#

	+from __future__ import unicode_literals, print_function
	+
	"""Tests for Gumbo CTypes bindings."""

	__author__ = '[email protected] (Jonathan Tang)'

	-import StringIO
	+try:
	+ import StringIO as io
	+except ImportError:
	+ import io

	import unittest

	@@ -28,67 +33,67 @@ class CtypesTest(unittest.TestCase):
	def testWordParse(self):
	with gumboc.parse('Test') as output:
	doctype_node = output.contents.document.contents
	- self.assertEquals(gumboc.NodeType.DOCUMENT, doctype_node.type)
	+ self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type)
	document = doctype_node.v.document
	- self.assertEquals('', document.name)
	- self.assertEquals('', document.public_identifier)
	- self.assertEquals('', document.system_identifier)
	+ self.assertEqual(b'', document.name)
	+ self.assertEqual(b'', document.public_identifier)
	+ self.assertEqual(b'', document.system_identifier)

	root = output.contents.root.contents
	- self.assertEquals(gumboc.NodeType.ELEMENT, root.type)
	- self.assertEquals(gumboc.Tag.HTML, root.tag)
	- self.assertEquals(gumboc.Namespace.HTML, root.tag_namespace)
	- self.assertEquals(2, len(root.children))
	+ self.assertEqual(gumboc.NodeType.ELEMENT, root.type)
	+ self.assertEqual(gumboc.Tag.HTML, root.tag)
	+ self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace)
	+ self.assertEqual(2, len(root.children))

	head = root.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, head.type)
	- self.assertEquals(gumboc.Tag.HEAD, head.tag)
	- self.assertEquals('head', head.tag_name)
	- self.assertEquals(gumboc.Namespace.HTML, head.tag_namespace)
	- self.assertEquals(0, len(head.original_tag))
	- self.assertEquals('', str(head.original_end_tag))
	- self.assertEquals(0, head.children.length)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, head.type)
	+ self.assertEqual(gumboc.Tag.HEAD, head.tag)
	+ self.assertEqual(b'head', head.tag_name)
	+ self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace)
	+ self.assertEqual(0, len(head.original_tag))
	+ self.assertEqual(b'', bytes(head.original_end_tag))
	+ self.assertEqual(0, head.children.length)

	body = root.children[1]
	- self.assertNotEquals(body, doctype_node)
	- self.assertEquals(gumboc.NodeType.ELEMENT, body.type)
	- self.assertEquals(gumboc.Tag.BODY, body.tag)
	- self.assertEquals('body', body.tag_name)
	- self.assertEquals(1, len(body.children))
	+ self.assertNotEqual(body, doctype_node)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, body.type)
	+ self.assertEqual(gumboc.Tag.BODY, body.tag)
	+ self.assertEqual(b'body', body.tag_name)
	+ self.assertEqual(1, len(body.children))

	text_node = body.children[0]
	- self.assertEquals(gumboc.NodeType.TEXT, text_node.type)
	- self.assertEquals('Test', text_node.text)
	+ self.assertEqual(gumboc.NodeType.TEXT, text_node.type)
	+ self.assertEqual(b'Test', text_node.text)

	def testBufferThatGoesAway(self):
	for i in range(10):
	- source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>')
	+ source = io.StringIO('<foo bar=quux>1<p>2</foo>')
	parse_tree = gumboc.parse(source.read())
	source.close()
	with parse_tree as output:
	root = output.contents.root.contents
	body = root.children[1]
	foo = body.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
	- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
	- self.assertEquals('<foo bar=quux>', str(foo.original_tag))
	- self.assertEquals('', str(foo.original_end_tag))
	- self.assertEquals('foo', foo.tag_name.decode('utf-8'))
	- self.assertEquals('bar', foo.attributes[0].name)
	- self.assertEquals('quux', foo.attributes[0].value)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
	+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
	+ self.assertEqual('<foo bar=quux>', str(foo.original_tag))
	+ self.assertEqual(b'', bytes(foo.original_end_tag))
	+ self.assertEqual(b'foo', foo.tag_name)
	+ self.assertEqual(b'bar', foo.attributes[0].name)
	+ self.assertEqual(b'quux', foo.attributes[0].value)

	def testUnknownTag(self):
	with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output:
	root = output.contents.root.contents
	body = root.children[1]
	foo = body.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type)
	- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag)
	- self.assertEquals('<foo bar=quux>', str(foo.original_tag))
	- self.assertEquals('', str(foo.original_end_tag))
	- self.assertEquals('foo', foo.tag_name.decode('utf-8'))
	- self.assertEquals('bar', foo.attributes[0].name)
	- self.assertEquals('quux', foo.attributes[0].value)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type)
	+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag)
	+ self.assertEqual('<foo bar=quux>', str(foo.original_tag))
	+ self.assertEqual(b'', bytes(foo.original_end_tag))
	+ self.assertEqual(b'foo', foo.tag_name)
	+ self.assertEqual(b'bar', foo.attributes[0].name)
	+ self.assertEqual(b'quux', foo.attributes[0].value)

	def testSarcasm(self):
	with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output:
	@@ -96,15 +101,15 @@ class CtypesTest(unittest.TestCase):
	body = root.children[1]
	div = body.children[0]
	sarcasm = div.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, sarcasm.type)
	- self.assertEquals(gumboc.Tag.UNKNOWN, sarcasm.tag)
	- self.assertEquals('<sarcasm>', str(sarcasm.original_tag))
	- self.assertEquals('</sarcasm>', str(sarcasm.original_end_tag))
	- self.assertEquals('sarcasm', sarcasm.tag_name.decode('utf-8'))
	+ self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type)
	+ self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag)
	+ self.assertEqual(b'<sarcasm>', bytes(sarcasm.original_tag))
	+ self.assertEqual(b'</sarcasm>', bytes(sarcasm.original_end_tag))
	+ self.assertEqual(b'sarcasm', sarcasm.tag_name)

	def testEnums(self):
	- self.assertEquals(gumboc.Tag.A, gumboc.Tag.A)
	- self.assertEquals(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))
	+ self.assertEqual(gumboc.Tag.A, gumboc.Tag.A)
	+ self.assertEqual(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A))

	def testFragment(self):
	with gumboc.parse(
	@@ -112,11 +117,11 @@ class CtypesTest(unittest.TestCase):
	fragment_context=gumboc.Tag.TITLE,
	fragment_namespace=gumboc.Namespace.SVG) as output:
	root = output.contents.root.contents
	- self.assertEquals(1, len(root.children))
	+ self.assertEqual(1, len(root.children))
	div = root.children[0]
	- self.assertEquals(gumboc.NodeType.ELEMENT, div.type)
	- self.assertEquals(gumboc.Tag.DIV, div.tag)
	- self.assertEquals(gumboc.Namespace.HTML, div.tag_namespace)
	+ self.assertEqual(gumboc.NodeType.ELEMENT, div.type)
	+ self.assertEqual(gumboc.Tag.DIV, div.tag)
	+ self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace)