Forked from kevinhendricks/python3_fixes_with_bs4_support.patch
Created
June 8, 2017 20:43
-
-
Save rofl0r/84b9850e416204a1c2b4258b4b155555 to your computer and use it in GitHub Desktop.
google/gumbo-parser patch to allow gumboc.py to work with both python 2 and 3 and add BeautifulSoup4 support for both
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
diff --git a/python/gumbo/bs4_adapter.py b/python/gumbo/bs4_adapter.py | |
new file mode 100644 | |
index 0000000..5a8d273 | |
--- /dev/null | |
+++ b/python/gumbo/bs4_adapter.py | |
@@ -0,0 +1,183 @@ | |
+# -*- coding: utf-8 -*- | |
+# vim:ts=4:sw=4:softtabstop=4:smarttab:expandtab | |
+ | |
+from __future__ import unicode_literals, print_function | |
+ | |
+# Copyright 2012 Google Inc. All Rights Reserved. | |
+# Modifications to use BeautifulSoup4 Copyright 2015 Kevin B. Hendricks, Stratford, Ontario, Canada | |
+# | |
+# Licensed under the Apache License, Version 2.0 (the "License"); | |
+# you may not use this file except in compliance with the License. | |
+# You may obtain a copy of the License at | |
+# | |
+# http://www.apache.org/licenses/LICENSE-2.0 | |
+# | |
+# Unless required by applicable law or agreed to in writing, software | |
+# distributed under the License is distributed on an "AS IS" BASIS, | |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
+# See the License for the specific language governing permissions and | |
+# limitations under the License. | |
+# | |
+ | |
+# Should this be reworked to be a bs4 treebuilder? | |
+ | |
+""" | |
+ Adapter between Gumbo and BeautifulSoup4. | |
+ This parses an HTML document and gives back a BeautifulSoup4 object, which you | |
+ can then manipulate like a normal BeautifulSoup4 parse tree. | |
+ | |
+ Groks namespaces on elements and attributes | |
+""" | |
+ | |
+__author__ = '[email protected] (Jonathan Tang)' | |
+ | |
+import sys | |
+import gumboc | |
+ | |
+import bs4 | |
+# uses bs4.element classes: | |
+# Comment, DocType, NavigableString, CData, Tag, NamespacedAttribute, whitespace_re | |
+ | |
+# These should be indexed by the enum | |
+# values of gumboc.Namespace | |
+ | |
+_NAMESPACES = [ | |
+ 'http://www.w3.org/1999/xhtml', | |
+ 'http://www.w3.org/2000/svg', | |
+ 'http://www.w3.org/1998/Math/MathML', | |
+ ] | |
+ | |
+ | |
+def _fromutf8(text): | |
+ return text.decode('utf-8', 'replace') | |
+ | |
+ | |
+def _add_source_info(obj, original_text, start_pos, end_pos): | |
+ obj.original = _fromutf8(bytes(original_text)) | |
+ obj.line = start_pos.line | |
+ obj.col = start_pos.column | |
+ obj.offset = start_pos.offset | |
+ if end_pos: | |
+ obj.end_line = end_pos.line | |
+ obj.end_col = end_pos.column | |
+ obj.end_offset = end_pos.offset | |
+ | |
+ | |
+def _convert_attrs(element_attrs):  # build the attrs dict handed to bs4.element.Tag | |
+    def maybe_namespace(attr):  # key: NamespacedAttribute when namespaced, else the decoded name | |
+        if attr.namespace != gumboc.AttributeNamespace.NONE: | |
+            name = _fromutf8(attr.name) | |
+            prefix = repr(attr.namespace).lower() if name != 'xmlns' else None | |
+            nsurl = attr.namespace.to_url() | |
+            return bs4.element.NamespacedAttribute(prefix, name, nsurl) | |
+        else: | |
+            return _fromutf8(attr.name) | |
+    def maybe_value_list(attr):  # value: list split on whitespace if it contains a space, else str | |
+        value = _fromutf8(attr.value) | |
+        if " " in value: | |
+            value = bs4.element.whitespace_re.split(value) | |
+        return value | |
+    return dict((maybe_namespace(attr), maybe_value_list(attr)) for attr in element_attrs) | |
+ | |
+ | |
+def _add_document(soup, element): | |
+ if not element.has_doctype: | |
+ # Mimic html5lib behavior: if no doctype token, no doctype node. | |
+ return | |
+ doctype = bs4.element.Doctype.for_name_and_ids(_fromutf8(element.name), | |
+ _fromutf8(element.public_identifier), | |
+ _fromutf8(element.system_identifier)) | |
+ soup.object_was_parsed(doctype) | |
+ | |
+ | |
+def _add_element(soup, element): | |
+ tag = bs4.element.Tag(parser=soup, | |
+ name=_fromutf8(element.tag_name), | |
+ namespace=_NAMESPACES[element.tag_namespace.value], | |
+ attrs=_convert_attrs(element.attributes)) | |
+ for child in element.children: | |
+ tag.append(_add_node(soup, child)) | |
+ _add_source_info(tag, element.original_tag, element.start_pos, element.end_pos) | |
+ tag.original_end_tag = _fromutf8(bytes(element.original_end_tag)) | |
+ return tag | |
+ | |
+ | |
+def _add_text(cls): | |
+ def add_text_internal(soup, element): | |
+ text = cls(_fromutf8(element.text)) | |
+ _add_source_info(text, element.original_text, element.start_pos, None) | |
+ return text | |
+ return add_text_internal | |
+ | |
+ | |
+_HANDLERS = [ | |
+ _add_document, # DOCUMENT | |
+ _add_element, # ELEMENT | |
+ _add_text(bs4.element.NavigableString), # TEXT | |
+ _add_text(bs4.element.CData), # CDATA | |
+ _add_text(bs4.element.Comment), # COMMENT | |
+ _add_text(bs4.element.NavigableString), # WHITESPACE | |
+ _add_element, # TEMPLATE | |
+ ] | |
+ | |
+ | |
+def _add_node(soup, node): | |
+ return _HANDLERS[node.type.value](soup, node.contents) | |
+ | |
+ | |
+def _add_next_prev_pointers(soup):  # chain all nodes under `soup` via next_element/previous_element | |
+    def _traverse(node): | |
+        # .findAll requires the .next pointer, which is what we're trying to add | |
+        # when we call this, and so we manually supply a generator to yield the | |
+        # nodes in DOM order. | |
+        yield node | |
+        try: | |
+            for child in node.contents: | |
+                for descendant in _traverse(child): | |
+                    yield descendant | |
+        except AttributeError: | |
+            # Not an element. | |
+            return | |
+    nodes = sorted(_traverse(soup), key=lambda node: node.offset) | |
+    if nodes: | |
+        nodes[0].previous_element = None | |
+        nodes[-1].next_element = None | |
+    for i, node in enumerate(nodes[1:], 1):  # nodes[1:], not [1:-1]: the last node must be linked too | |
+        nodes[i-1].next_element = node | |
+        node.previous_element = nodes[i-1] | |
+ | |
+ | |
+def parse(text, **kwargs): | |
+ with gumboc.parse(text, **kwargs) as output: | |
+ soup = bs4.BeautifulSoup('', "html.parser") | |
+ _add_document(soup, output.contents.document.contents) | |
+ for node in output.contents.document.contents.children: | |
+ soup.append(_add_node(soup, node)) | |
+ _add_next_prev_pointers(soup.html) | |
+ return soup | |
+ | |
+ | |
+def main(): | |
+ samp = """ | |
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" | |
+ "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd"> | |
+<html xmlns="http://www.w3.org/1999/xhtml/" xml:lang="en" lang="en-US"> | |
+<head><title>testing & entities</title></head> | |
+<body> | |
+ <p class="first second">this is the <i><b>copyright</i></b> symbol "©"</p> | |
+ <p xmlns:xlink="http://www.w3.org/xlink" class="second" xlink:href="http://www.ggogle.com"> | |
+ this used to test atribute namespaces | |
+ </p> | |
+</body> | |
+</html> | |
+""" | |
+ soup = parse(samp) | |
+ print(soup.decode()) | |
+ for node in soup.findAll("head"): | |
+ print(node) | |
+ for node in soup.find_all(attrs={'class':'second'}): | |
+ print(node) | |
+ return 0 | |
+ | |
+if __name__ == '__main__': | |
+ sys.exit(main()) | |
diff --git a/python/gumbo/bs4_adapter_test.py b/python/gumbo/bs4_adapter_test.py | |
new file mode 100644 | |
index 0000000..aa25d4b | |
--- /dev/null | |
+++ b/python/gumbo/bs4_adapter_test.py | |
@@ -0,0 +1,66 @@ | |
+from __future__ import unicode_literals, print_function | |
+ | |
+# Copyright 2012 Google Inc. All Rights Reserved. | |
+# | |
+# Licensed under the Apache License, Version 2.0 (the "License"); | |
+# you may not use this file except in compliance with the License. | |
+# You may obtain a copy of the License at | |
+# | |
+# http://www.apache.org/licenses/LICENSE-2.0 | |
+# | |
+# Unless required by applicable law or agreed to in writing, software | |
+# distributed under the License is distributed on an "AS IS" BASIS, | |
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
+# See the License for the specific language governing permissions and | |
+# limitations under the License. | |
+# | |
+"""Tests for the Gumbo's BeautifulSoup Python adapter.""" | |
+ | |
+__author__ = '[email protected] (Jonathan Tang)' | |
+ | |
+import unittest | |
+ | |
+import bs4_adapter | |
+ | |
+ | |
+class SoupAdapterTest(unittest.TestCase): | |
+ | |
+ def testSimpleParse(self): | |
+ soup = bs4_adapter.parse( | |
+ """ | |
+ <ul> | |
+ <li class=odd><a href="one.html">One</a> | |
+ <li class="even"><a href="two.html">Two</a> | |
+ <li class='odd'><a href="three.html">Three</a> | |
+ <li class="even"><a href="four.html">Four</a> | |
+ </ul> | |
+ """) | |
+ | |
+ head = soup.head | |
+ self.assertEqual(soup, head.parent.parent) | |
+ self.assertEqual(u'head', head.name) | |
+ self.assertEqual(0, len(head)) | |
+ | |
+ body = soup.body | |
+ self.assertEqual(head, body.previousSibling) | |
+ self.assertEqual(2, len(body)) # <ul> + trailing whitespace | |
+ self.assertEqual(u'ul', body.contents[0].name) | |
+ | |
+ list_items = body.findAll('li') | |
+ self.assertEqual(4, len(list_items)) | |
+ | |
+ evens = body('li', 'even') | |
+ self.assertEqual(2, len(evens)) | |
+ | |
+ a2 = body.find('a', href='two.html') | |
+ self.assertEqual(u'a', a2.name) | |
+ self.assertEqual(u'Two', a2.contents[0]) | |
+ | |
+ li2 = a2.parent | |
+ self.assertEqual(u'li', li2.name) | |
+ self.assertEqual(u'even', li2['class']) | |
+ self.assertEqual(list_items[1], li2) | |
+ self.assertEqual(evens[0], li2) | |
+ | |
+if __name__ == '__main__': | |
+ unittest.main() | |
diff --git a/python/gumbo/gumboc.py b/python/gumbo/gumboc.py | |
index 04da319..887f8e2 100644 | |
--- a/python/gumbo/gumboc.py | |
+++ b/python/gumbo/gumboc.py | |
@@ -13,6 +13,8 @@ | |
# limitations under the License. | |
# | |
+from __future__ import unicode_literals, print_function | |
+ | |
"""CTypes bindings for the Gumbo HTML5 parser. | |
This exports the raw interface of the library as a set of very thin ctypes | |
@@ -23,6 +25,19 @@ Pythonic API. | |
__author__ = '[email protected] (Jonathan Tang)' | |
import sys | |
+ | |
+PY3 = sys.version_info[0] == 3 | |
+if PY3: | |
+ text_type = str | |
+else: | |
+ text_type = unicode | |
+ | |
+# When supporting both python 2 and 3 using one code base, using str(obj) is confusing | |
+# at best since its return type is python version specific | |
+# Notes: | |
+# - The unicode(obj) operator does not exist in PY3 | |
+# - bytes(obj) exists and works on Python >= 2.6 (it is an alias of str in Python 2.x) | |
+ | |
import contextlib | |
import ctypes | |
import os.path | |
@@ -113,6 +128,13 @@ class StringPiece(ctypes.Structure): | |
return self.length | |
def __str__(self): | |
+ # Warning: in Python 3, __str__ must **never** return bytes. | |
+ # Code using gumboc.py that has to work under both Python 2 and 3 should call bytes() instead. | |
+ if PY3: | |
+ return ctypes.string_at(self.data, self.length).decode('utf-8') | |
+ return ctypes.string_at(self.data, self.length) | |
+ | |
+ def __bytes__(self): | |
return ctypes.string_at(self.data, self.length) | |
@@ -273,11 +295,11 @@ class Element(ctypes.Structure): | |
if self.tag_namespace == Namespace.SVG: | |
svg_tagname = _normalize_svg_tagname(ctypes.byref(original_tag)) | |
if svg_tagname is not None: | |
- return str(svg_tagname) | |
+ return bytes(svg_tagname) | |
if self.tag == Tag.UNKNOWN: | |
if original_tag.data is None: | |
return '' | |
- return str(original_tag).lower() | |
+ return (bytes(original_tag).decode('utf-8').lower()).encode('utf-8') | |
return _tagname(self.tag) | |
def __repr__(self): | |
@@ -384,7 +406,9 @@ def parse(text, **kwargs): | |
# outlives the parse output. If we let ctypes do it automatically on function | |
# call, it creates a temporary buffer which is destroyed when the call | |
# completes, and then the original_text pointers point into invalid memory. | |
- text_ptr = ctypes.c_char_p(text.encode('utf-8')) | |
+ if isinstance(text, text_type): | |
+ text = text.encode('utf-8') | |
+ text_ptr = ctypes.c_char_p(text) | |
output = _parse_with_options(ctypes.byref(options), text_ptr, len(text)) | |
try: | |
yield output | |
diff --git a/python/gumbo/gumboc_tags.py b/python/gumbo/gumboc_tags.py | |
index 3e9c41f..ed170a1 100644 | |
--- a/python/gumbo/gumboc_tags.py | |
+++ b/python/gumbo/gumboc_tags.py | |
@@ -1,3 +1,5 @@ | |
+from __future__ import unicode_literals | |
+ | |
# Generated via `gentags.py src/tag.in`. | |
# Do not edit; edit src/tag.in instead. | |
# clang-format off | |
diff --git a/python/gumbo/gumboc_test.py b/python/gumbo/gumboc_test.py | |
index 1f30d38..b510ca8 100644 | |
--- a/python/gumbo/gumboc_test.py | |
+++ b/python/gumbo/gumboc_test.py | |
@@ -13,11 +13,16 @@ | |
# limitations under the License. | |
# | |
+from __future__ import unicode_literals, print_function | |
+ | |
"""Tests for Gumbo CTypes bindings.""" | |
__author__ = '[email protected] (Jonathan Tang)' | |
-import StringIO | |
+try: | |
+ import StringIO as io | |
+except ImportError: | |
+ import io | |
import unittest | |
@@ -28,67 +33,67 @@ class CtypesTest(unittest.TestCase): | |
def testWordParse(self): | |
with gumboc.parse('Test') as output: | |
doctype_node = output.contents.document.contents | |
- self.assertEquals(gumboc.NodeType.DOCUMENT, doctype_node.type) | |
+ self.assertEqual(gumboc.NodeType.DOCUMENT, doctype_node.type) | |
document = doctype_node.v.document | |
- self.assertEquals('', document.name) | |
- self.assertEquals('', document.public_identifier) | |
- self.assertEquals('', document.system_identifier) | |
+ self.assertEqual(b'', document.name) | |
+ self.assertEqual(b'', document.public_identifier) | |
+ self.assertEqual(b'', document.system_identifier) | |
root = output.contents.root.contents | |
- self.assertEquals(gumboc.NodeType.ELEMENT, root.type) | |
- self.assertEquals(gumboc.Tag.HTML, root.tag) | |
- self.assertEquals(gumboc.Namespace.HTML, root.tag_namespace) | |
- self.assertEquals(2, len(root.children)) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, root.type) | |
+ self.assertEqual(gumboc.Tag.HTML, root.tag) | |
+ self.assertEqual(gumboc.Namespace.HTML, root.tag_namespace) | |
+ self.assertEqual(2, len(root.children)) | |
head = root.children[0] | |
- self.assertEquals(gumboc.NodeType.ELEMENT, head.type) | |
- self.assertEquals(gumboc.Tag.HEAD, head.tag) | |
- self.assertEquals('head', head.tag_name) | |
- self.assertEquals(gumboc.Namespace.HTML, head.tag_namespace) | |
- self.assertEquals(0, len(head.original_tag)) | |
- self.assertEquals('', str(head.original_end_tag)) | |
- self.assertEquals(0, head.children.length) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, head.type) | |
+ self.assertEqual(gumboc.Tag.HEAD, head.tag) | |
+ self.assertEqual(b'head', head.tag_name) | |
+ self.assertEqual(gumboc.Namespace.HTML, head.tag_namespace) | |
+ self.assertEqual(0, len(head.original_tag)) | |
+ self.assertEqual(b'', bytes(head.original_end_tag)) | |
+ self.assertEqual(0, head.children.length) | |
body = root.children[1] | |
- self.assertNotEquals(body, doctype_node) | |
- self.assertEquals(gumboc.NodeType.ELEMENT, body.type) | |
- self.assertEquals(gumboc.Tag.BODY, body.tag) | |
- self.assertEquals('body', body.tag_name) | |
- self.assertEquals(1, len(body.children)) | |
+ self.assertNotEqual(body, doctype_node) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, body.type) | |
+ self.assertEqual(gumboc.Tag.BODY, body.tag) | |
+ self.assertEqual(b'body', body.tag_name) | |
+ self.assertEqual(1, len(body.children)) | |
text_node = body.children[0] | |
- self.assertEquals(gumboc.NodeType.TEXT, text_node.type) | |
- self.assertEquals('Test', text_node.text) | |
+ self.assertEqual(gumboc.NodeType.TEXT, text_node.type) | |
+ self.assertEqual(b'Test', text_node.text) | |
def testBufferThatGoesAway(self): | |
for i in range(10): | |
- source = StringIO.StringIO('<foo bar=quux>1<p>2</foo>') | |
+ source = io.StringIO('<foo bar=quux>1<p>2</foo>') | |
parse_tree = gumboc.parse(source.read()) | |
source.close() | |
with parse_tree as output: | |
root = output.contents.root.contents | |
body = root.children[1] | |
foo = body.children[0] | |
- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type) | |
- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag) | |
- self.assertEquals('<foo bar=quux>', str(foo.original_tag)) | |
- self.assertEquals('', str(foo.original_end_tag)) | |
- self.assertEquals('foo', foo.tag_name.decode('utf-8')) | |
- self.assertEquals('bar', foo.attributes[0].name) | |
- self.assertEquals('quux', foo.attributes[0].value) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type) | |
+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag) | |
+ self.assertEqual('<foo bar=quux>', str(foo.original_tag)) | |
+ self.assertEqual(b'', bytes(foo.original_end_tag)) | |
+ self.assertEqual(b'foo', foo.tag_name) | |
+ self.assertEqual(b'bar', foo.attributes[0].name) | |
+ self.assertEqual(b'quux', foo.attributes[0].value) | |
def testUnknownTag(self): | |
with gumboc.parse('<foo bar=quux>1<p>2</foo>') as output: | |
root = output.contents.root.contents | |
body = root.children[1] | |
foo = body.children[0] | |
- self.assertEquals(gumboc.NodeType.ELEMENT, foo.type) | |
- self.assertEquals(gumboc.Tag.UNKNOWN, foo.tag) | |
- self.assertEquals('<foo bar=quux>', str(foo.original_tag)) | |
- self.assertEquals('', str(foo.original_end_tag)) | |
- self.assertEquals('foo', foo.tag_name.decode('utf-8')) | |
- self.assertEquals('bar', foo.attributes[0].name) | |
- self.assertEquals('quux', foo.attributes[0].value) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, foo.type) | |
+ self.assertEqual(gumboc.Tag.UNKNOWN, foo.tag) | |
+ self.assertEqual('<foo bar=quux>', str(foo.original_tag)) | |
+ self.assertEqual(b'', bytes(foo.original_end_tag)) | |
+ self.assertEqual(b'foo', foo.tag_name) | |
+ self.assertEqual(b'bar', foo.attributes[0].name) | |
+ self.assertEqual(b'quux', foo.attributes[0].value) | |
def testSarcasm(self): | |
with gumboc.parse('<div><sarcasm><div></div></sarcasm></div>') as output: | |
@@ -96,15 +101,15 @@ class CtypesTest(unittest.TestCase): | |
body = root.children[1] | |
div = body.children[0] | |
sarcasm = div.children[0] | |
- self.assertEquals(gumboc.NodeType.ELEMENT, sarcasm.type) | |
- self.assertEquals(gumboc.Tag.UNKNOWN, sarcasm.tag) | |
- self.assertEquals('<sarcasm>', str(sarcasm.original_tag)) | |
- self.assertEquals('</sarcasm>', str(sarcasm.original_end_tag)) | |
- self.assertEquals('sarcasm', sarcasm.tag_name.decode('utf-8')) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, sarcasm.type) | |
+ self.assertEqual(gumboc.Tag.UNKNOWN, sarcasm.tag) | |
+ self.assertEqual(b'<sarcasm>', bytes(sarcasm.original_tag)) | |
+ self.assertEqual(b'</sarcasm>', bytes(sarcasm.original_end_tag)) | |
+ self.assertEqual(b'sarcasm', sarcasm.tag_name) | |
def testEnums(self): | |
- self.assertEquals(gumboc.Tag.A, gumboc.Tag.A) | |
- self.assertEquals(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A)) | |
+ self.assertEqual(gumboc.Tag.A, gumboc.Tag.A) | |
+ self.assertEqual(hash(gumboc.Tag.A.value), hash(gumboc.Tag.A)) | |
def testFragment(self): | |
with gumboc.parse( | |
@@ -112,11 +117,11 @@ class CtypesTest(unittest.TestCase): | |
fragment_context=gumboc.Tag.TITLE, | |
fragment_namespace=gumboc.Namespace.SVG) as output: | |
root = output.contents.root.contents | |
- self.assertEquals(1, len(root.children)) | |
+ self.assertEqual(1, len(root.children)) | |
div = root.children[0] | |
- self.assertEquals(gumboc.NodeType.ELEMENT, div.type) | |
- self.assertEquals(gumboc.Tag.DIV, div.tag) | |
- self.assertEquals(gumboc.Namespace.HTML, div.tag_namespace) | |
+ self.assertEqual(gumboc.NodeType.ELEMENT, div.type) | |
+ self.assertEqual(gumboc.Tag.DIV, div.tag) | |
+ self.assertEqual(gumboc.Namespace.HTML, div.tag_namespace) | |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment