dcrosta · November 4, 2011 17:50 · Jun 15, 2012 · Dec 2, 2011 · Nov 4, 2011
diff --git a/htmlabbrev.py b/htmlabbrev.py
@@ -52,7 +52,7 @@ def handle_entityref(self, name):
         self.emit('&%s;' % name)
 
     def handle_charref(self, name):
-        return self.handle_entityref(name)
+        return self.handle_entityref('#%s' % name)
 
     def close(self):
         return ''.join(self.out)
diff --git a/htmlabbrev.py b/htmlabbrev.py
@@ -14,10 +14,10 @@ def __init__(self, maxlength, *args, **kwargs):
         self.out = []
 
     def emit(self, thing, count=False):
+        if count:
+            self.length += len(thing)
         if self.length < self.maxlength:
             self.out.append(thing)
-            if count:
-                self.length += len(thing)
         elif not self.done:
             # trim trailing whitespace
             self.out[-1] = self.out[-1].rstrip()

diff --git a/htmlabbrev.py b/htmlabbrev.py
@@ -0,0 +1,58 @@
+import re
+from HTMLParser import HTMLParser
+
+whitespace = re.compile(r'(\w+)')  # NOTE: despite the name this captures \w+ runs, so split() keeps the words
+
+class HTMLAbbrev(HTMLParser):
+
+    def __init__(self, maxlength, *args, **kwargs):  # extra args go straight to HTMLParser
+        HTMLParser.__init__(self, *args, **kwargs)
+        self.stack = []  # open tag names, innermost last; emit() walks it to close tags
+        self.maxlength = maxlength  # emit() stops appending once self.length reaches this
+        self.length = 0  # running total of len() for things emitted with count=True
+        self.done = False  # set by emit() after it has appended the closing tags
+        self.out = []
+
+    def emit(self, thing, count=False):
+        if self.length < self.maxlength:
+            self.out.append(thing)
+            if count:
+                self.length += len(thing)
+        elif not self.done:
+            # trim trailing whitespace
+            self.out[-1] = self.out[-1].rstrip()
+            # limit reached: finish the output once; every later emit() is ignored
+            # close the still-open tags, innermost first
+            for tag in reversed(self.stack):
+                self.out.append('</%s>' % tag)
+            self.done = True
+
+    def handle_starttag(self, tag, attrs):  # attrs: (name, value) pairs supplied by HTMLParser
+        self.stack.append(tag)  # remembered so emit() can close it if the text is cut off
+        attrs = ' '.join(k if v is None else '%s="%s"' % (k, v) for k, v in attrs)  # valueless attr: v is None
+        self.emit('<%s%s>' % (tag, (' ' + attrs).rstrip()))
+
+    def handle_endtag(self, tag):  # tag must close the innermost open element
+        if self.stack and tag == self.stack[-1]:  # empty stack: stray end tag, report it below
+            self.emit('</%s>' % tag)
+            self.stack.pop()
+        else:
+            raise Exception('end tag %r does not match stack: %r' % (tag, self.stack))
+
+    def handle_startendtag(self, tag, attrs):  # called for self-closing tags such as <br/>
+        # not pushed on self.stack: no end tag will follow, so there is nothing to close later
+        attrs = ' '.join(k if v is None else '%s="%s"' % (k, v) for k, v in attrs)
+        self.emit('<%s%s/>' % (tag, (' ' + attrs).rstrip()))
+
+    def handle_data(self, data):  # text content; the only handler that passes count=True
+        for piece in whitespace.split(data):  # pieces alternate between separators and \w+ runs
+            self.emit(piece, count=True)
+
+    def handle_entityref(self, name):
+        self.emit('&%s;' % name)
+
+    def handle_charref(self, name):
+        return self.handle_entityref(name)
+
+    def close(self):
+        return ''.join(self.out)