Created
August 22, 2010 20:52
-
-
Save alextp/544268 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import random | |
import math | |
from scipy.special import gamma,gammaln | |
from scipy import weave | |
import sys | |
import collections | |
from libbayes import discrete, gamma_pdf, slice_sample, exp_pdf | |
from libbayes import single_collapsed_likelihood as scl | |
from toputils import get_words | |
from . import sumbasic | |
class ContentSampler(object): | |
def __init__(self, classes, nt): | |
self.all_words = [] | |
self.reverse_map = {} | |
self.doc_map = {} | |
self.documents = [] | |
self.Ndocuments = 0 | |
self.Nwords = 0 | |
self.alpha = 0.01 | |
self.beta = 6. | |
self.Nt = nt | |
self.load_docs(classes, sf) | |
self.Nwords = len(self.all_words) | |
self.Ndocuments = len(self.documents) | |
self.assignments = [[0 for w in d] for d in self.documents] | |
self.initialize() | |
def load_doc(self,document): | |
v = [] | |
doc = document[0] | |
for sent in doc.split("."): | |
if not sent: continue | |
s = [] | |
v.append(s) | |
for w in get_words(sent, stop=False): | |
w = w.lower() | |
if not w in self.reverse_map: | |
self.reverse_map[w] = self.Nwords | |
self.all_words.append(w) | |
self.Nwords += 1 | |
s.append(self.reverse_map[w]) | |
self.doc_map[doc] = len(self.documents) | |
self.docs.append(document) | |
self.documents.append(v) | |
def load_docs(self, classes): | |
prod_map = {} | |
self.Nproducts = 0 | |
self.prod = [] | |
self.docs = [] | |
for c in classes: | |
self.goodness_threshold = len(self.documents) | |
for prod,docs in c: | |
prod_map[prod] = self.Nproducts | |
for doc in docs: | |
self.prod.append(self.Nproducts) | |
self.load_doc(doc) | |
self.Nproducts += 1 | |
def initialize(self): | |
self.t = np.zeros((self.Nt,self.Nt)) | |
self.topics = np.zeros((self.Nt,self.Nwords)) | |
self.ntopics = np.zeros(self.Nt) | |
self.assignments = [] | |
self.documents = [map(np.array, di) for di in self.documents] | |
self.msample = np.zeros((self.Ndocuments,self.Nt)) | |
for d in xrange(self.Ndocuments): | |
ad = [] | |
self.assignments.append(ad) | |
p = 0 | |
for i in xrange(len(self.documents[d])): | |
y = random.randint(1,self.Nt-1) | |
ad.append(y) | |
for w in self.documents[d][i]: | |
self.ntopics[y] += 1 | |
self.topics[y,w] += 1 | |
self.t[p,y] += 1 | |
p = y | |
self.t[y,0] += 1 | |
self.assignments = map(np.array, self.assignments) | |
def c_add(self,dist,norm,wordset, args=["Nws", "alpha", "w", "dist", "norm", "p"]): | |
p = 1. | |
dn = len(dist)*self.alpha | |
norm += float(dn) | |
Nws = len(wordset) | |
w = wordset | |
alpha = self.alpha | |
p = weave.inline(""" | |
for (int i = 0; i < Nws; ++i) { | |
p *= ((double)(dist(w(i))+alpha))/((double)norm); | |
norm = (double)norm + 1.; | |
dist(w(i)) = dist(w(i))+1.; | |
} | |
for (int i = 0; i < Nws; ++i) { | |
dist(w(i)) = dist(w(i))-1.; | |
} | |
return_value = p; | |
""", arg_names=args, type_converters=weave.converters.blitz) | |
return p | |
def resample_sentence(self, d,i,s,pt): | |
"Resamples the topic assignments of a sentence" | |
y = self.assignments[d][i] | |
if i == 0: | |
ym1 = 0 | |
else: | |
ym1 = self.assignments[d][i-1] | |
if i == len(self.assignments[d]) -1: | |
yp1 = 0 | |
else: | |
yp1 = self.assignments[d][i+1] | |
self.t[ym1,y] -= 1 | |
assert self.t[ym1,y] >= 0 | |
self.t[y,yp1] -= 1 | |
assert self.t[yp1,y] >= 0 | |
for w in s: | |
self.topics[y,w] -= 1 | |
self.ntopics[y] -= 1 | |
assert self.topics[y,w] >= 0, "%s %s"%(y,w) | |
assert self.ntopics[y] >= 0 | |
pt.fill(1) | |
pt *= (self.t[ym1]+self.beta) | |
pt *= (self.t.T[yp1]+self.beta) | |
for j in xrange(1,self.Nt): | |
pt[j] *= self.c_add(self.topics[j],self.ntopics[j],s) | |
pt[0] *=0 | |
pt /= np.sum(pt) | |
nt = discrete(pt) | |
self.assignments[d][i] = nt | |
for w in s: | |
self.topics[nt,w] += 1 | |
self.ntopics[nt] += 1 | |
self.t[ym1,nt] += 1 | |
self.t[nt,yp1] += 1 | |
def proportions(self, d): | |
p = np.zeros(self.Nt)+self.beta | |
for a in self.assignments[d]: | |
p[a] += 1 | |
return p/np.sum(p) | |
def resample_beta(self): | |
def partial_lik(t0): | |
self.beta = t0 | |
return scl(self.beta, 2, 10, self.t) | |
self.beta = slice_sample(partial_lik, self.beta) | |
def resample_alpha(self): | |
def partial_lik(t0): | |
self.alpha = t0 | |
return scl(self.alpha, 2, 10, self.topics) | |
self.alpha = slice_sample(partial_lik, self.alpha) | |
def iterate(self): | |
self.resample_beta() | |
print self.beta | |
self.resample_alpha() | |
print self.alpha | |
pt = np.zeros(self.Nt) | |
for document in xrange(self.Ndocuments): | |
for i,word in enumerate(self.documents[document]): | |
self.resample_sentence(document,i,word,pt) | |
self.msample[document] += self.proportions(document) | |
def run(self,its): | |
"The sampler itself." | |
iteration = 0 | |
print "iterating.." | |
for i in xrange(its): | |
iteration += 1 | |
self.iterate() | |
def print_topic(model, t, n): | |
s = np.argsort(-t) | |
for w in s[:n]: | |
print " ",model.all_words[w] | |
def print_keyw_topic(model, t, n):
    """Print the top-n words of a topic given sparsely as a
    {word_id: weight} dict, by densifying it and delegating to print_topic."""
    dense = np.zeros(len(model.all_words))
    for word_id, weight in t.items():
        dense[word_id] = weight
    print_topic(model, dense, n)
def top_keyw_topic(model, t, n):
    """Return the n highest-weighted words of a topic given sparsely as a
    {word_id: weight} dict."""
    dense = np.zeros(len(model.all_words))
    for word_id, weight in t.items():
        dense[word_id] = weight
    order = np.argsort(-dense)
    return [model.all_words[i] for i in order[:n]]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment