A fast and high-performance distributed word representation: GloVe (PyTorch implementation) ref: https://qiita.com/GushiSnow/items/e92ac2fea4f8448491ba
J(\theta) = \frac{1}{2}\sum^{W}_{i,j=1} f(P_{ij})\,(u^{T}_{i} v_j - \log{P_{ij}})^2
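The weighting function f in this objective down-weights rare co-occurrences and clips very frequent ones at 1. Written out, with x_max = 100 and alpha = 0.75 as in the snippet just below, it is:

f(x) = \begin{cases} (x / x_{max})^{\alpha} & (x < x_{max}) \\ 1 & (\text{otherwise}) \end{cases}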
def __weighting(self, X_ik: dict, w_i: str, w_j: str):
    # Look up the raw co-occurrence count; fall back to 1 for unseen pairs
    try:
        x_ij = X_ik[(w_i, w_j)]
    except KeyError:
        x_ij = 1
    x_max = 100
    alpha = 0.75
    # Clip the weight at 1 for very frequent co-occurrences
    if x_ij < x_max:
        result = (x_ij / x_max) ** alpha
    else:
        result = 1
    return result
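As a quick standalone check of this weighting (the counts dict here is hypothetical, only for illustration):

def weighting(X_ik, w_i, w_j, x_max=100, alpha=0.75):
    # Same rule as __weighting above, written as a free function
    x_ij = X_ik.get((w_i, w_j), 1)
    return (x_ij / x_max) ** alpha if x_ij < x_max else 1

counts = {('the', 'cat'): 150, ('cat', 'sat'): 3}  # hypothetical counts
print(weighting(counts, 'the', 'cat'))  # 1 (clipped at x_max)
print(weighting(counts, 'cat', 'sat'))  # (3/100) ** 0.75 ≈ 0.072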
def forward(self, center_words, target_words, coocs, weights):
    center_embeds = self.embedding_v(center_words)
    target_embeds = self.embedding_u(target_words)
    # Reference(squeeze)
    # http://pytorch.org/docs/master/torch.html#torch.squeeze
    center_bias = self.v_bias(center_words).squeeze(1)
    target_bias = self.u_bias(target_words).squeeze(1)
    # Dot product between context and center vectors, one value per example
    inner_product = target_embeds.bmm(center_embeds.transpose(1, 2)).squeeze(2)  # noqa
    # Weighted squared error against the log co-occurrence counts (coocs)
    loss = weights * torch.pow(inner_product + center_bias + target_bias - coocs, 2)  # noqa
    return torch.sum(loss)
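The forward pass above assumes embedding and bias tables created in the model's constructor, which is not shown in the gist. A minimal sketch of what that constructor could look like (the class name and initialization details are assumptions; only the layer names are taken from forward()):

import torch
import torch.nn as nn


class Glove(nn.Module):
    # Hypothetical constructor matching the layers used in forward()
    def __init__(self, vocab_size, embedding_dim):
        super(Glove, self).__init__()
        self.embedding_v = nn.Embedding(vocab_size, embedding_dim)  # center word vectors
        self.embedding_u = nn.Embedding(vocab_size, embedding_dim)  # context word vectors
        self.v_bias = nn.Embedding(vocab_size, 1)  # center word bias b_i
        self.u_bias = nn.Embedding(vocab_size, 1)  # context word bias b_j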
x
-0.9697 0.1701 -0.5611
0.0019 -0.1810 0.1066
[torch.FloatTensor of size 2x3]
torch.cat(x)
-0.9697
0.1701
-0.5611
0.0019
-0.1810
0.1066
[torch.FloatTensor of size 6]
losses = []
for epoch in range(self.epoch):
    for i, batch in enumerate(get_batch(batch_size=self.batch_size,
                                        train_data=train_data)):
        # Each element of batch is an (input, target, cooc, weight) tuple
        inputs, targets, coocs, weights = zip(*batch)
        inputs = torch.cat(inputs)
        targets = torch.cat(targets)
        coocs = torch.cat(coocs)
        weights = torch.cat(weights)
        self.model.zero_grad()
        loss = self.model(inputs, targets, coocs, weights)
        loss.backward()
        self.optimizer.step()
        losses.append(loss.data.tolist()[0])
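get_batch itself is not shown in the gist. A minimal sketch of a compatible mini-batch generator (the shuffling and the exact signature are assumptions):

import random


def get_batch(batch_size, train_data):
    # Yield successive mini-batches of (input, target, cooc, weight) tuples
    random.shuffle(train_data)
    for i in range(0, len(train_data), batch_size):
        yield train_data[i:i + batch_size]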
import codecs
from operator import methodcaller


def read_file(file_name: str):
    with codecs.open(file_name, 'r', encoding='utf-8', errors='ignore') as f:
        read_data = f.read().split('\n')
        read_data = list(map(methodcaller("split", " "), read_data))
    return read_data
[['"', 'I', 'thought', 'so', '.'], ['All', 'right', ';', 'take', 'a', 'seat', '.'], ['Supper', '?--', 'you', 'want', 'supper', '?'], ['Supper', "'", 'll', 'be', 'ready', 'directly', '."']]
{'': 0, '</s>': 1, '、': 2, '。': 3, 'が': 4}
{0: '', 1: '</s>', 2: '、', 3: '。', 4: 'が'}
def __make_word2index(self, vocab: list=[]):
    word2index = {}
    for vo in vocab:
        if vo not in word2index.keys():
            word2index[vo] = len(word2index)
    index2word = {v: k for k, v in word2index.items()}
    word2index = dict(collections.OrderedDict(sorted(word2index.items(),
                                                     key=lambda t: t[1])))
    index2word = dict(collections.OrderedDict(sorted(index2word.items(),
                                                     key=lambda t: t[0])))
    return word2index, index2word
def __make_window_data(self, window_size: int=5,
                       corpus: list=[]):
    # Pad each sentence with <DUMMY> tokens so every word has a full window
    windows = flatten([list(nltk.ngrams(['<DUMMY>'] * window_size + c +
                                        ['<DUMMY>'] * window_size,
                                        window_size*2+1)) for c in corpus])
    window_data = []
    for window in windows:
        for i in range(window_size*2 + 1):
            # Skip the center word itself and the padding tokens
            if i == window_size or window[i] == '<DUMMY>':
                continue
            window_data.append((window[window_size], window[i]))
    return window_data
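For a toy sentence and window_size=2, the resulting (center, context) pairs look like this (a standalone sketch of the same windowing logic, calling nltk.ngrams directly instead of the class method):

import nltk

window_size = 2
sentence = ['I', 'thought', 'so', '.']
padded = ['<DUMMY>'] * window_size + sentence + ['<DUMMY>'] * window_size
pairs = []
for window in nltk.ngrams(padded, window_size * 2 + 1):
    for i in range(window_size * 2 + 1):
        if i == window_size or window[i] == '<DUMMY>':
            continue
        pairs.append((window[window_size], window[i]))
print(pairs[:4])  # [('I', 'thought'), ('I', 'so'), ('thought', 'I'), ('thought', 'so')]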
>>> import itertools
>>> A = ['a', 'b', 'c']
>>> list(itertools.combinations_with_replacement(A, 3))
[('a', 'a', 'a'),
('a', 'a', 'b'),
('a', 'a', 'c'),
('a', 'b', 'b'),
('a', 'b', 'c'),
('a', 'c', 'c'),
('b', 'b', 'b'),
('b', 'b', 'c'),
('b', 'c', 'c'),
('c', 'c', 'c')]
def __make_co_occurence_matrix(self,
                               window_data: list=[],
                               vocab: list=[]):
    X_ik_window_5 = Counter(window_data)
    X_ik = {}
    weighting_dict = {}
    for bigram in combinations_with_replacement(vocab, 2):
        if bigram in X_ik_window_5.keys():
            co_occer = X_ik_window_5[bigram]
            # Store the co-occurrence count symmetrically (smoothed by +1)
            X_ik[bigram] = co_occer + 1
            X_ik[bigram[1], bigram[0]] = co_occer + 1
        weighting_dict[bigram] = self.__weighting(X_ik=X_ik,
                                                  w_i=bigram[0],
                                                  w_j=bigram[1])
        weighting_dict[bigram[1], bigram[0]] = \
            self.__weighting(X_ik=X_ik, w_i=bigram[1], w_j=bigram[0])
    weighting_dict = dict(collections.OrderedDict(
        sorted(weighting_dict.items(), key=lambda t: t[1])))
    return X_ik, weighting_dict
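The training loop earlier consumes train_data as a list of (input, target, cooc, weight) tuples of 1x1 tensors. A sketch of how such tuples could be assembled from window_data, X_ik, the weighting dict, and word2index (the helper name prepare_train_data, the pre-0.4 Variable wrappers, and taking the log of the count to match the objective are all assumptions; this glue code is not shown in the gist):

import math

import torch
from torch.autograd import Variable  # pre-0.4 PyTorch style, matching the gist


def prepare_train_data(window_data, X_ik, weighting_dict, word2index):
    # Hypothetical glue code: turn each (center, context) pair into the
    # (input, target, cooc, weight) tuple the training loop expects.
    train_data = []
    for center, context in window_data:
        cooc = X_ik.get((center, context), 1)
        train_data.append((
            Variable(torch.LongTensor([[word2index[center]]])),   # center word index
            Variable(torch.LongTensor([[word2index[context]]])),  # context word index
            Variable(torch.FloatTensor([[math.log(cooc)]])),      # log co-occurrence
            Variable(torch.FloatTensor([[weighting_dict[(center, context)]]])),  # f(X_ij)
        ))
    return train_data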