andcarnivorous · January 5, 2019 17:40
diff --git a/text-selfsim-matrix.py b/text-selfsim-matrix.py
 from matplotlib import cm as cm
 import matplotlib.pyplot as plt
 import numpy as np
 import seaborn as sns
 from nltk.tokenize import word_tokenize
 import re
 from scipy import sparse

 def repetitionMatrix(_input, title = "", kind = False, cmap = "Reds"):
         """Plot the self-similarity (repetition) matrix of a text.

         The text is lower-cased, a fixed set of punctuation characters,
         newlines and apostrophes is replaced by spaces, and the result is
         split with ``word_tokenize``.  For ``n`` tokens an ``n x n`` matrix
         is built in which cell ``(i, j)`` holds the number of times token
         ``i`` occurs in the whole text when tokens ``i`` and ``j`` are the
         same word, and ``0`` otherwise.  The matrix is drawn on the current
         matplotlib axes; nothing is returned.

         Parameters
         ----------
         _input : str
                 Raw text to analyse.
         title : str
                 Title set on the current axes.
         kind : str or False
                 ``"sns"`` draws a seaborn heatmap, ``"sparse"`` draws a
                 ``plt.spy`` marker plot of the matrix in CSR form; any other
                 value falls back to ``plt.imshow``.
         cmap : str
                 Colormap passed to the heatmap / ``imshow`` renderers.  It is
                 not forwarded to the ``"sparse"`` marker plot.

         Raises
         ------
         ValueError
                 If the text contains no tokens after cleaning.
         """
         # One raw-string pattern replaces the two original substitutions;
         # both mapped their characters to a single space, so the result is
         # the same, and the raw string avoids invalid-escape warnings.
         cleaned = re.sub(r"[()\-,;:\".?!_\[\]\n']", " ", _input.lower())
         tokens = word_tokenize(cleaned)
         if not tokens:
                 raise ValueError("repetitionMatrix needs at least one token to plot")

         # token_ids[k] identifies the word at position k; counts[id] is how
         # often that word occurs, so token_freq[k] is the frequency of the
         # k-th token in the whole text.
         _, token_ids, counts = np.unique(tokens, return_inverse=True,
                                          return_counts=True)
         token_freq = counts[token_ids]

         # Full n x n comparison.  Building it by broadcasting keeps every
         # row, including the last one, which the earlier slice-based
         # construction dropped.
         same_word = token_ids[:, None] == token_ids[None, :]
         matrix = np.where(same_word, token_freq[:, None], 0)

         if kind == "sns":
                 # Plot using seaborn
                 sns.heatmap(matrix, cmap=cmap, cbar=False, square=True,
                             xticklabels=50, yticklabels=50)
         elif kind == "sparse":
                 # precision=3 limits the markers to cells whose value is
                 # greater than 3 in absolute value.
                 plt.spy(sparse.csr_matrix(matrix), markersize=4, precision=3)
         else:
                 plt.imshow(matrix, cmap=cmap)

         plt.title(title)
	from matplotlib import cm as cm
	import matplotlib.pyplot as plt
	import numpy as np
	import seaborn as sns
	from nltk.tokenize import word_tokenize
	import re
	from scipy import sparse

	def repetitionMatrix(_input, title = "", kind = False, cmap = "Reds"):
	    """Plot the self-similarity (repetition) matrix of a text.

	    The text is lower-cased, a fixed set of punctuation characters,
	    newlines and apostrophes is replaced by spaces, and the result is
	    split with ``word_tokenize``.  For ``n`` tokens an ``n x n`` matrix is
	    built in which cell ``(i, j)`` holds the number of times token ``i``
	    occurs in the whole text when tokens ``i`` and ``j`` are the same
	    word, and ``0`` otherwise.  The matrix is drawn on the current
	    matplotlib axes; nothing is returned.

	    Parameters
	    ----------
	    _input : str
	        Raw text to analyse.
	    title : str
	        Title set on the current axes.
	    kind : str or False
	        ``"sns"`` draws a seaborn heatmap, ``"sparse"`` draws a
	        ``plt.spy`` marker plot of the matrix in CSR form; any other
	        value falls back to ``plt.imshow``.
	    cmap : str
	        Colormap passed to the heatmap / ``imshow`` renderers.  It is not
	        forwarded to the ``"sparse"`` marker plot.

	    Raises
	    ------
	    ValueError
	        If the text contains no tokens after cleaning.
	    """
	    # One raw-string pattern replaces the two original substitutions; both
	    # mapped their characters to a single space, so the result is the
	    # same, and the raw string avoids invalid-escape warnings.
	    cleaned = re.sub(r"[()\-,;:\".?!_\[\]\n']", " ", _input.lower())
	    tokens = word_tokenize(cleaned)
	    if not tokens:
	        raise ValueError("repetitionMatrix needs at least one token to plot")

	    # token_ids[k] identifies the word at position k; counts[id] is how
	    # often that word occurs, so token_freq[k] is the frequency of the
	    # k-th token in the whole text.
	    _, token_ids, counts = np.unique(tokens, return_inverse=True,
	                                     return_counts=True)
	    token_freq = counts[token_ids]

	    # Full n x n comparison.  Building it by broadcasting keeps every row,
	    # including the last one, which the earlier slice-based construction
	    # dropped.
	    same_word = token_ids[:, None] == token_ids[None, :]
	    matrix = np.where(same_word, token_freq[:, None], 0)

	    if kind == "sns":
	        # Plot using seaborn
	        sns.heatmap(matrix, cmap=cmap, cbar=False, square=True,
	                    xticklabels=50, yticklabels=50)
	    elif kind == "sparse":
	        # precision=3 limits the markers to cells whose value is greater
	        # than 3 in absolute value.
	        plt.spy(sparse.csr_matrix(matrix), markersize=4, precision=3)
	    else:
	        plt.imshow(matrix, cmap=cmap)

	    plt.title(title)