Created
November 23, 2020 11:51
-
-
Save sarthakpranesh/21482a0d5363fe9a252934de36f66396 to your computer and use it in GitHub Desktop.
Language Classification using nnGo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main | |
import ( | |
"encoding/csv" | |
"fmt" | |
"os" | |
"github.com/sarthakpranesh/nnGo" | |
) | |
const (
	// testIndex is the dataset row printed as a sanity-check sample in main.
	testIndex = 122
	// ngram is the character n-gram length used to split titles into features.
	ngram = 5
)
var (
	// langs lists every language the classifier distinguishes.
	langs = []string{"hindi", "tamil", "telugu", "marathi", "kannada"}

	// langsHot maps each language to its one-hot label vector,
	// ordered the same way as langs.
	langsHot = map[string][]float64{
		"hindi":   {1, 0, 0, 0, 0},
		"tamil":   {0, 1, 0, 0, 0},
		"telugu":  {0, 0, 1, 0, 0},
		"marathi": {0, 0, 0, 1, 0},
		"kannada": {0, 0, 0, 0, 1},
	}
)
// RawData holds the dataset as read from the CSV file:
// titles[i] is a movie title and langs[i] is its language label.
type RawData struct {
	titles []string
	langs  []string
}
// GramData holds the n-gram form of the dataset:
// titles[i] is the list of character n-grams of the i-th title,
// and langs[i] is its language label.
type GramData struct {
	titles [][]string
	langs  []string
}
// LoadCsv helps in importing csv data into a RawData type | |
func LoadCsv(name string) (RawData, error) { | |
f, err := os.Open(name) | |
if err != nil { | |
return RawData{}, err | |
} | |
defer f.Close() | |
lines, err := csv.NewReader(f).ReadAll() | |
if err != nil { | |
return RawData{}, err | |
} | |
var titles []string | |
var langs []string | |
for i, line := range lines { | |
if i == 0 { | |
continue | |
} | |
titles = append(titles, line[0]) | |
langs = append(langs, line[1]) | |
} | |
return RawData{ | |
titles: titles, | |
langs: langs, | |
}, nil | |
} | |
// Tokenize helps convert the string tweet into a list of words | |
func NGramConvertion(rawData RawData) GramData { | |
gramData := GramData{ | |
langs: rawData.langs, | |
} | |
for _, title := range rawData.titles { | |
var gram []string | |
for i := ngram; i <= len(title); i++ { | |
gram = append(gram, title[i-ngram:i]) | |
} | |
gramData.titles = append(gramData.titles, gram) | |
} | |
return gramData | |
} | |
// CreateDictionaries creates two dictionaries, one for positive (non hate speech) and the other for negative (hate speech) | |
func CreateDictionaries(gramData GramData) map[string]map[string]float64 { | |
dict := make(map[string]map[string]float64) | |
for _, val := range langs { | |
dict[val] = make(map[string]float64) | |
} | |
for i, title := range gramData.titles { | |
for _, gram := range title { | |
val, _ := dict[gramData.langs[i]][gram] | |
val = val + 1 | |
dict[gramData.langs[i]][gram] = val | |
} | |
} | |
return dict | |
} | |
// NormalizeDictionary rescales every value in dict into [0, 1] by
// dividing by the largest value, turning raw n-gram counts into
// relative frequencies. The map is modified in place and also returned
// for convenience.
//
// If the map is empty or every value is zero (or negative) there is no
// positive maximum; the map is returned unchanged instead of dividing
// by zero and poisoning the values with NaN/Inf.
func NormalizeDictionary(dict map[string]float64) map[string]float64 {
	var max float64
	for _, v := range dict {
		if v > max {
			max = v
		}
	}
	if max == 0 {
		return dict
	}
	for k, v := range dict {
		dict[k] = v / max
	}
	return dict
}
func GenerateNNData(gramData GramData, dicts map[string]map[string]float64) ([][]float64, [][]float64) { | |
var tData, tLabel [][]float64 | |
for i, title := range gramData.titles { | |
tLabel = append(tLabel, langsHot[gramData.langs[i]]) | |
inputData := []float64{1, 0, 0, 0, 0, 0} | |
for _, gram := range title { | |
for i, val := range langs { | |
inputData[i+1] = inputData[i+1] + dicts[val][gram] | |
} | |
} | |
tData = append(tData, inputData) | |
} | |
return tData, tLabel | |
} | |
func main() { | |
// loading csv data | |
rawCsvData, err := LoadCsv("combined.csv") | |
if err != nil { | |
fmt.Println(err) | |
return | |
} | |
fmt.Println("Title:", rawCsvData.titles[testIndex], "\tLanguage:", rawCsvData.langs[testIndex]) | |
// Tokenize raw data | |
gramData := NGramConvertion(rawCsvData) | |
fmt.Println("Title:", gramData.titles[testIndex], "\tLanguage:", gramData.langs[testIndex]) | |
// Create Dictionaries | |
dicts := CreateDictionaries(gramData) | |
// Normalizing Dictionaries - creating probabilities | |
for _, val := range langs { | |
dicts[val] = NormalizeDictionary(dicts[val]) | |
} | |
movieTitles, movieLangs := GenerateNNData(gramData, dicts) | |
fmt.Println("Title:", movieTitles[testIndex], "\tLanguage:", movieLangs[testIndex]) | |
trainData := movieTitles[:5000] | |
trainLabel := movieLangs[:5000] | |
testData := movieTitles[5000:] | |
testLabel := movieLangs[5000:] | |
// Neural Network Model | |
nn := nnGo.NewNN(6, 100, len(langs), 0.00000006, "sgd", 2000) | |
nn.Train(trainData, trainLabel) | |
fmt.Println("Actual Encoded Movie Title:", testData[21]) | |
fmt.Println("Actual Language:", testLabel[21]) | |
nn.Predict(testData[21]) | |
fmt.Println("Actual Encoded Movie Title:", testData[22]) | |
fmt.Println("Actual Language:", testLabel[22]) | |
nn.Predict(testData[22]) | |
var correct, wrong float64 | |
for i, val := range testData { | |
tLabel := testLabel[i] | |
pred := nn.Predict(val) | |
var max float64 | |
var maxIndex int | |
for i, val := range pred { | |
if val[0] > max { | |
max = val[0] | |
maxIndex = i | |
} | |
} | |
if tLabel[maxIndex] == 1 { | |
correct++ | |
} else { | |
wrong++ | |
} | |
} | |
fmt.Println("Number of Correct results:", correct) | |
fmt.Println("Number of Wrong results:", wrong) | |
fmt.Println("Accuracy:", correct*100/(correct+wrong)) | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment