A pure-Julia tokenizer, similar to the Tokenizer in Keras, built on TextAnalysis.jl
using TextAnalysis

# Minimal Keras-style tokenizer: maps words to integer indices and back.
mutable struct Tokenizer
    max_vocab::Int                  # keep only the max_vocab most frequent words (0 = no limit)
    word_index::Dict{String, Int}
    index_word::Dict{Int, String}
    filters::Regex                  # characters stripped from the text before tokenizing
    lower::Bool
    # Default filters match any one of the characters Keras strips by default.
    # (Keras takes them as a plain string; here they must be a character class.)
    Tokenizer(
        max_vocab::Int = 0,
        filters::Regex = r"[!\"#$%&()*+,\-./:;<=>?@\[\\\]^_`{|}~\t\n]",
        lower::Bool = true
    ) = new(max_vocab, Dict{String, Int}(), Dict{Int, String}(), filters, lower)
end
function fit_on_texts!(tokenizer::Tokenizer, texts::Vector{String})
    corpus = Corpus(StringDocument.(texts))
    if tokenizer.lower
        remove_case!(corpus)
    end
    remove_patterns!(corpus, tokenizer.filters)
    update_lexicon!(corpus)
    if tokenizer.max_vocab > 0
        # keep the most frequent words, up to max_vocab (or fewer if the lexicon is smaller)
        by_freq = sort(collect(corpus.lexicon), by = x -> x[2], rev = true)
        vocab = getindex.(by_freq[1:min(tokenizer.max_vocab, end)], 1)
        tokenizer.word_index = TextAnalysis.columnindices(vocab)
    else
        tokenizer.word_index = TextAnalysis.columnindices(sort(collect(keys(lexicon(corpus)))))
    end
    tokenizer.index_word = Dict(idx => word for (word, idx) in tokenizer.word_index)
    return tokenizer
end
function texts_to_sequences(tokenizer::Tokenizer, texts::Vector{String})
    corpus = Corpus(StringDocument.(texts))
    if tokenizer.lower
        remove_case!(corpus)
    end
    remove_patterns!(corpus, tokenizer.filters)
    sequences = Vector{Vector{Int}}(undef, length(corpus.documents))
    for i in eachindex(sequences)
        doc_tokens = tokens(corpus.documents[i])
        sequence = zeros(Int, length(doc_tokens))
        for k in eachindex(sequence)
            # out-of-vocabulary words map to 0
            sequence[k] = get(tokenizer.word_index, doc_tokens[k], 0)
        end
        sequences[i] = sequence
    end
    return sequences
end
function sequences_to_texts(tokenizer::Tokenizer, sequences::Vector{Vector{Int}})
    text_sequences = Vector{Vector{String}}(undef, length(sequences))
    for i in eachindex(sequences)
        # skip indices with no known word (e.g. the 0 used for out-of-vocabulary tokens)
        text_sequences[i] = [tokenizer.index_word[w] for w in sequences[i] if haskey(tokenizer.index_word, w)]
    end
    return text_sequences
end
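
A quick round trip through the three functions, assuming TextAnalysis.jl is installed; the sample sentences are made up for illustration:

using TextAnalysis

tok = Tokenizer(10)  # keep only the 10 most frequent words
fit_on_texts!(tok, ["the cat sat on the mat", "the dog sat on the log"])

seqs = texts_to_sequences(tok, ["the cat sat", "a dog barked"])
# "a" and "barked" were never seen during fitting, so they map to 0

texts = sequences_to_texts(tok, seqs)
# recovers the known words; indices with no entry (the 0s) are skipped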