Latent Semantic Analysis with SVD
# Goal: to explore document and term loadings with SVD, Latent Semantic Analysis
# Following the procedures outlined in
# https://sites.socsci.uci.edu/~lpearl/courses/readings/Evangelopoulos2013_LatentSemAnalysis.pdf

library(janeaustenr)
library(dplyr)
library(tidyr)
library(tidytext)
library(stringr)

# create the document-term matrix dataframe, normalized by tf-idf
# documents defined in chunks of 50 lines
document_term <- janeaustenr::austen_books() |>
  group_by(book) |>
  mutate(line = row_number()) |>
  mutate(chunk = (line %/% 50) + 1) |>
  ungroup() |>
  unnest_tokens(word, text) |>
  count(book, chunk, word) |>
  anti_join(stop_words, by = "word") |>
  filter(!str_detect(word, "[0-9]")) |>
  mutate(document = paste(book, chunk, sep = "_")) |>
  select(-book, -chunk) |>
  bind_tf_idf(word, document, n) |>
  select(document, word, tf_idf) |>
  replace_na(list(tf_idf = 0)) |>
  filter(is.finite(tf_idf)) |>
  pivot_wider(id_cols = c("document"), names_from = "word",
              values_from = "tf_idf", values_fill = 0,
              names_repair = "minimal")
# term names are the column names of the document-term matrix
# (dropping the leading "document" id column)
all_terms <- document_term |> names()
all_terms <- all_terms[2:length(all_terms)]
svd_result <- document_term |>
  select(-document) |>
  svd()
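# svd() returns the decomposition X = U D V^T of the (documents x terms)
# matrix: u has one row per document, v one row per term, and d holds the
# singular values. An illustrative peek at the pieces (not in the original):
str(svd_result, max.level = 1)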
# Extract singular values
singular_values <- svd_result$d
# Calculate the variance explained by each singular value
variance_explained <- (singular_values^2) / sum(singular_values^2)
# Calculate the cumulative percentage of variance explained
cumulative_variance_explained <- cumsum(variance_explained) * 100
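# One way to use the cumulative variance (a sketch, not part of the original
# procedure): find the smallest number of dimensions that reaches a target
# share, e.g. 50%, and compare it against the fixed k = 100 chosen below.
target_pct <- 50
k_suggested <- which(cumulative_variance_explained >= target_pct)[1]
k_suggested
plot(cumulative_variance_explained, type = "l",
     xlab = "number of singular values",
     ylab = "cumulative % of variance explained")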
# let's take the first 100 dimensions (k = 100)
k <- 100
# Extract the first k singular values
singular_values_k <- svd_result$d[1:k]
# Extract the first k columns of u and v
u_k <- svd_result$u[, 1:k]
v_k <- svd_result$v[, 1:k]
# Calculate term and document loadings; the matrix is documents x terms,
# so u corresponds to documents and v to terms
document_loadings <- u_k %*% diag(singular_values_k) # m x k matrix where m = number of documents
term_loadings <- v_k %*% diag(singular_values_k) # n x k matrix where n = number of terms
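# Sanity check on the shapes (illustrative): term_loadings should have one row
# per term and document_loadings one row per document.
dim(term_loadings)     # length(all_terms) x k
dim(document_loadings) # number of documents x k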
calc_cosine_sim <- function(A, B) {
  a_denom <- sum(A^2) |> sqrt()
  b_denom <- sum(B^2) |> sqrt()
  numerator <- sum(A*B)
  numerator/(a_denom*b_denom)
}
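# Quick check that the cosine similarity behaves as expected (illustrative):
# a vector is perfectly similar to itself, and orthogonal vectors score 0.
calc_cosine_sim(c(1, 2, 3), c(1, 2, 3)) # 1
calc_cosine_sim(c(1, 0), c(0, 1))       # 0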
find_closest_term <- function(term_index) {
  result <- vapply(seq_len(nrow(term_loadings)), \(x) {
    if (x == term_index) { # don't want to match against the term itself
      0.0
    } else {
      calc_cosine_sim(term_loadings[term_index, ], term_loadings[x, ])
    }
  }, numeric(1))
  all_terms[which.max(result)]
}
# let's find the closest term to housekeeper (term 50)
term_test <- 50
all_terms[term_test]
find_closest_term(term_test)
# closest term is 'home', very cool!
# let's try with feelings, term 150
term_test <- 150
all_terms[term_test]
find_closest_term(term_test)
# attachment is the answer, seems like it's working pretty well :)
## comparing documents 1 and 1000
calc_cosine_sim(document_loadings[1, ], document_loadings[1000, ]) # result is 0.20
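# Optional extension (a sketch, not in the original gist): the same nearest-
# neighbour idea works for documents, mapping the best match back to its
# book_chunk label via the `document` column kept in document_term.
all_documents <- document_term$document
find_closest_document <- function(doc_index) {
  sims <- vapply(seq_len(nrow(document_loadings)), \(x) {
    if (x == doc_index) 0.0 # skip the document itself
    else calc_cosine_sim(document_loadings[doc_index, ], document_loadings[x, ])
  }, numeric(1))
  all_documents[which.max(sims)]
}
find_closest_document(1) # e.g. which chunk is most similar to the first chunk?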