@kevin-m-kent
Created August 9, 2024 16:28
Latent Semantic Analysis with SVD
# Goal: to explore document and term loadings with SVD, Latent Semantic Analysis
# Following the procedures outlined in
# https://sites.socsci.uci.edu/~lpearl/courses/readings/Evangelopoulos2013_LatentSemAnalysis.pdf
library(janeaustenr)
library(dplyr)
library(tidyr)
library(tidytext)
library(stringr)
# create the document-term matrix dataframe, normalized by tf-idf
# documents defined in chunks of 50 lines
document_term <- janeaustenr::austen_books() |>
  group_by(book) |>
  mutate(line = row_number()) |>
  mutate(chunk = (line %/% 50) + 1) |>
  ungroup() |>
  unnest_tokens(word, text) |>
  count(book, chunk, word) |>
  anti_join(stop_words) |>
  filter(!str_detect(word, "[0-9]")) |>
  mutate(document = paste(book, chunk, sep = "_")) |>
  select(-book, -chunk) |>
  bind_tf_idf(word, document, n) |>
  select(document, word, tf_idf) |>
  replace_na(list(tf_idf = 0)) |>
  filter(is.finite(tf_idf)) |>
  pivot_wider(id_cols = c("document"), names_from = "word",
              values_from = "tf_idf", values_fill = 0,
              names_repair = "minimal")
# term names: every column except the first (the document id)
all_terms <- document_term |> names()
all_terms <- all_terms[2:length(all_terms)]
# SVD of the document-term matrix (rows = documents, columns = terms)
svd_result <- document_term |>
  select(-document) |>
  svd()
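# (Added check, illustrative only) the SVD factors the documents x terms matrix
# as u %*% diag(d) %*% t(v); u has one row per document, v one row per term.
dim(svd_result$u)
dim(svd_result$v)
length(svd_result$d)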
# Extract singular values
singular_values <- svd_result$d
# Calculate the variance explained by each singular value
variance_explained <- (singular_values^2) / sum(singular_values^2)
# Calculate the cumulative percentage of variance explained
cumulative_variance_explained <- cumsum(variance_explained) * 100
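# (Added, illustrative) a quick look at how much variance the leading dimensions
# retain, e.g. as a simple scree-style plot, can help with choosing k.
head(cumulative_variance_explained, 10)
plot(cumulative_variance_explained, type = "l",
     xlab = "number of singular values", ylab = "cumulative % of variance")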
# let's keep the first 100 dimensions (k = 100)
k <- 100
# Extract the first k singular values
singular_values_k <- svd_result$d[1:k]
# Extract the first k columns of u and v
u_k <- svd_result$u[, 1:k]
v_k <- svd_result$v[, 1:k]
# Calculate term and document loadings
# Note: the matrix passed to svd() has documents as rows and terms as columns,
# so u corresponds to documents and v corresponds to terms.
term_loadings <- v_k %*% diag(singular_values_k)     # n x k matrix where n = number of terms
document_loadings <- u_k %*% diag(singular_values_k) # m x k matrix where m = number of documents
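# (Added sanity check) with this orientation, term_loadings should have one row
# per term and document_loadings one row per document.
nrow(term_loadings) == length(all_terms)
nrow(document_loadings) == nrow(document_term)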
calc_cosine_sim <- function(A, B) {
  a_denom <- sum(A^2) |> sqrt()
  b_denom <- sum(B^2) |> sqrt()
  numerator <- sum(A * B)
  numerator / (a_denom * b_denom)
}
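# (Added, quick illustration of calc_cosine_sim) identical vectors score 1,
# orthogonal vectors score 0.
calc_cosine_sim(c(1, 2, 3), c(1, 2, 3)) # 1
calc_cosine_sim(c(1, 0), c(0, 1))       # 0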
find_closest_term <- function(term_index) {
  sims <- vapply(1:nrow(term_loadings), \(x) {
    if (x == term_index) { # don't want to match against the term itself
      0.0
    } else {
      calc_cosine_sim(term_loadings[term_index, ], term_loadings[x, ])
    }
  }, numeric(1))
  all_terms[which.max(sims)]
}
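# (Added, not in the original gist) a companion helper that runs the same
# nearest-neighbour search over the document loadings and returns the row
# index of the most similar document.
find_closest_document <- function(doc_index) {
  sims <- vapply(1:nrow(document_loadings), \(x) {
    if (x == doc_index) { # skip the document itself
      0.0
    } else {
      calc_cosine_sim(document_loadings[doc_index, ], document_loadings[x, ])
    }
  }, numeric(1))
  which.max(sims)
}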
# let's find the closest term to housekeeper (term 50)
term_test <- 50
all_terms[term_test]
find_closest_term(term_test)
# closest term is 'home', very cool!
# let's try with feelings, term 150
term_test <- 150
all_terms[term_test]
find_closest_term(term_test)
# attachment is the answer, seems like it's working pretty well :)
## comparing documents 1 and 1000
calc_cosine_sim(document_loadings[1, ], document_loadings[1000, ]) # result is 0.20
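# (Added usage of the helper defined above) which book chunk is most similar
# to document 1?
document_term$document[1]
document_term$document[find_closest_document(1)]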