Latent Semantic Analysis with SVD
# Goal: to explore document and term loadings with SVD, Latent Semantic Analysis
# Following the procedures outlined in
# https://sites.socsci.uci.edu/~lpearl/courses/readings/Evangelopoulos2013_LatentSemAnalysis.pdf

library(janeaustenr)
library(dplyr)
library(tidyr)
library(tidytext)
library(stringr)

# create the document-term matrix dataframe, normalized by tf-idf
# documents defined in chunks of 50 lines
document_term <- janeaustenr::austen_books() |>
  group_by(book) |>
  mutate(line = row_number()) |>
  mutate(chunk = (line %/% 50) + 1) |>
  ungroup() |>
  unnest_tokens(word, text) |>
  count(book, chunk, word) |>
  anti_join(stop_words, by = "word") |>
  filter(!str_detect(word, "[0-9]")) |>
  mutate(document = paste(book, chunk, sep = "_")) |>
  select(-book, -chunk) |>
  bind_tf_idf(word, document, n) |>
  select(document, word, tf_idf) |>
  replace_na(list(tf_idf = 0)) |>
  filter(is.finite(tf_idf)) |>
  pivot_wider(id_cols = c("document"), names_from = "word",
              values_from = "tf_idf", values_fill = 0,
              names_repair = "minimal")
# term names are the column names of the document-term matrix
# (dropping the leading "document" id column)
all_terms <- document_term |> names()
all_terms <- all_terms[2:length(all_terms)]
svd_result <- document_term |>
  select(-document) |>
  svd()
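# svd() returns the decomposition X = U D V^T of the (documents x terms)
# matrix: u has one row per document, v one row per term, and d holds the
# singular values. An illustrative peek at the pieces (not in the original):
str(svd_result, max.level = 1)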
# Extract singular values
singular_values <- svd_result$d
# Calculate the variance explained by each singular value
variance_explained <- (singular_values^2) / sum(singular_values^2)
# Calculate the cumulative percentage of variance explained
cumulative_variance_explained <- cumsum(variance_explained) * 100
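# One way to use the cumulative variance (a sketch, not part of the original
# procedure): find the smallest number of dimensions that reaches a target
# share, e.g. 50%, and compare it against the fixed k = 100 chosen below.
target_pct <- 50
k_suggested <- which(cumulative_variance_explained >= target_pct)[1]
k_suggested
plot(cumulative_variance_explained, type = "l",
     xlab = "number of singular values",
     ylab = "cumulative % of variance explained")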
# let's take the first 100 dimensions (k = 100)
k <- 100
# Extract the first k singular values
singular_values_k <- svd_result$d[1:k]
# Extract the first k columns of u and v
u_k <- svd_result$u[, 1:k]
v_k <- svd_result$v[, 1:k]
# Calculate term and document loadings; the matrix is documents x terms,
# so u corresponds to documents and v to terms
document_loadings <- u_k %*% diag(singular_values_k) # m x k matrix where m = number of documents
term_loadings <- v_k %*% diag(singular_values_k) # n x k matrix where n = number of terms
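# Sanity check on the shapes (illustrative): term_loadings should have one row
# per term and document_loadings one row per document.
dim(term_loadings)     # length(all_terms) x k
dim(document_loadings) # number of documents x k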
calc_cosine_sim <- function(A, B) {
  a_denom <- sum(A^2) |> sqrt()
  b_denom <- sum(B^2) |> sqrt()
  numerator <- sum(A*B)
  numerator/(a_denom*b_denom)
}
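# Quick check that the cosine similarity behaves as expected (illustrative):
# a vector is perfectly similar to itself, and orthogonal vectors score 0.
calc_cosine_sim(c(1, 2, 3), c(1, 2, 3)) # 1
calc_cosine_sim(c(1, 0), c(0, 1))       # 0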
find_closest_term <- function(term_index) {
  result <- vapply(seq_len(nrow(term_loadings)), \(x) {
    if (x == term_index) { # don't want to match against the term itself
      0.0
    } else {
      calc_cosine_sim(term_loadings[term_index, ], term_loadings[x, ])
    }
  }, numeric(1))
  all_terms[which.max(result)]
}
# let's find the closest term to housekeeper (term 50)
term_test <- 50
all_terms[term_test]
find_closest_term(term_test)
# closest term is 'home', very cool!
# let's try with feelings, term 150
term_test <- 150
all_terms[term_test]
find_closest_term(term_test)
# attachment is the answer, seems like it's working pretty well :)
## comparing documents 1 and 1000
calc_cosine_sim(document_loadings[1, ], document_loadings[1000, ]) # result is 0.20
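# Optional extension (a sketch, not in the original gist): the same nearest-
# neighbour idea works for documents, mapping the best match back to its
# book_chunk label via the `document` column kept in document_term.
all_documents <- document_term$document
find_closest_document <- function(doc_index) {
  sims <- vapply(seq_len(nrow(document_loadings)), \(x) {
    if (x == doc_index) 0.0 # skip the document itself
    else calc_cosine_sim(document_loadings[doc_index, ], document_loadings[x, ])
  }, numeric(1))
  all_documents[which.max(sims)]
}
find_closest_document(1) # e.g. which chunk is most similar to the first chunk?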