sergiospagnuolo · November 23, 2022 21:34
diff --git a/wordcloud_gen_agenda.R b/wordcloud_gen_agenda.R
 suppressMessages(library(odbc))
 suppressMessages(library(DBI))
 suppressMessages(library(RPostgreSQL))
 suppressMessages(library(tidyverse))
 suppressMessages(library(tidytext))
 suppressMessages(library(wordcloud2))
 suppressMessages(library(lubridate))
 library(config)
 library(ngram)

 # Log the batch start. The 3-hour subtraction presumably shifts a UTC server
 # clock to Brasilia time (UTC-3) -- TODO confirm the host timezone.
 print(paste("Batch Wordcloud iniciado - ", Sys.time() - hours(3)))

 # Open the PostgreSQL connection used by every later step of the script.
 # NOTE(review): host, user and password are blank here (presumably redacted);
 # they should come from configuration rather than being hard-coded.
 db <- dbConnect(RPostgres::Postgres(),
                 dbname = "postgres",
                 host = "",
                 port = 5432,
                 user = "",
                 password = "")

 # Fetch the schedule entries dated within the last 720 days, newest first.
 d <- dbGetQuery(db, 'SELECT "id", "title", "date", "authorityId"
                     FROM "new_agenda-transparente"."scraped_schedule"
                     WHERE "date" > NOW()::date - INTERVAL \'720 days\'
                     ORDER BY "date" DESC')

 # Stopwords ----

 # Stopword table read from a published Google Sheets CSV. Its `word` column is
 # what `unigrama()` filters tokens against.
 stopwords_pt <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv", header = TRUE)

 # Bigram stoplist: the `bigram` column of a second sheet. In the visible code
 # only the commented-out bigram pipeline refers to it.
 bigram_stop <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv", header = TRUE) %>% .$bigram

 #################################
 ############ GERA OS BIGRAMAS
 #################################
 # bigrama <- function(d) {
 #   c <- d %>%
 #     unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
 #     separate(bigram, c("word1", "word2"), sep = " ") %>%
 #     filter(
 #       !word1 %in% stopwords_pt$word,         # remove stopwords from both words in bi-gram
 #       !word2 %in% stopwords_pt$word,
 #       !str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
 #       !str_detect(word2, pattern = "[[:digit:]]"),
 #       !str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
 #       !str_detect(word2, pattern = "[[:punct:]]"),
 #       !str_detect(word1, pattern = "(.)\\1{2,}"),  # removes any words with 3 or more repeated letters
 #       !str_detect(word2, pattern = "(.)\\1{2,}"),
 #       !str_detect(word1, pattern = "\\b(.)\\b"),   # removes any remaining single letter words
 #       !str_detect(word2, pattern = "\\b(.)\\b")
 #     ) %>%
 #     unite("bigram", c(word1, word2), sep = " ") %>%
 #     count(bigram, authorityId) %>%
 #     filter(n >= 2) %>%
 #     #slice_max(n, n = 100) %>%
 #     #filter(!bigram %in% bigram_stop) %>%
 #     mutate(bigram = str_to_upper(bigram)) %>%
 #     arrange(desc(n)) %>%
 #     rename(freq = n, words = bigram)
 # }


 #' Count single-word (unigram) frequencies in schedule titles.
 #'
 #' Tokenises the `title` column into unigrams, discards unwanted tokens,
 #' counts the remaining tokens per `authorityId` and keeps those seen at
 #' least 6 times.
 #'
 #' Reads the stopword list from the script-level `stopwords_pt$word`.
 #'
 #' @param d A data frame with (at least) the columns `title` and
 #'   `authorityId`.
 #' @return A data frame with the columns `words` (upper-cased token),
 #'   `authorityId` and `freq` (occurrence count), sorted by descending
 #'   `freq`.
 unigrama <- function(d) {
   d %>%
     unnest_tokens(unigram, title, token = "ngrams", n = 1) %>%
     filter(
       !unigram %in% stopwords_pt$word,                # drop stopwords
       !str_detect(unigram, pattern = "[[:digit:]]"),  # drop tokens containing digits
       !str_detect(unigram, pattern = "[[:punct:]]"),  # drop tokens containing punctuation
       !str_detect(unigram, pattern = "(.)\\1{2,}"),   # drop tokens with a character repeated 3+ times in a row
       !str_detect(unigram, pattern = "\\b(.)\\b")     # drop single-character words
     ) %>%
     count(unigram, authorityId) %>%
     filter(n >= 6) %>%
     # slice_max(n, n = 100) %>%
     # A second stopword filter used to sit here, but it compared tokens with
     # the whole `stopwords_pt` data frame instead of its `word` column and so
     # removed nothing; stopwords are already excluded by the filter above.
     mutate(unigram = str_to_upper(unigram)) %>%
     arrange(desc(n)) %>%
     rename(freq = n, words = unigram)
 }

 #################################
 ############ GERA O DATA FRAME
 #################################
 # Split the schedule rows by authority, run `unigrama()` on each group and
 # row-bind the per-authority results into one data frame.
 a <- d %>%
   group_split(authorityId) %>%
   purrr::map_df(unigrama)
   # purrr::map_df(bigrama)

 # a <- a %>% filter(freq > 15)
 # set.seed(seed = 14412)
 # a$id <- as.vector(sample(1000000000, size = nrow(a), replace = TRUE))
 # a <- a %>% select(id, words, authorityId, freq)

 # Write to the database ----

 # Drop any leftover staging table before writing a fresh one.
 dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')

 # Stage the freshly computed frequencies in a scratch table.
 dbWriteTable(db, 'wordcloud_agendas_temp', a, row.names = FALSE)

 # Replace the contents of the main table with the staged rows. The DELETE and
 # the INSERT run inside one transaction so that a failing INSERT rolls the
 # DELETE back instead of leaving public.wordcloud_agendas empty.
 dbWithTransaction(db, {
   dbExecute(db, '
           DELETE FROM public.wordcloud_agendas;
           ')

   dbExecute(db, '
           INSERT INTO public.wordcloud_agendas (words,"authorityId",freq)
           SELECT "words", "authorityId", "freq"::integer
           FROM public.wordcloud_agendas_temp;
           ')
 })

 # Drop the staging table now that its rows have been copied.
 dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')

 # Close the database connection ----

 dbDisconnect(db)

 # NOTE(review): this clears every object in the calling environment, not only
 # the ones created by this script; kept to preserve the existing behaviour.
 rm(list = ls())

 # Log the batch end, using the same 3-hour clock shift as the start message.
 print(paste("Batch Wordcloud encerrado - ", Sys.time() - hours(3)))
	suppressMessages(library(odbc))
	suppressMessages(library(DBI))
	suppressMessages(library(RPostgreSQL))
	suppressMessages(library(tidyverse))
	suppressMessages(library(tidytext))
	suppressMessages(library(wordcloud2))
	suppressMessages(library(lubridate))
	library(config)
	library(ngram)

	# Log the batch start. The 3-hour subtraction presumably shifts a UTC server
	# clock to Brasilia time (UTC-3) -- TODO confirm the host timezone.
	print(paste("Batch Wordcloud iniciado - ", Sys.time() - hours(3)))

	# Open the PostgreSQL connection used by every later step of the script.
	# NOTE(review): host, user and password are blank here (presumably redacted);
	# they should come from configuration rather than being hard-coded.
	db <- dbConnect(RPostgres::Postgres(),
	                dbname = "postgres",
	                host = "",
	                port = 5432,
	                user = "",
	                password = "")

	# Fetch the schedule entries dated within the last 720 days, newest first.
	d <- dbGetQuery(db, 'SELECT "id", "title", "date", "authorityId"
	FROM "new_agenda-transparente"."scraped_schedule"
	WHERE "date" > NOW()::date - INTERVAL \'720 days\'
	ORDER BY "date" DESC')

	# Stopwords ----

	# Stopword table read from a published Google Sheets CSV. Its `word` column is
	# what `unigrama()` filters tokens against.
	stopwords_pt <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=1009958428&single=true&output=csv", header = TRUE)

	# Bigram stoplist: the `bigram` column of a second sheet. In the visible code
	# only the commented-out bigram pipeline refers to it.
	bigram_stop <- read.csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vRutQtQqbFVYYP8uwytSyewxtxn19smtWWxsoNai9G6uEg6ytF7Z4IVhYZ5rXx4bgN-IYkSnsF8bSAe/pub?gid=58185156&single=true&output=csv", header = TRUE) %>% .$bigram

	#################################
	############ GERA OS BIGRAMAS
	#################################
	# bigrama <- function(d) {
	# c <- d %>%
	# unnest_tokens(bigram, title, token = "ngrams", n = 2) %>%
	# separate(bigram, c("word1", "word2"), sep = " ") %>%
	# filter(
	# !word1 %in% stopwords_pt$word, # remove stopwords from both words in bi-gram
	# !word2 %in% stopwords_pt$word,
	# !str_detect(word1, pattern = "[[:digit:]]"), # removes any words with numeric digits
	# !str_detect(word2, pattern = "[[:digit:]]"),
	# !str_detect(word1, pattern = "[[:punct:]]"), # removes any remaining punctuations
	# !str_detect(word2, pattern = "[[:punct:]]"),
	# !str_detect(word1, pattern = "(.)\\1{2,}"), # removes any words with 3 or more repeated letters
	# !str_detect(word2, pattern = "(.)\\1{2,}"),
	# !str_detect(word1, pattern = "\\b(.)\\b"), # removes any remaining single letter words
	# !str_detect(word2, pattern = "\\b(.)\\b")
	# ) %>%
	# unite("bigram", c(word1, word2), sep = " ") %>%
	# count(bigram, authorityId) %>%
	# filter(n >= 2) %>%
	# #slice_max(n, n = 100) %>%
	# #filter(!bigram %in% bigram_stop) %>%
	# mutate(bigram = str_to_upper(bigram)) %>%
	# arrange(desc(n)) %>%
	# rename(freq = n, words = bigram)
	# }


	#' Count single-word (unigram) frequencies in schedule titles.
	#'
	#' Tokenises the `title` column into unigrams, discards unwanted tokens,
	#' counts the remaining tokens per `authorityId` and keeps those seen at
	#' least 6 times.
	#'
	#' Reads the stopword list from the script-level `stopwords_pt$word`.
	#'
	#' @param d A data frame with (at least) the columns `title` and
	#'   `authorityId`.
	#' @return A data frame with the columns `words` (upper-cased token),
	#'   `authorityId` and `freq` (occurrence count), sorted by descending
	#'   `freq`.
	unigrama <- function(d) {
	  d %>%
	    unnest_tokens(unigram, title, token = "ngrams", n = 1) %>%
	    filter(
	      !unigram %in% stopwords_pt$word,                # drop stopwords
	      !str_detect(unigram, pattern = "[[:digit:]]"),  # drop tokens containing digits
	      !str_detect(unigram, pattern = "[[:punct:]]"),  # drop tokens containing punctuation
	      !str_detect(unigram, pattern = "(.)\\1{2,}"),   # drop tokens with a character repeated 3+ times in a row
	      !str_detect(unigram, pattern = "\\b(.)\\b")     # drop single-character words
	    ) %>%
	    count(unigram, authorityId) %>%
	    filter(n >= 6) %>%
	    # slice_max(n, n = 100) %>%
	    # A second stopword filter used to sit here, but it compared tokens with
	    # the whole `stopwords_pt` data frame instead of its `word` column and so
	    # removed nothing; stopwords are already excluded by the filter above.
	    mutate(unigram = str_to_upper(unigram)) %>%
	    arrange(desc(n)) %>%
	    rename(freq = n, words = unigram)
	}

	#################################
	############ GERA O DATA FRAME
	#################################
	# Split the schedule rows by authority, run `unigrama()` on each group and
	# row-bind the per-authority results into one data frame.
	a <- d %>%
	  group_split(authorityId) %>%
	  purrr::map_df(unigrama)
	  # purrr::map_df(bigrama)

	# a <- a %>% filter(freq > 15)
	# set.seed(seed = 14412)
	# a$id <- as.vector(sample(1000000000, size = nrow(a), replace = TRUE))
	# a <- a %>% select(id, words, authorityId, freq)

	# Write to the database ----

	# Drop any leftover staging table before writing a fresh one.
	dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')

	# Stage the freshly computed frequencies in a scratch table.
	dbWriteTable(db, 'wordcloud_agendas_temp', a, row.names = FALSE)

	# Replace the contents of the main table with the staged rows. The DELETE and
	# the INSERT run inside one transaction so that a failing INSERT rolls the
	# DELETE back instead of leaving public.wordcloud_agendas empty.
	dbWithTransaction(db, {
	  dbExecute(db, '
	  DELETE FROM public.wordcloud_agendas;
	  ')

	  dbExecute(db, '
	  INSERT INTO public.wordcloud_agendas (words,"authorityId",freq)
	  SELECT "words", "authorityId", "freq"::integer
	  FROM public.wordcloud_agendas_temp;
	  ')
	})

	# Drop the staging table now that its rows have been copied.
	dbExecute(db, 'DROP TABLE IF EXISTS wordcloud_agendas_temp CASCADE')

	# Close the database connection ----

	dbDisconnect(db)

	# NOTE(review): this clears every object in the calling environment, not only
	# the ones created by this script; kept to preserve the existing behaviour.
	rm(list = ls())

	# Log the batch end, using the same 3-hour clock shift as the start message.
	print(paste("Batch Wordcloud encerrado - ", Sys.time() - hours(3)))