Latent GOLD exports data in a rather odd way: decimal values keep their comma separators inside a comma-separated data file, so the field delimiter is ambiguous. This script recovers the data by extracting it row by row.
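As a made-up illustration of the problem (the numbers here are hypothetical, not taken from a real export): a line such as 0,997973,1,6864e-25 encodes the two values 0.997973 and 1.6864e-25, yet splitting on the delimiter produces four ambiguous fragments.

strsplit("0,997973,1,6864e-25", ",")[[1]]
#> [1] "0"        "997973"   "1"        "6864e-25"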
#===============================================================================
# 2022-07-12
# Extract data from Latent GOLD within R
# Fabio Votta (@favstats)
#===============================================================================
library(tidyverse)
library(data.table)
setwd(here::here())
## Function that finds comma-separated decimal numbers in a garbled field
identify_commas <- function(x) {
  # x <- "0,997973"
  nums <- str_split(x, ",") %>%
    unlist()
  # fragments with >= 2 characters are the decimal/exponent part of a number;
  # glue each one back onto the single-character integer part preceding it.
  # initialise first so inputs without any long fragment (e.g. "0") do not error
  science_part <- numeric(0)
  if(any(str_count(nums) >= 2)){
    science_part <- as.numeric(paste0(nums[which(diff(str_count(nums) >= 2)==1)], ".", nums[str_count(nums) >= 2]))
  }
  which_are_not_science <- which(str_count(nums) == 1)
  # drop single-character fragments that are really the integer part of a decimal
  which_are_not_science_t <- which_are_not_science %>%
    discard(~{magrittr::is_in(.x, which(diff(str_count(nums) >= 2)==1))})
  which_are_science <- c(which(diff(str_count(nums) >= 2)==1))
  science_dat <- tibble(type = "science", value = science_part) %>%
    mutate_all(as.character)
  if(nrow(science_dat)!=0){
    science_dat <- science_dat %>%
      mutate(id = 1:n())
  }
  if(nrow(science_dat)>=2){
    science_dat <- science_dat %>% mutate(type = paste0(type, id))
  }
  not_science_dat <- tibble(type = "not_science", value = nums[which_are_not_science_t]) %>%
    mutate_all(as.character)
  if(nrow(not_science_dat)!=0){
    not_science_dat <- not_science_dat %>%
      mutate(id = 1:n())
  }
  if(nrow(not_science_dat)>=2){
    not_science_dat <- not_science_dat %>% mutate(type = paste0(type, id))
  }
  # put the recovered numbers back into their original order within the row
  ordaaa <- c(not_science = which_are_not_science_t, science = which_are_science) %>%
    sort() %>%
    as.data.frame() %>%
    rownames_to_column("type") %>%
    set_names(c("type", "order")) %>%
    left_join(bind_rows(science_dat, not_science_dat), by = "type") %>%
    as_tibble() %>%
    mutate(value = as.numeric(value))
  ordaaa$value
}
# examples <- c("0,8,23379e-295", "4,89544e-13,1,40608e-117",
#               "0,00409152", "1,6864e-25", "0,1,96165e-284,3", "0,997973", "0,23")
#
# debugonce(identify_commas)
#
# identify_commas("0")
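## Spot checks for the examples above (expected values inferred by hand from
## the splitting logic, not verified against a real Latent GOLD export):
# identify_commas("0,8,23379e-295")    # c(0, 8.23379e-295)
# identify_commas("0,1,96165e-284,3")  # c(0, 1.96165e-284, 3)
# identify_commas("0,23")              # 0.23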
## Parser function: applies identify_commas() to every row
parse_dat <- function(x, verbose = TRUE) {
  x %>%
    # slice(1:19) %>%
    split(1:nrow(.)) %>%
    map_dfr(~{
      if(verbose){
        # print progress as a percentage of rows processed
        counter <- paste0(.x$internal_id/nrow(x)*100) %>%
          as.numeric() %>%
          round(2) %>%
          format() %>%
          paste0("%")
        print(counter)
      }
      fin <- .x %>%
        select(contains("clu")) %>%
        flatten_chr() %>%
        na.omit() %>% as.character() %>%
        discard(~magrittr::equals(.x, "")) %>%
        map(identify_commas) %>%
        unlist() %>%
        tibble() %>%
        t() %>%
        as_tibble() %>%
        # the last recovered value per row is the cluster assignment
        set_names(c(paste0("cluster", 1:(ncol(.)-1)), "cluster"))
      if(verbose){
        print(fin)
      }
      return(fin)
    })
}
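## Minimal usage sketch for parse_dat() with a made-up input (assumes the
## garbled columns are character and named "clu*", as select(contains("clu"))
## expects, plus an internal_id column for the progress counter):
# toy <- tibble(
#   clu1 = c("0,997973,4,89544e-13", "0,00409152,0,23"),
#   clu2 = c("1", "2"),
#   internal_id = 1:2
# )
# parse_dat(toy, verbose = FALSE)
# #> a 2-row tibble with columns cluster1, cluster2, cluster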
## Function that reads in the .dat file and extracts the clusters
fix_it_all <- function(filepath, verbose = TRUE) {
  yo <- read_lines(filepath)
  # swap the comma in front of the clu* columns for a space so that
  # fread() can read the file with space as the field separator
  yo %>%
    str_replace_all(",clu", " clu") %>%
    write_lines(file = "fix.txt", sep = "\n")
  raw_txt <- data.table::fread("fix.txt", sep = " ") %>%
    janitor::clean_names() %>%
    mutate(internal_id = 1:n()) %>%
    # strip trailing commas left over from the original delimiter
    mutate_all(~ifelse(str_ends(.x, ","), str_sub(.x, 1, nchar(.x)-1), .x))
  fin <- parse_dat(raw_txt, verbose)
  return(fin)
}
## Warning: this takes a *very* long time
soweit_so_gut <- fix_it_all("../../../Downloads/data7.dat")
write_csv(soweit_so_gut, file = "soweit_so_gut.csv")
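## Optional sanity checks on the recovered data (hypothetical; assumes
## fix_it_all() completed without errors):
# soweit_so_gut %>% count(cluster)   # distribution of recovered cluster assignments
# summary(soweit_so_gut)             # ranges of the recovered values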