Latent GOLD exports data in a rather odd way: decimal values keep their comma separators inside a comma-separated data file, so the field delimiter is ambiguous. This script recovers the data by extracting it row by row.
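As a made-up illustration of the problem (the numbers here are hypothetical, not taken from a real export): a line such as 0,997973,1,6864e-25 encodes the two values 0.997973 and 1.6864e-25, yet splitting on the delimiter produces four ambiguous fragments.

strsplit("0,997973,1,6864e-25", ",")[[1]]
#> [1] "0"        "997973"   "1"        "6864e-25"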
#===============================================================================
# 2022-07-12
# Extract data from Latent GOLD within R
# Fabio Votta (@favstats)
#===============================================================================
library(tidyverse)
library(data.table)
setwd(here::here())
## Function that finds comma-separated decimal numbers in a garbled field
identify_commas <- function(x) {
  # x <- "0,997973"
  nums <- str_split(x, ",") %>%
    unlist()
  # fragments with >= 2 characters are the decimal/exponent part of a number;
  # glue each one back onto the single-character integer part preceding it.
  # initialise first so inputs without any long fragment (e.g. "0") do not error
  science_part <- numeric(0)
  if(any(str_count(nums) >= 2)){
    science_part <- as.numeric(paste0(nums[which(diff(str_count(nums) >= 2)==1)], ".", nums[str_count(nums) >= 2]))
  }
  which_are_not_science <- which(str_count(nums) == 1)
  # drop single-character fragments that are really the integer part of a decimal
  which_are_not_science_t <- which_are_not_science %>%
    discard(~{magrittr::is_in(.x, which(diff(str_count(nums) >= 2)==1))})
  which_are_science <- c(which(diff(str_count(nums) >= 2)==1))
  science_dat <- tibble(type = "science", value = science_part) %>%
    mutate_all(as.character)
  if(nrow(science_dat)!=0){
    science_dat <- science_dat %>%
      mutate(id = 1:n())
  }
  if(nrow(science_dat)>=2){
    science_dat <- science_dat %>% mutate(type = paste0(type, id))
  }
  not_science_dat <- tibble(type = "not_science", value = nums[which_are_not_science_t]) %>%
    mutate_all(as.character)
  if(nrow(not_science_dat)!=0){
    not_science_dat <- not_science_dat %>%
      mutate(id = 1:n())
  }
  if(nrow(not_science_dat)>=2){
    not_science_dat <- not_science_dat %>% mutate(type = paste0(type, id))
  }
  # put the recovered numbers back into their original order within the row
  ordaaa <- c(not_science = which_are_not_science_t, science = which_are_science) %>%
    sort() %>%
    as.data.frame() %>%
    rownames_to_column("type") %>%
    set_names(c("type", "order")) %>%
    left_join(bind_rows(science_dat, not_science_dat), by = "type") %>%
    as_tibble() %>%
    mutate(value = as.numeric(value))
  ordaaa$value
}
# examples <- c("0,8,23379e-295", "4,89544e-13,1,40608e-117",
#               "0,00409152", "1,6864e-25", "0,1,96165e-284,3", "0,997973", "0,23")
#
# debugonce(identify_commas)
#
# identify_commas("0")
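## Spot checks for the examples above (expected values inferred by hand from
## the splitting logic, not verified against a real Latent GOLD export):
# identify_commas("0,8,23379e-295")    # c(0, 8.23379e-295)
# identify_commas("0,1,96165e-284,3")  # c(0, 1.96165e-284, 3)
# identify_commas("0,23")              # 0.23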
## Parser function: applies identify_commas() to every row
parse_dat <- function(x, verbose = TRUE) {
  x %>%
    # slice(1:19) %>%
    split(1:nrow(.)) %>%
    map_dfr(~{
      if(verbose){
        # print progress as a percentage of rows processed
        counter <- paste0(.x$internal_id/nrow(x)*100) %>%
          as.numeric() %>%
          round(2) %>%
          format() %>%
          paste0("%")
        print(counter)
      }
      fin <- .x %>%
        select(contains("clu")) %>%
        flatten_chr() %>%
        na.omit() %>% as.character() %>%
        discard(~magrittr::equals(.x, "")) %>%
        map(identify_commas) %>%
        unlist() %>%
        tibble() %>%
        t() %>%
        as_tibble() %>%
        # the last recovered value per row is the cluster assignment
        set_names(c(paste0("cluster", 1:(ncol(.)-1)), "cluster"))
      if(verbose){
        print(fin)
      }
      return(fin)
    })
}
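## Minimal usage sketch for parse_dat() with a made-up input (assumes the
## garbled columns are character and named "clu*", as select(contains("clu"))
## expects, plus an internal_id column for the progress counter):
# toy <- tibble(
#   clu1 = c("0,997973,4,89544e-13", "0,00409152,0,23"),
#   clu2 = c("1", "2"),
#   internal_id = 1:2
# )
# parse_dat(toy, verbose = FALSE)
# #> a 2-row tibble with columns cluster1, cluster2, cluster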
## Function that reads in the .dat file and extracts the clusters
fix_it_all <- function(filepath, verbose = TRUE) {
  yo <- read_lines(filepath)
  # swap the comma in front of the clu* columns for a space so that
  # fread() can read the file with space as the field separator
  yo %>%
    str_replace_all(",clu", " clu") %>%
    write_lines(file = "fix.txt", sep = "\n")
  raw_txt <- data.table::fread("fix.txt", sep = " ") %>%
    janitor::clean_names() %>%
    mutate(internal_id = 1:n()) %>%
    # strip trailing commas left over from the original delimiter
    mutate_all(~ifelse(str_ends(.x, ","), str_sub(.x, 1, nchar(.x)-1), .x))
  fin <- parse_dat(raw_txt, verbose)
  return(fin)
}
## Warning: this takes a *very* long time
soweit_so_gut <- fix_it_all("../../../Downloads/data7.dat")
write_csv(soweit_so_gut, file = "soweit_so_gut.csv")
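## Optional sanity checks on the recovered data (hypothetical; assumes
## fix_it_all() completed without errors):
# soweit_so_gut %>% count(cluster)   # distribution of recovered cluster assignments
# summary(soweit_so_gut)             # ranges of the recovered values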