Created
August 19, 2015 20:36
-
-
Save cdeterman/6c151f1b3f56ee8aaf3f to your computer and use it in GitHub Desktop.
load the airlines dataset with bigmemory
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Download every yearly data file (1987-2008) that is not already present in
## the working directory.
for (year in 1987:2008) {
  file.name <- paste(year, "csv.bz2", sep = ".")
  if (!file.exists(file.name)) {
    url.text <- paste0("http://stat-computing.org/dataexpo/2009/",
                       year, ".csv.bz2")
    cat("Downloading missing data file ", file.name, "\n", sep = "")
    ## mode = "wb": the archives are binary; a text-mode transfer would
    ## corrupt them on Windows.
    status <- tryCatch(
      download.file(url.text, file.name, mode = "wb"),
      error = function(e) {
        ## Remove any partial file so the file.exists() check above does not
        ## mistake it for a complete download on the next run.
        unlink(file.name)
        stop(e)
      }
    )
    ## Some download methods report failure through a non-zero return value
    ## instead of signalling an error.
    if (status != 0L) {
      unlink(file.name)
      stop("Download of ", url.text, " failed with status ", status,
           call. = FALSE)
    }
  }
}
## Read sample file to get column names and types.
## stringsAsFactors = TRUE is passed explicitly: since R 4.0.0 read.csv() keeps
## strings as character by default, which would leave factor.columns all FALSE
## and the factor-level table empty.
d <- read.csv("2008.csv.bz2", stringsAsFactors = TRUE)
## Named logical vectors, one entry per column of the sample file.
integer.columns <- vapply(d, is.integer, logical(1))
factor.columns <- vapply(d, is.factor, logical(1))
## drop = FALSE keeps a data frame even when exactly one column is a factor,
## so lapply() always iterates over columns rather than over elements.
factor.levels <- lapply(d[, factor.columns, drop = FALSE], levels)
n.rows <- 0L
## Process each file determining the factor levels
## TODO: Combine with next loop
for (year in 1987:2008) {
  file.name <- paste(year, "csv.bz2", sep = ".")
  cat("Processing ", file.name, "\n", sep = "")
  ## Explicit stringsAsFactors = TRUE so levels() works on R >= 4.0.0, where
  ## read.csv() no longer converts strings to factors by default.
  d <- read.csv(file.name, stringsAsFactors = TRUE)
  n.rows <- n.rows + NROW(d)
  ## drop = FALSE keeps a data frame when only one column is selected.
  new.levels <- lapply(d[, factor.columns, drop = FALSE], levels)
  ## seq_along() is safe when factor.levels is empty (seq(1, 0) is c(1, 0)).
  for (i in seq_along(factor.levels)) {
    ## union() appends only levels not seen before, so each level appears
    ## once and its position (the integer code used later) stays stable.
    factor.levels[[i]] <- union(factor.levels[[i]], new.levels[[i]])
  }
  ## Release the year's data before reading the next file.
  rm(d)
  gc()
}
## Persist the column typing and the level tables needed to decode the
## integer codes written by the conversion step.
save(integer.columns, factor.columns, factor.levels, file = "factors.RData")
## Now convert all factors to integers so we can create a bigmatrix of the data
col.classes <- rep("integer", length(integer.columns))
col.classes[factor.columns] <- "character"
cols <- which(factor.columns)
first <- TRUE
csv.file <- "airlines.csv" # Write combined integer-only data to this file
csv.con <- file(csv.file, open = "w")
## tryCatch(finally = ) guarantees the connection is closed even if reading or
## writing one of the yearly files fails part-way through.
tryCatch({
  for (year in 1987:2008) {
    file.name <- paste(year, "csv.bz2", sep = ".")
    cat("Processing ", file.name, "\n", sep = "")
    d <- read.csv(file.name, colClasses = col.classes)
    ## Convert the strings to integers: each value becomes its position in
    ## the corresponding factor.levels entry.
    for (i in seq_along(factor.levels)) {
      col <- cols[i]
      codes <- match(d[[col]], factor.levels[[i]])
      ## match() silently yields NA for values missing from the level table;
      ## report that instead of losing data without notice.
      unmatched <- is.na(codes) & !is.na(d[[col]])
      if (any(unmatched)) {
        warning(sum(unmatched), " value(s) in column ", names(d)[col],
                " of ", file.name, " have no known level; written as NA",
                call. = FALSE)
      }
      d[[col]] <- codes
    }
    ## Only the first file contributes the header row.
    write.table(d, file = csv.con, sep = ",",
                row.names = FALSE, col.names = first)
    first <- FALSE
    rm(d)
    gc()
  }
}, finally = close(csv.con))
## Build a file-backed big.matrix from the combined integer-only CSV.
## read.big.matrix() comes from the bigmemory package, which the script never
## attached; load it here so the call resolves in a fresh session.
library(bigmemory)
backing.file <- "airlines.bin"
descriptor.file <- "airlines.des"
## extraCols reserves one additional column named "age" in the matrix.
data <- read.big.matrix("airlines.csv", header = TRUE,
                        type = "integer",
                        backingfile = backing.file,
                        descriptorfile = descriptor.file,
                        extraCols = "age")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment