cdeterman · August 19, 2015 20:36
diff --git a/airline_data_test.R b/airline_data_test.R
 ## Make sure the archive for every year in 1987:2008 is present in the
 ## working directory, downloading whichever ones are missing.
 for (year in 1987:2008) {
     file.name <- paste0(year, ".csv.bz2")
     if (!file.exists(file.name)) {
         url.text <- paste0("http://stat-computing.org/dataexpo/2009/",
                            file.name)
         cat("Downloading missing data file ", file.name, "\n", sep = "")
         ## The archives are bzip2-compressed, so request a binary transfer
         ## explicitly; a text-mode transfer on Windows rewrites line endings
         ## and corrupts the file.
         status <- tryCatch(
             download.file(url.text, file.name, mode = "wb"),
             error = function(e) {
                 ## Remove any partial file so the file.exists() check above
                 ## retries the download on the next run.
                 unlink(file.name)
                 stop(e)
             }
         )
         ## Some download methods report failure through a non-zero status
         ## rather than by signalling an error.
         if (!identical(as.integer(status), 0L)) {
             unlink(file.name)
             stop("Download failed for ", url.text, " (status ", status, ")",
                  call. = FALSE)
         }
     }
 }

 ## Read sample file to get column names and types.
 ## stringsAsFactors is passed explicitly: since R 4.0.0 read.csv() keeps
 ## strings as character by default, in which case is.factor() would be FALSE
 ## for every column and no factor levels would be collected.
 d <- read.csv("2008.csv.bz2", stringsAsFactors = TRUE)
 integer.columns <- vapply(d, is.integer, logical(1))
 factor.columns  <- vapply(d, is.factor, logical(1))
 ## drop = FALSE keeps a data frame (so lapply() walks columns) even when
 ## exactly one column is a factor.
 factor.levels   <- lapply(d[, factor.columns, drop = FALSE], levels)
 n.rows <- 0L

 ## Process each file determining the factor levels
 ## TODO: Combine with next loop
 for (year in 1987:2008) {
     file.name <- paste0(year, ".csv.bz2")
     cat("Processing ", file.name, "\n", sep = "")
     d <- read.csv(file.name, stringsAsFactors = TRUE)
     n.rows <- n.rows + NROW(d)
     new.levels <- lapply(d[, factor.columns, drop = FALSE], levels)
     ## seq_along() is empty when there are no factor columns, where
     ## seq(1, 0) would iterate over c(1, 0).
     for (i in seq_along(factor.levels)) {
         ## union() appends only levels not seen before, so the vectors stay
         ## duplicate-free (2008 is read both as the sample and in this loop).
         factor.levels[[i]] <- union(factor.levels[[i]], new.levels[[i]])
     }
     ## Release the data frame before reading the next file.
     rm(d)
     gc()
 }
 ## Persist the column masks and the level vectors; the level vectors are the
 ## lookup tables for any integer codes derived from them.
 save(integer.columns, factor.columns, factor.levels, file = "factors.RData")

 ## Now convert all factors to integers so we can create a bigmatrix of the data
 ## Factor columns are read back as character; every other column as integer.
 col.classes <- rep("integer", length(integer.columns))
 col.classes[factor.columns] <- "character"
 cols  <- which(factor.columns)
 first <- TRUE
 csv.file <- "airlines.csv"   # Write combined integer-only data to this file
 csv.con  <- file(csv.file, open = "w")

 ## The finally clause closes the connection even when reading or writing one
 ## of the files fails, so the handle is never leaked.
 tryCatch({
     for (year in 1987:2008) {
         file.name <- paste0(year, ".csv.bz2")
         cat("Processing ", file.name, "\n", sep = "")
         d <- read.csv(file.name, colClasses = col.classes)
         ## Convert the strings to integers: each value becomes its position
         ## in the matching factor.levels vector (NA when it is not found).
         ## seq_along() makes this a no-op when there are no factor columns.
         for (i in seq_along(factor.levels)) {
             col <- cols[i]
             d[[col]] <- match(d[[col]], factor.levels[[i]])
         }
         ## Only the first file contributes the header row.
         write.table(d, file = csv.con, sep = ",",
                     row.names = FALSE, col.names = first)
         first <- FALSE
         rm(d)
         gc()
     }
 }, finally = close(csv.con))

 ## File names handed to read.big.matrix() as backingfile / descriptorfile.
 descriptor.file <- "airlines.des"
 backing.file    <- "airlines.bin"

 ## Load the combined CSV (with its header row) as an integer big.matrix.
 ## NOTE(review): extraCols presumably reserves an additional column named
 ## "age" -- confirm against the bigmemory documentation.
 data <- read.big.matrix(
     "airlines.csv",
     type           = "integer",
     header         = TRUE,
     extraCols      = "age",
     descriptorfile = descriptor.file,
     backingfile    = backing.file
 )
	## Make sure the archive for every year in 1987:2008 is present in the
	## working directory, downloading whichever ones are missing.
	for (year in 1987:2008) {
	    file.name <- paste0(year, ".csv.bz2")
	    if (!file.exists(file.name)) {
	        url.text <- paste0("http://stat-computing.org/dataexpo/2009/",
	                           file.name)
	        cat("Downloading missing data file ", file.name, "\n", sep = "")
	        ## The archives are bzip2-compressed, so request a binary transfer
	        ## explicitly; a text-mode transfer on Windows rewrites line
	        ## endings and corrupts the file.
	        status <- tryCatch(
	            download.file(url.text, file.name, mode = "wb"),
	            error = function(e) {
	                ## Remove any partial file so the file.exists() check
	                ## above retries the download on the next run.
	                unlink(file.name)
	                stop(e)
	            }
	        )
	        ## Some download methods report failure through a non-zero status
	        ## rather than by signalling an error.
	        if (!identical(as.integer(status), 0L)) {
	            unlink(file.name)
	            stop("Download failed for ", url.text, " (status ", status, ")",
	                 call. = FALSE)
	        }
	    }
	}

	## Read sample file to get column names and types.
	## stringsAsFactors is passed explicitly: since R 4.0.0 read.csv() keeps
	## strings as character by default, in which case is.factor() would be
	## FALSE for every column and no factor levels would be collected.
	d <- read.csv("2008.csv.bz2", stringsAsFactors = TRUE)
	integer.columns <- vapply(d, is.integer, logical(1))
	factor.columns <- vapply(d, is.factor, logical(1))
	## drop = FALSE keeps a data frame (so lapply() walks columns) even when
	## exactly one column is a factor.
	factor.levels <- lapply(d[, factor.columns, drop = FALSE], levels)
	n.rows <- 0L

	## Process each file determining the factor levels
	## TODO: Combine with next loop
	for (year in 1987:2008) {
	    file.name <- paste0(year, ".csv.bz2")
	    cat("Processing ", file.name, "\n", sep = "")
	    d <- read.csv(file.name, stringsAsFactors = TRUE)
	    n.rows <- n.rows + NROW(d)
	    new.levels <- lapply(d[, factor.columns, drop = FALSE], levels)
	    ## seq_along() is empty when there are no factor columns, where
	    ## seq(1, 0) would iterate over c(1, 0).
	    for (i in seq_along(factor.levels)) {
	        ## union() appends only levels not seen before, so the vectors
	        ## stay duplicate-free (2008 is read both as the sample and in
	        ## this loop).
	        factor.levels[[i]] <- union(factor.levels[[i]], new.levels[[i]])
	    }
	    ## Release the data frame before reading the next file.
	    rm(d)
	    gc()
	}
	## Persist the column masks and the level vectors; the level vectors are
	## the lookup tables for any integer codes derived from them.
	save(integer.columns, factor.columns, factor.levels, file = "factors.RData")

	## Now convert all factors to integers so we can create a bigmatrix of the data
	## Factor columns are read back as character; every other column as integer.
	col.classes <- rep("integer", length(integer.columns))
	col.classes[factor.columns] <- "character"
	cols <- which(factor.columns)
	first <- TRUE
	csv.file <- "airlines.csv" # Write combined integer-only data to this file
	csv.con <- file(csv.file, open = "w")

	## The finally clause closes the connection even when reading or writing
	## one of the files fails, so the handle is never leaked.
	tryCatch({
	    for (year in 1987:2008) {
	        file.name <- paste0(year, ".csv.bz2")
	        cat("Processing ", file.name, "\n", sep = "")
	        d <- read.csv(file.name, colClasses = col.classes)
	        ## Convert the strings to integers: each value becomes its
	        ## position in the matching factor.levels vector (NA when it is
	        ## not found). seq_along() makes this a no-op when there are no
	        ## factor columns.
	        for (i in seq_along(factor.levels)) {
	            col <- cols[i]
	            d[[col]] <- match(d[[col]], factor.levels[[i]])
	        }
	        ## Only the first file contributes the header row.
	        write.table(d, file = csv.con, sep = ",",
	                    row.names = FALSE, col.names = first)
	        first <- FALSE
	        rm(d)
	        gc()
	    }
	}, finally = close(csv.con))

	## File names handed to read.big.matrix() as backingfile / descriptorfile.
	descriptor.file <- "airlines.des"
	backing.file <- "airlines.bin"

	## Load the combined CSV (with its header row) as an integer big.matrix.
	## NOTE(review): extraCols presumably reserves an additional column named
	## "age" -- confirm against the bigmemory documentation.
	data <- read.big.matrix(
	    "airlines.csv",
	    type = "integer",
	    header = TRUE,
	    extraCols = "age",
	    descriptorfile = descriptor.file,
	    backingfile = backing.file
	)