-
-
Save johndharrison/8008477 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
library(XML) | |
library(plyr) | |
library(pbapply) | |
xlsxToR <- function(file, keep_sheets = NULL, header = FALSE) { | |
temp_dir <- file.path(tempdir(), "xlsxToRtemp") | |
suppressWarnings(dir.create(temp_dir)) | |
file.copy(file, temp_dir) | |
new_file <- list.files(temp_dir, full.name = TRUE, pattern = basename(file)) | |
unzip(new_file, exdir = temp_dir) | |
# Get OS | |
# These lines are included because R documentation states that Excel handles | |
# date origins differently on Mac than on Windows. However, manual inspection | |
# of Excel files created on Windows and Mac indicated that in fact the origin | |
# is handled the same across both platforms. I've kept the original code here | |
# commented out in case it can be of use in the future. | |
# mac <- xmlToList(xmlParse(list.files( | |
# paste0(temp_dir, "/docProps"), full.name = TRUE, pattern = "app.xml"))) | |
# mac <- grepl("Macintosh", mac$Application) | |
# if(mac) { | |
# os_origin <- "1899-12-30" # documentation says should be "1904-01-01" | |
# } else { | |
# os_origin <- "1899-12-30" | |
# } | |
# Get names of sheets | |
sheet_names <- xmlToList(xmlParse(list.files( | |
paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "workbook.xml"))) | |
sheet_names <- rbind.fill(lapply(sheet_names$sheets, function(x) { | |
as.data.frame(as.list(x), stringsAsFactors = FALSE) | |
})) | |
rownames(sheet_names) <- NULL | |
sheet_names <- as.data.frame(sheet_names,stringsAsFactors = FALSE) | |
sheet_names$id <- gsub("\\D", "", sheet_names$id) | |
# Get column classes | |
styles <- xmlParse(list.files( | |
paste0(temp_dir, "/xl"), full.name = TRUE, pattern = "styles.xml")) | |
styles <- xpathApply(styles, "//x:xf[@applyNumberFormat and @numFmtId]", | |
namespaces = "x", xmlAttrs) | |
styles <- lapply(styles, function(x) { | |
x[grepl("applyNumberFormat|numFmtId", names(x))]}) | |
styles <- do.call("rbind", (lapply(styles, | |
function(x) as.data.frame(as.list(x[c("applyNumberFormat", "numFmtId")]), | |
stringsAsFactors = FALSE)))) | |
if(!is.null(keep_sheets)) { | |
sheet_names <- sheet_names[sheet_names$name %in% keep_sheets,] | |
} | |
worksheet_paths <- list.files( | |
paste0(temp_dir, "/xl/worksheets"), | |
full.name = TRUE, | |
pattern = paste0( | |
"sheet(", | |
paste(sheet_names$id, collapse = "|"), | |
")\\.xml$")) | |
worksheets <- lapply(worksheet_paths, function(x) xmlRoot(xmlParse(x))[["sheetData"]]) | |
worksheets <- pblapply(seq_along(worksheets), function(i) { | |
x <- xpathApply(worksheets[[i]], "//x:c", namespaces = "x", function(node) { | |
c("v" = xmlValue(node[["v"]]), xmlAttrs(node)) | |
}) | |
if(length(x) > 0) { | |
x_rows <- unlist(lapply(seq_along(x), function(i) rep(i, length(x[[i]])))) | |
x <- unlist(x) | |
x <- reshape( | |
data.frame( | |
"row" = x_rows, | |
"ind" = names(x), | |
"value" = x, | |
stringsAsFactors = FALSE), | |
idvar = "row", timevar = "ind", direction = "wide") | |
x$sheet <- sheet_names[sheet_names$id == i, "name"] | |
colnames(x) <- gsub("^value\\.", "", colnames(x)) | |
} | |
x | |
}) | |
worksheets <- do.call("rbind.fill", | |
worksheets[sapply(worksheets, class) == "data.frame"]) | |
entries <- xmlParse(list.files(paste0(temp_dir, "/xl"), full.name = TRUE, | |
pattern = "sharedStrings.xml$")) | |
entries <- xpathSApply(entries, "//x:si", namespaces = "x", xmlValue) | |
names(entries) <- seq_along(entries) - 1 | |
entries_match <- entries[ | |
match(worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)], | |
names(entries))] | |
worksheets$v[worksheets$t == "s" & !is.na(worksheets$t)] <- entries_match | |
worksheets$cols <- match(gsub("\\d", "", worksheets$r), LETTERS) | |
worksheets$rows <- as.numeric(gsub("\\D", "", worksheets$r)) | |
if(!any(grepl("^s$", colnames(worksheets)))) { | |
worksheets$s <- NA | |
} | |
workbook <- lapply(unique(worksheets$sheet), function(x) { | |
y <- worksheets[worksheets$sheet == x,] | |
y_style <- as.data.frame(tapply(y$s, list(y$rows, y$cols), identity), | |
stringsAsFactors = FALSE) | |
y <- as.data.frame(tapply(y$v, list(y$rows, y$cols), identity), | |
stringsAsFactors = FALSE) | |
if(header) { | |
colnames(y) <- y[1,] | |
y <- y[-1,] | |
y_style <- y_style[-1,] | |
} | |
y_style <- sapply(y_style, function(x) { | |
out <- names(which.max(table(x))) | |
out[is.null(out)] <- NA | |
out | |
}) | |
if(length(styles) > 0) { | |
y_style <- styles$numFmtId[match(y_style, styles$applyNumberFormat)] | |
} | |
y_style[y_style %in% 14:17] <- "date" | |
y_style[y_style %in% c(18:21, 45:47)] <- "time" | |
y_style[y_style %in% 22] <- "datetime" | |
y_style[is.na(y_style) & !sapply(y, function(x)any(grepl("\\D", x)))] <- "numeric" | |
y_style[is.na(y_style)] <- "character" | |
y_style[!(y_style %in% c("date", "time", "datetime", "numeric"))] <- "character" | |
y[] <- lapply(seq_along(y), function(i) { | |
switch(y_style[i], | |
character = y[,i], | |
numeric = as.numeric(y[,i]), | |
date = as.Date(as.numeric(y[,i]), origin = os_origin), | |
time = strftime(as.POSIXct(as.numeric(y[,i]), origin = os_origin), format = "%H:%M:%S"), | |
datetime = as.POSIXct(as.numeric(y[,i]), origin = os_origin)) | |
}) | |
y | |
}) | |
if(length(workbook) == 1) { | |
workbook <- workbook[[1]] | |
} else { | |
names(workbook) <- sheet_names$name | |
} | |
workbook | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment