Last active
July 16, 2019 09:40
-
-
Save hrbrmstr/be3bf6e2b7e8b06648fd to your computer and use it in GitHub Desktop.
test for and/or remove byte order mark in R
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#' Tests whether a raw httr response or character vector has a byte order mark (BOM) | |
#' | |
#' \href{https://en.wikipedia.org/wiki/Byte_order_mark}{What is a byte order mark?} | |
#' | |
#' @param resp \code{httr} response object or a character vector | |
#' @param encoding can be one of \code{UTF-8}, \code{UTF-16} or \code{UTF-16BE}. | |
#' Although a BOM could be used with UTF-32 and other encodings, such | |
#' encodings are rarely used for data transmission and the three supported | |
#' encodings are the most likely ones folks in R will be working with from | |
#' web APIs.\cr\cr | |
#' This function defaults to looking for \code{UTF-8} BOM, but you can | |
#' override it. | |
#' @return \code{TRUE} if response contains a BOM, \code{NA} if an unsupported encoding | |
#' was passed (along with a message) | |
#' @export | |
#' @references \href{http://www.unicode.org/faq/utf_bom.html}{UTF-8, UTF-16, UTF-32 & BOM} | |
#' @author @@hrbrmstr | |
#' @examples | |
#' library(httr) | |
#' foo <- GET("http://www.claritin.com/webservice/allergyforecast.php?zip=33956") | |
#' has_bom(foo) | |
#' | |
#' cfoo <- content(foo, as="text") | |
#' has_bom(cfoo) | |
#' | |
#' has_bom(GET("http://google.com")) | |
has_bom <- function(resp, encoding="UTF-8") { | |
if (inherits(resp, "response")) { | |
F <- resp$content[1:4] | |
switch(encoding, | |
`UTF-8`=F[1]==as.raw(0xef) & F[2]==as.raw(0xbb) & F[3]==as.raw(0xbf), | |
`UTF-16`=F[1]==as.raw(0xff) & F[2]==as.raw(0xfe), | |
`UTF-16BE`=F[1]==as.raw(0xfe) & F[2]==as.raw(0xff), | |
{ message("Unsupported encoding") ; return(NA) } | |
) | |
} else if (inherits(resp, "character")) { | |
switch(encoding, | |
`UTF-8`=grepl("^", resp[1]), | |
`UTF-16`=grepl("^ÿþ", resp[1]), | |
`UTF-16BE`=grepl("^þÿ", resp[1]), | |
{ message("Unsupported encoding") ; return(NA) } | |
) | |
} else { | |
message("Expected either an httr::response object or a character") | |
return(NA) | |
} | |
} | |
#' Remove byte order mark (BOM) from \code{httr::response} object or character vector | |
#' | |
#' Performs the equivalent of \code{content(resp, type="text")} but removes the | |
#' BOM before doing so. If a BOM is not detected, the function will return an error. | |
#' | |
#' \href{https://en.wikipedia.org/wiki/Byte_order_mark}{What is a byte order mark?} | |
#' | |
#' @param resp \code{httr} response object or a character vector | |
#' @return Converted content | |
#' @export | |
#' @references \href{http://www.unicode.org/faq/utf_bom.html}{UTF-8, UTF-16, UTF-32 & BOM} | |
#' @author @@hrbrmstr | |
#' @examples | |
#' library(httr) | |
#' foo <- GET("http://www.claritin.com/webservice/allergyforecast.php?zip=33956") | |
#' sans_bom(foo) | |
sans_bom <- function(resp) { | |
if (inherits(resp, "response")) { | |
F <- resp$content[1:4] | |
if (F[1]==as.raw(0xef) & F[2]==as.raw(0xbb) & F[3]==as.raw(0xbf)) { | |
iconv(readBin(resp$content[4:length(resp$content)], character()), from="UTF-8", to="UTF-8") | |
} else if (F[1]==as.raw(0xff) & F[2]==as.raw(0xfe)) { | |
iconv(readBin(resp$content[3:length(resp$content)], character()), from="UTF-16", to="UTF-8") | |
} else if (F[1]==as.raw(0xfe) & F[2]==as.raw(0xff)) { | |
iconv(readBin(resp$content[3:length(resp$content)], character()), from="UTF-16BE", to="UTF-8") | |
} else { | |
stop("Did not detect a BOM in the httr::response object content.", call.=FALSE) | |
} | |
} else if (inherits(resp, "character")) { | |
if (grepl("^", resp[1])) { | |
iconv(readBin(sub("^", "", resp), character()), from="UTF-8", to="UTF-8") | |
} else if (grepl("^ÿþ", resp[1])) { | |
iconv(readBin(sub("^ÿþ", "", resp), character()), from="UTF-16", to="UTF-8") | |
} else if (grepl("^þÿ", resp[1])) { | |
iconv(readBin(sub("^þÿ", "", resp), character()), from="UTF-16BE", to="UTF-8") | |
} else { | |
stop("Did not detect a BOM in the content.", call.=FALSE) | |
} | |
} else { | |
stop("Expected either an httr::response object or a character", call.=FALSE) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment