Created
November 19, 2013 09:40
-
-
Save brianmario/7542814 to your computer and use it in GitHub Desktop.
A simple binary and text encoding detector wrapping libicu's encoding detection API
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package main

/*
#cgo CFLAGS: -I/opt/boxen/homebrew/opt/icu4c/include
#cgo LDFLAGS: -L/opt/boxen/homebrew/opt/icu4c/lib -licui18n -licuuc
#include <stdlib.h>
#include <unicode/ucsdet.h>
*/
import "C"

import (
	"bytes"
	"fmt"
	"io"
	"os"
	"strings"
	"unsafe"
)
// Detection holds the result of a single charset detection as filled in by
// DetectEncoding.
type Detection struct {
	// Encoding is the charset name reported by ucsdet_getName.
	Encoding string
	// Confidence is the value reported by ucsdet_getConfidence
	// (ICU documents it as a 0-100 score; higher means more certain).
	Confidence int32
	// Language is the language code reported by ucsdet_getLanguage.
	// NOTE(review): ICU may leave this empty when it cannot tell; confirm
	// before relying on it being set.
	Language string
}
/* You'll obviously want to handle this other than exit(n) lol | |
* | |
* We can also get an error string back from the code, but it's not | |
* super helpful unfortunately. I think ideally we'd just have a map | |
* of ICU error codes to Go errors so we could quickly return an error | |
* or nil. | |
*/ | |
func checkStatus(status C.UErrorCode) { | |
if status > C.U_ZERO_ERROR { | |
fmt.Printf("Error %q\n", status) | |
os.Exit(int(status)) | |
} | |
} | |
// Bom pairs a Unicode byte-order-mark signature with the name of the
// encoding that signature announces.
type Bom struct {
	sig  []byte // raw BOM bytes expected at the very start of the data
	name string // encoding label for the signature, e.g. "UTF-8"
}
var validBoms = []*Bom{ | |
&Bom{[]byte("\xEF\xBB\xBF"), "UTF-8"}, | |
&Bom{[]byte("\xFE\xFF"), "UTF-16BE"}, | |
&Bom{[]byte("\xFF\xFE"), "UTF-16LE"}, | |
&Bom{[]byte("\x00\x00\xFE\xFF"), "UTF-32BE"}, | |
&Bom{[]byte("\xFF\xFE\x00\x00"), "UTF-32LE"}, | |
} | |
/* Public: An extremely basic binary detector. | |
* It's a little smarter than just checking for a null byte within | |
* the first 1kb but not by much. | |
* | |
* Returns a bool. true if binary, false if not. | |
*/ | |
func IsBinary(data string) bool { | |
// First check and see if it's got a BOM | |
for _, bom := range validBoms { | |
if bytes.HasPrefix([]byte(data), bom.sig) { | |
return false | |
} | |
} | |
// Next, check and see if there are any null bytes | |
if strings.ContainsRune(data[:1024], 0) { | |
return true | |
} | |
// If no null byte was found, it's pretty unlikely that it's binary | |
// but this is just a really dumb guess. | |
return false | |
} | |
/* Public: Detect the encoding of a string of text. | |
* This could maybe be refactored to run in it's own goroutine, | |
* but encoding detection is REALLY, really fast. Especially on a | |
* fixed size of data. And given the sizes we're talking about | |
* (1k or 32k or whatever) the speed should be pretty consitent. | |
* | |
* Also we don't need to allocate the entire Detection struct | |
* if all you really care about is the encoding name string. | |
* | |
* Returns a newly allocated Detection with it's Encoding, Confidence | |
* and Language fields set. | |
*/ | |
func DetectEncoding(data string) *Detection { | |
var status C.UErrorCode = C.U_ZERO_ERROR | |
var match *C.UCharsetMatch | |
// create a new detector and schedule it's free | |
detector := C.ucsdet_open(&status) | |
checkStatus(status) | |
defer C.ucsdet_close(detector) | |
// This kinda sucks because it makes a copy of input for use in C | |
// as a result we must free the string ourselves once we're done | |
cStr := C.CString(data) | |
defer C.free(unsafe.Pointer(cStr)) | |
cStrLen := C.int32_t(len(data)) | |
/** | |
* Set the input byte data whose charset is to detected. | |
* | |
* Ownership of the input text byte array remains with the caller. | |
* The input string must not be altered or deleted until the charset | |
* detector is either closed or reset to refer to different input text. | |
*/ | |
status = C.U_ZERO_ERROR | |
C.ucsdet_setText(detector, cStr, cStrLen, &status) | |
checkStatus(status) | |
/** Set the declared encoding for charset detection. | |
* The declared encoding of an input text is an encoding obtained | |
* by the user from an http header or xml declaration or similar source that | |
* can be provided as an additional hint to the charset detector. | |
*/ | |
// cEncHint := C.CString("UTF-8") | |
// defer C.free(unsafe.Pointer(cEncHint)) | |
// C.ucsdet_setDeclaredEncoding(detector, cEncHint, 5, &status) | |
/** | |
* Enable filtering of input text. If filtering is enabled, | |
* text within angle brackets ("<" and ">") will be removed | |
* before detection, which will remove most HTML or xml markup. | |
* | |
* the second parameter is a bool 1/0 for true/false | |
*/ | |
// C.ucsdet_enableInputFilter(detector, 1) | |
// perform the actual detection | |
status = C.U_ZERO_ERROR | |
match = C.ucsdet_detect(detector, &status) | |
checkStatus(status) | |
status = C.U_ZERO_ERROR | |
enc := C.ucsdet_getName(match, &status) | |
checkStatus(status) | |
status = C.U_ZERO_ERROR | |
lang := C.ucsdet_getLanguage(match, &status) | |
checkStatus(status) | |
detection := new(Detection) | |
detection.Encoding = C.GoString(enc) | |
detection.Confidence = int32(C.ucsdet_getConfidence(match, &status)) | |
detection.Language = C.GoString(lang) | |
return detection | |
} | |
/* This basically replicates what charlock_holmes does. | |
* 1. try and detect if the content is binary | |
* 2. if not, try and detect it's text encoding | |
* | |
* The main difference is that the binary detection isn't using | |
* libmagic here, like in charlock_holmes. That library is atrocious | |
* and I want to stop using it in charlock_holmes anyway so the | |
* binary detection code used here is what I was planning on doing | |
* in charlock to replace libmagic. It won't match as accurately in | |
* some cases but I'm hopeful it will for most. | |
*/ | |
func main() { | |
// this would come from blober or whatever | |
input := make([]byte, 1024) | |
_, _ = os.Stdin.Read(input) | |
binary := IsBinary(string(input)) | |
if binary { | |
fmt.Printf("binary: %v\n", binary) | |
} else { | |
detection := DetectEncoding(string(input)) | |
fmt.Printf("encoding: %s\n", detection.Encoding) | |
fmt.Printf("confience: %d\n", detection.Confidence) | |
fmt.Printf("language: %s\n", detection.Language) | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment