brianmario · November 19, 2013 09:40
diff --git a/detector.go b/detector.go
 package main

 /*
 #cgo CFLAGS: -I/opt/boxen/homebrew/opt/icu4c/include
 #cgo LDFLAGS: -L/opt/boxen/homebrew/opt/icu4c/lib -licui18n -licuuc
 #include <stdlib.h>
 #include <unicode/ucsdet.h>
 */
 import "C"

 import (
 	"fmt"
 	"unsafe"
 	"os"
 	"bytes"
 	"strings"
 )

 /* Detection is the result record produced by DetectEncoding.
 * Each field is copied from the corresponding ICU charset-match accessor.
 */
 type Detection struct {
 	Encoding   string // charset name, from ucsdet_getName
 	Confidence int32  // confidence value, from ucsdet_getConfidence
 	Language   string // language code, from ucsdet_getLanguage
 }

 /* checkStatus terminates the process when an ICU call reported a failure.
 *
 * status - the UErrorCode written by the preceding ICU call.
 *
 * Only codes greater than U_ZERO_ERROR count as failures; zero and anything
 * below it let execution continue. On failure the numeric code is printed
 * to stderr and also used as the process exit status.
 *
 * You'll obviously want to handle this other than exit(n).
 * We can also get an error string back from the code, but it's not
 * super helpful unfortunately. Ideally we'd just have a map of ICU error
 * codes to Go errors so we could quickly return an error or nil.
 */
 func checkStatus(status C.UErrorCode) {
 	if status <= C.U_ZERO_ERROR {
 		return
 	}

 	// Print with %d: the status is a number, and %q would render it as a
 	// quoted character literal instead of the code.
 	code := int(status)
 	fmt.Fprintf(os.Stderr, "Error %d\n", code)
 	os.Exit(code)
 }

 /* Bom pairs a byte-order-mark signature with the name of the encoding
 * it stands for.
 */
 type Bom struct {
 	sig  []byte // the raw signature bytes, matched against the start of the data
 	name string // encoding label for this signature, e.g. "UTF-8"
 }

 /* validBoms lists the byte-order marks that IsBinary treats as proof that
 * the data is text.
 *
 * Signatures that share a prefix are ordered longest first: the UTF-32LE
 * mark starts with the UTF-16LE mark, so a first-match scan over this table
 * would otherwise never reach the UTF-32LE entry.
 */
 var validBoms = []*Bom{
 	{sig: []byte("\xEF\xBB\xBF"), name: "UTF-8"},
 	{sig: []byte("\x00\x00\xFE\xFF"), name: "UTF-32BE"},
 	{sig: []byte("\xFF\xFE\x00\x00"), name: "UTF-32LE"},
 	{sig: []byte("\xFE\xFF"), name: "UTF-16BE"},
 	{sig: []byte("\xFF\xFE"), name: "UTF-16LE"},
 }

 /* Public: An extremely basic binary detector.
 * It's a little smarter than just checking for a null byte within
 * the first 1kb but not by much.
 *
 * data - the content to classify. Only its first 1kb is looked at; it may
 *        be shorter than that, or empty.
 *
 * Returns a bool. true if binary, false if not.
 */
 func IsBinary(data string) bool {
 	// Cap the inspected window at 1kb. Slicing data[:1024] unconditionally
 	// would panic on anything shorter.
 	head := data
 	if len(head) > 1024 {
 		head = head[:1024]
 	}

 	// First check and see if it's got a BOM. The window is converted to
 	// bytes once, outside the loop, instead of once per signature.
 	raw := []byte(head)
 	for _, bom := range validBoms {
 		if bytes.HasPrefix(raw, bom.sig) {
 			return false
 		}
 	}

 	// Next, check and see if there are any null bytes. If none was found
 	// it's pretty unlikely that it's binary, but this is just a really
 	// dumb guess.
 	return strings.ContainsRune(head, 0)
 }

 /* Public: Detect the encoding of a string of text.
 * This could maybe be refactored to run in its own goroutine,
 * but encoding detection is REALLY, really fast. Especially on a
 * fixed size of data. And given the sizes we're talking about
 * (1k or 32k or whatever) the speed should be pretty consistent.
 *
 * Also we don't need to allocate the entire Detection struct
 * if all you really care about is the encoding name string.
 *
 * data - the bytes to analyse. They are copied into C memory for the
 *        duration of the call.
 *
 * Every ICU status is passed to checkStatus, which exits the process
 * on failure.
 *
 * Returns a newly allocated Detection with its Encoding, Confidence
 * and Language fields set. If the detector produces no match at all the
 * returned Detection is left at its zero value.
 */
 func DetectEncoding(data string) *Detection {
 	var status C.UErrorCode = C.U_ZERO_ERROR

 	// This kinda sucks because it makes a copy of input for use in C
 	// as a result we must free the string ourselves once we're done.
 	// The free is deferred BEFORE the detector is opened: deferred calls
 	// run last-in-first-out, so the detector is closed first and the text
 	// stays alive for as long as the detector refers to it.
 	cStr := C.CString(data)
 	defer C.free(unsafe.Pointer(cStr))
 	cStrLen := C.int32_t(len(data))

 	// create a new detector and schedule its release
 	detector := C.ucsdet_open(&status)
 	checkStatus(status)
 	defer C.ucsdet_close(detector)

 	/**
 	 * Set the input byte data whose charset is to be detected.
 	 *
 	 * Ownership of the input text byte array remains with the caller.
 	 * The input string must not be altered or deleted until the charset
 	 * detector is either closed or reset to refer to different input text.
 	 */
 	status = C.U_ZERO_ERROR
 	C.ucsdet_setText(detector, cStr, cStrLen, &status)
 	checkStatus(status)

 	/** Set the declared encoding for charset detection.
 	 * The declared encoding of an input text is an encoding obtained
 	 * by the user from an http header or xml declaration or similar source that
 	 * can be provided as an additional hint to the charset detector.
 	 */
 	// cEncHint := C.CString("UTF-8")
 	// defer C.free(unsafe.Pointer(cEncHint))
 	// C.ucsdet_setDeclaredEncoding(detector, cEncHint, 5, &status)

 	/**
 	 * Enable filtering of input text. If filtering is enabled,
 	 * text within angle brackets ("<" and ">") will be removed
 	 * before detection, which will remove most HTML or xml markup.
 	 *
 	 * the second parameter is a bool 1/0 for true/false
 	 */
 	// C.ucsdet_enableInputFilter(detector, 1)

 	// perform the actual detection
 	status = C.U_ZERO_ERROR
 	match := C.ucsdet_detect(detector, &status)
 	checkStatus(status)
 	if match == nil {
 		// ucsdet_detect returns NULL when no charset matches the data.
 		// Hand back an empty result rather than passing NULL on to the
 		// accessors below.
 		return new(Detection)
 	}

 	status = C.U_ZERO_ERROR
 	enc := C.ucsdet_getName(match, &status)
 	checkStatus(status)

 	status = C.U_ZERO_ERROR
 	lang := C.ucsdet_getLanguage(match, &status)
 	checkStatus(status)

 	// Same reset-and-check as the other accessors, so a failure here is
 	// not silently turned into a confidence value.
 	status = C.U_ZERO_ERROR
 	confidence := C.ucsdet_getConfidence(match, &status)
 	checkStatus(status)

 	// C.GoString copies the C strings into Go memory; the result is built
 	// before the deferred close and free run.
 	return &Detection{
 		Encoding:   C.GoString(enc),
 		Confidence: int32(confidence),
 		Language:   C.GoString(lang),
 	}
 }

 /* This basically replicates what charlock_holmes does.
 *   1. try and detect if the content is binary
 *   2. if not, try and detect its text encoding
 *
 * The main difference is that the binary detection isn't using
 * libmagic here, like in charlock_holmes. That library is atrocious
 * and I want to stop using it in charlock_holmes anyway so the
 * binary detection code used here is what I was planning on doing
 * in charlock to replace libmagic. It won't match as accurately in
 * some cases but I'm hopeful it will for most.
 *
 * Input is a single read of up to 1024 bytes from stdin. The result is
 * printed to stdout; a read that yields no data at all is reported on
 * stderr and exits with status 1.
 */
 func main() {
 	// this would come from blober or whatever
 	input := make([]byte, 1024)
 	n, err := os.Stdin.Read(input)
 	if n == 0 && err != nil {
 		// Nothing was read, so there is nothing to classify.
 		fmt.Fprintf(os.Stderr, "reading stdin: %v\n", err)
 		os.Exit(1)
 	}

 	// IsBinary is handed the whole fixed-size buffer.
 	// NOTE(review): when fewer than 1024 bytes were read the rest of the
 	// buffer is zero bytes, so short input without a BOM is reported as
 	// binary. Passing input[:n] here is only safe if IsBinary accepts data
 	// shorter than 1kb - confirm before changing.
 	binary := IsBinary(string(input))
 	if binary {
 		fmt.Printf("binary: %v\n", binary)
 		return
 	}

 	// Only the bytes actually read go to the detector, so the buffer's
 	// zero padding cannot influence the guess.
 	detection := DetectEncoding(string(input[:n]))
 	fmt.Printf("encoding: %s\n", detection.Encoding)
 	fmt.Printf("confidence: %d\n", detection.Confidence)
 	fmt.Printf("language: %s\n", detection.Language)
 }
	package main

	/*
	#cgo CFLAGS: -I/opt/boxen/homebrew/opt/icu4c/include
	#cgo LDFLAGS: -L/opt/boxen/homebrew/opt/icu4c/lib -licui18n -licuuc
	#include <stdlib.h>
	#include <unicode/ucsdet.h>
	*/
	import "C"

	import (
	"fmt"
	"unsafe"
	"os"
	"bytes"
	"strings"
	)

	/* Detection is the result record produced by DetectEncoding.
	* Each field is copied from the corresponding ICU charset-match accessor.
	*/
	type Detection struct {
		Encoding   string // charset name, from ucsdet_getName
		Confidence int32  // confidence value, from ucsdet_getConfidence
		Language   string // language code, from ucsdet_getLanguage
	}

	/* checkStatus terminates the process when an ICU call reported a failure.
	*
	* status - the UErrorCode written by the preceding ICU call.
	*
	* Only codes greater than U_ZERO_ERROR count as failures; zero and anything
	* below it let execution continue. On failure the numeric code is printed
	* to stderr and also used as the process exit status.
	*
	* You'll obviously want to handle this other than exit(n).
	* We can also get an error string back from the code, but it's not
	* super helpful unfortunately. Ideally we'd just have a map of ICU error
	* codes to Go errors so we could quickly return an error or nil.
	*/
	func checkStatus(status C.UErrorCode) {
		if status <= C.U_ZERO_ERROR {
			return
		}

		// Print with %d: the status is a number, and %q would render it as a
		// quoted character literal instead of the code.
		code := int(status)
		fmt.Fprintf(os.Stderr, "Error %d\n", code)
		os.Exit(code)
	}

	/* Bom pairs a byte-order-mark signature with the name of the encoding
	* it stands for.
	*/
	type Bom struct {
		sig  []byte // the raw signature bytes, matched against the start of the data
		name string // encoding label for this signature, e.g. "UTF-8"
	}

	/* validBoms lists the byte-order marks that IsBinary treats as proof that
	* the data is text.
	*
	* Signatures that share a prefix are ordered longest first: the UTF-32LE
	* mark starts with the UTF-16LE mark, so a first-match scan over this table
	* would otherwise never reach the UTF-32LE entry.
	*/
	var validBoms = []*Bom{
		{sig: []byte("\xEF\xBB\xBF"), name: "UTF-8"},
		{sig: []byte("\x00\x00\xFE\xFF"), name: "UTF-32BE"},
		{sig: []byte("\xFF\xFE\x00\x00"), name: "UTF-32LE"},
		{sig: []byte("\xFE\xFF"), name: "UTF-16BE"},
		{sig: []byte("\xFF\xFE"), name: "UTF-16LE"},
	}

	/* Public: An extremely basic binary detector.
	* It's a little smarter than just checking for a null byte within
	* the first 1kb but not by much.
	*
	* data - the content to classify. Only its first 1kb is looked at; it may
	*        be shorter than that, or empty.
	*
	* Returns a bool. true if binary, false if not.
	*/
	func IsBinary(data string) bool {
		// Cap the inspected window at 1kb. Slicing data[:1024] unconditionally
		// would panic on anything shorter.
		head := data
		if len(head) > 1024 {
			head = head[:1024]
		}

		// First check and see if it's got a BOM. The window is converted to
		// bytes once, outside the loop, instead of once per signature.
		raw := []byte(head)
		for _, bom := range validBoms {
			if bytes.HasPrefix(raw, bom.sig) {
				return false
			}
		}

		// Next, check and see if there are any null bytes. If none was found
		// it's pretty unlikely that it's binary, but this is just a really
		// dumb guess.
		return strings.ContainsRune(head, 0)
	}

	/* Public: Detect the encoding of a string of text.
	* This could maybe be refactored to run in its own goroutine,
	* but encoding detection is REALLY, really fast. Especially on a
	* fixed size of data. And given the sizes we're talking about
	* (1k or 32k or whatever) the speed should be pretty consistent.
	*
	* Also we don't need to allocate the entire Detection struct
	* if all you really care about is the encoding name string.
	*
	* data - the bytes to analyse. They are copied into C memory for the
	*        duration of the call.
	*
	* Every ICU status is passed to checkStatus, which exits the process
	* on failure.
	*
	* Returns a newly allocated Detection with its Encoding, Confidence
	* and Language fields set. If the detector produces no match at all the
	* returned Detection is left at its zero value.
	*/
	func DetectEncoding(data string) *Detection {
		var status C.UErrorCode = C.U_ZERO_ERROR

		// This kinda sucks because it makes a copy of input for use in C
		// as a result we must free the string ourselves once we're done.
		// The free is deferred BEFORE the detector is opened: deferred calls
		// run last-in-first-out, so the detector is closed first and the text
		// stays alive for as long as the detector refers to it.
		cStr := C.CString(data)
		defer C.free(unsafe.Pointer(cStr))
		cStrLen := C.int32_t(len(data))

		// create a new detector and schedule its release
		detector := C.ucsdet_open(&status)
		checkStatus(status)
		defer C.ucsdet_close(detector)

		/**
		 * Set the input byte data whose charset is to be detected.
		 *
		 * Ownership of the input text byte array remains with the caller.
		 * The input string must not be altered or deleted until the charset
		 * detector is either closed or reset to refer to different input text.
		 */
		status = C.U_ZERO_ERROR
		C.ucsdet_setText(detector, cStr, cStrLen, &status)
		checkStatus(status)

		/** Set the declared encoding for charset detection.
		 * The declared encoding of an input text is an encoding obtained
		 * by the user from an http header or xml declaration or similar source that
		 * can be provided as an additional hint to the charset detector.
		 */
		// cEncHint := C.CString("UTF-8")
		// defer C.free(unsafe.Pointer(cEncHint))
		// C.ucsdet_setDeclaredEncoding(detector, cEncHint, 5, &status)

		/**
		 * Enable filtering of input text. If filtering is enabled,
		 * text within angle brackets ("<" and ">") will be removed
		 * before detection, which will remove most HTML or xml markup.
		 *
		 * the second parameter is a bool 1/0 for true/false
		 */
		// C.ucsdet_enableInputFilter(detector, 1)

		// perform the actual detection
		status = C.U_ZERO_ERROR
		match := C.ucsdet_detect(detector, &status)
		checkStatus(status)
		if match == nil {
			// ucsdet_detect returns NULL when no charset matches the data.
			// Hand back an empty result rather than passing NULL on to the
			// accessors below.
			return new(Detection)
		}

		status = C.U_ZERO_ERROR
		enc := C.ucsdet_getName(match, &status)
		checkStatus(status)

		status = C.U_ZERO_ERROR
		lang := C.ucsdet_getLanguage(match, &status)
		checkStatus(status)

		// Same reset-and-check as the other accessors, so a failure here is
		// not silently turned into a confidence value.
		status = C.U_ZERO_ERROR
		confidence := C.ucsdet_getConfidence(match, &status)
		checkStatus(status)

		// C.GoString copies the C strings into Go memory; the result is built
		// before the deferred close and free run.
		return &Detection{
			Encoding:   C.GoString(enc),
			Confidence: int32(confidence),
			Language:   C.GoString(lang),
		}
	}

	/* This basically replicates what charlock_holmes does.
	* 1. try and detect if the content is binary
	* 2. if not, try and detect its text encoding
	*
	* The main difference is that the binary detection isn't using
	* libmagic here, like in charlock_holmes. That library is atrocious
	* and I want to stop using it in charlock_holmes anyway so the
	* binary detection code used here is what I was planning on doing
	* in charlock to replace libmagic. It won't match as accurately in
	* some cases but I'm hopeful it will for most.
	*
	* Input is a single read of up to 1024 bytes from stdin. The result is
	* printed to stdout; a read that yields no data at all is reported on
	* stderr and exits with status 1.
	*/
	func main() {
		// this would come from blober or whatever
		input := make([]byte, 1024)
		n, err := os.Stdin.Read(input)
		if n == 0 && err != nil {
			// Nothing was read, so there is nothing to classify.
			fmt.Fprintf(os.Stderr, "reading stdin: %v\n", err)
			os.Exit(1)
		}

		// IsBinary is handed the whole fixed-size buffer.
		// NOTE(review): when fewer than 1024 bytes were read the rest of the
		// buffer is zero bytes, so short input without a BOM is reported as
		// binary. Passing input[:n] here is only safe if IsBinary accepts data
		// shorter than 1kb - confirm before changing.
		binary := IsBinary(string(input))
		if binary {
			fmt.Printf("binary: %v\n", binary)
			return
		}

		// Only the bytes actually read go to the detector, so the buffer's
		// zero padding cannot influence the guess.
		detection := DetectEncoding(string(input[:n]))
		fmt.Printf("encoding: %s\n", detection.Encoding)
		fmt.Printf("confidence: %d\n", detection.Confidence)
		fmt.Printf("language: %s\n", detection.Language)
	}