Created
June 4, 2012 16:05
-
-
Save gerdr/2869236 to your computer and use it in GitHub Desktop.
UTF-8 encoder
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
Copyright 2012 Gerhard R. <[email protected]>
Permission is granted to use, modify, and / or redistribute at will.
This includes removing authorship notices, re-use of code parts in
other software (with or without giving credit), and / or creating a
commercial product based on it.
This permission is not revocable by the author.
This software is provided as-is. Use it at your own risk. There is
no warranty whatsoever, neither expressed nor implied, and by using
this software you accept that the author(s) shall not be held liable
for any loss of data, loss of service, or other damages, be they
incidental or consequential. Your only option other than accepting
this is not to use the software at all.
*/
#include <stddef.h>
#include <stdint.h>

/*
    Encodes the code point `cp` as a UTF-8 byte sequence written to `bytes`.

    bytes: destination buffer; up to 4 bytes are written, so the caller must
           provide at least that much room. No terminator is appended.
    cp:    the code point to encode.

    Returns a pointer one past the last byte written, so calls can be chained
    to fill a buffer. Returns NULL, writing nothing, if `bytes` is NULL or if
    `cp` is not classified as an ordinary character (surrogates,
    noncharacters and values above 0x10FFFF are rejected).

    U+0000 is written as the two-byte sequence 0xC0 0x80 rather than a single
    zero byte.
*/
extern void *utf8_encode(void *bytes, uint32_t cp);

/*
    Flags returned by classify(). A non-zero result combines exactly one
    CP_* flag (what kind of code point it is) with exactly one U8_* flag
    (how many bytes its encoding occupies).
*/
enum
{
    CP_CHAR           = 1 << 0, /* ordinary, encodable code point */
    CP_LOW_SURROGATE  = 1 << 1, /* 0xDC00..0xDFFF */
    CP_HIGH_SURROGATE = 1 << 2, /* 0xD800..0xDBFF */
    CP_NONCHAR        = 1 << 3, /* 0xFDD0..0xFDEF and every ..FFFE / ..FFFF */
    CP_OVERFLOW       = 1 << 4, /* above 0x10FFFF but still fits 4 bytes */
    U8_SINGLE         = 1 << 5, /* 1-byte sequence */
    U8_DOUBLE         = 1 << 6, /* 2-byte sequence */
    U8_TRIPLE         = 1 << 7, /* 3-byte sequence */
    U8_QUAD           = 1 << 8  /* 4-byte sequence */
};

/*
    Classifies `cp`, returning a CP_* flag or-ed with a U8_* flag, or 0 if
    the value is above 0x1FFFFF and therefore does not fit a 4-byte sequence.
    The checks are ordered by ascending range, so each test may rely on all
    smaller values having been handled already.
*/
static unsigned classify(uint32_t cp)
{
    /* NOTE(review): zero is given the overlong two-byte form, as in
       "Modified UTF-8"; presumably so that the output never contains an
       embedded NUL byte -- confirm with callers before changing. */
    if(cp == 0)
        return CP_CHAR | U8_DOUBLE;

    if(cp <= 0x7F)
        return CP_CHAR | U8_SINGLE;

    if(cp <= 0x07FF)
        return CP_CHAR | U8_DOUBLE;

    if(0xD800 <= cp && cp <= 0xDBFF)
        return CP_HIGH_SURROGATE | U8_TRIPLE;

    if(0xDC00 <= cp && cp <= 0xDFFF)
        return CP_LOW_SURROGATE | U8_TRIPLE;

    if(0xFDD0 <= cp && cp <= 0xFDEF)
        return CP_NONCHAR | U8_TRIPLE;

    if(cp <= 0xFFFD)
        return CP_CHAR | U8_TRIPLE;

    if(cp == 0xFFFE || cp == 0xFFFF)
        return CP_NONCHAR | U8_TRIPLE;

    /* the last two code points of every supplementary plane */
    if(cp <= 0x10FFFF && ((cp & 0xFFFF) == 0xFFFE || (cp & 0xFFFF) == 0xFFFF))
        return CP_NONCHAR | U8_QUAD;

    if(cp <= 0x10FFFF)
        return CP_CHAR | U8_QUAD;

    if(cp <= 0x1FFFFF)
        return CP_OVERFLOW | U8_QUAD;

    return 0;
}

void *utf8_encode(void *bytes, uint32_t cp)
{
    unsigned cc = classify(cp);
    uint8_t *bp = bytes;

    /* nothing to write into, or not an ordinary character */
    if(bp == NULL || !(cc & CP_CHAR))
        return NULL;

    if(cc & U8_SINGLE)
    {
        bp[0] = (uint8_t)cp;
        return bp + 1;
    }

    /* Lead bytes are 0xC0 / 0xE0 / 0xF0 plus the top bits of cp; every
       following byte is 0x80 plus the next six bits. */
    if(cc & U8_DOUBLE)
    {
        bp[0] = (uint8_t)(0xC0 | (cp >> 6));
        bp[1] = (uint8_t)(0x80 | (cp & 0x3F));
        return bp + 2;
    }

    if(cc & U8_TRIPLE)
    {
        bp[0] = (uint8_t)(0xE0 | (cp >> 12));
        bp[1] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
        bp[2] = (uint8_t)(0x80 | (cp & 0x3F));
        return bp + 3;
    }

    if(cc & U8_QUAD)
    {
        bp[0] = (uint8_t)(0xF0 | (cp >> 18));
        bp[1] = (uint8_t)(0x80 | ((cp >> 12) & 0x3F));
        bp[2] = (uint8_t)(0x80 | ((cp >> 6) & 0x3F));
        bp[3] = (uint8_t)(0x80 | (cp & 0x3F));
        return bp + 4;
    }

    /* not reached: classify() always pairs CP_CHAR with a U8_* flag */
    return NULL;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment