Last active
January 30, 2018 19:00
-
-
Save zihengCat/1a80a31b671e5bb3db6eb7af5a4b30f7 to your computer and use it in GitHub Desktop.
UTF-8 (ANSI C)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*
 * Return the length, in bytes, of the UTF-8 sequence introduced by the
 * lead byte at *p_input, or -1 if that byte cannot start a sequence.
 *
 *   0xxxxxxx -->  1
 *   10xxxxxx --> -1  (continuation byte, invalid as a lead byte)
 *   110xxxxx -->  2
 *   1110xxxx -->  3
 *   11110xxx -->  4
 *   111110xx -->  5
 *   1111110x -->  6
 *   1111111x --> -1  (0xFE / 0xFF match none of the patterns above)
 *
 * Only the first byte is inspected; the continuation bytes that follow
 * are not read or validated here. A NULL p_input also yields -1.
 */
int get_utf8_size(const unsigned char *p_input) {
    unsigned char c;

    if (!p_input) {
        return -1;
    }
    c = *p_input; /* lead byte of the sequence */

    if (c < 0x80) { return 1; }
    if (c < 0xC0) { return -1; } /* stray continuation byte */
    if (c < 0xE0) { return 2; }
    if (c < 0xF0) { return 3; }
    if (c < 0xF8) { return 4; }
    if (c < 0xFC) { return 5; }
    if (c < 0xFE) { return 6; }
    return -1;                   /* 0xFE and 0xFF are never valid */
}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/*****************************************************************************
 * Convert one UTF-8 encoded character to its Unicode (UCS-4) code point.
 *
 * Parameters:
 *    p_input    points to the input buffer holding the UTF-8 bytes of the
 *               character
 *    p_output   points to the output value that receives the code point;
 *               it is set to 0 whenever decoding fails
 *
 * Return value:
 *    On success, the number of bytes the character occupies in UTF-8 (1..6).
 *    On failure, -1 (NULL argument, invalid lead byte, or a byte after the
 *    lead byte that is not a 10xxxxxx continuation byte).
 *
 * Notes:
 *    1. The code point is stored as an ordinary unsigned int value, so the
 *       result is correct regardless of the host byte order. On a
 *       little-endian machine the bytes in memory are laid out low byte
 *       first.
 *    2. 5- and 6-byte sequences carry up to 31 payload bits, so unsigned int
 *       is assumed to be at least 32 bits wide.
 *    3. Bytes are read one at a time and decoding stops at the first byte
 *       that is not a continuation byte, so a NUL terminator inside a
 *       truncated sequence is never stepped over.
 *    4. Overlong encodings and surrogate code points are not rejected.
 ****************************************************************************/
#define UTF8_MAX_BYTES 6

/* Defined alongside this function; declared here so the call below has a
 * visible prototype even when the two snippets are compiled separately. */
int get_utf8_size(const unsigned char *p_input);

int utf8_to_unicode(const unsigned char *p_input, unsigned int *p_output){
    /* Mask selecting the payload bits of the lead byte, indexed by the
     * sequence length; index 0 is unused. */
    static const unsigned char lead_mask[UTF8_MAX_BYTES + 1] = {
        0x00, 0x7F, 0x1F, 0x0F, 0x07, 0x03, 0x01
    };
    unsigned int code_point;
    int size;
    int i;

    if (!p_input || !p_output) {
        return -1;
    }
    *p_output = 0x0;

    size = get_utf8_size(p_input);
    if (size < 1 || size > UTF8_MAX_BYTES) {
        return -1;
    }
    /* 0xFE / 0xFF match no lead-byte pattern; reject them here as well so
     * the result does not depend on how the length helper treats them. */
    if (p_input[0] >= 0xFE) {
        return -1;
    }

    code_point = (unsigned int)(p_input[0] & lead_mask[size]);
    for (i = 1; i < size; i++) {
        unsigned char c = p_input[i];

        /* Every byte after the lead byte must look like 10xxxxxx. */
        if ((c & 0xC0) != 0x80) {
            return -1;
        }
        /* Each continuation byte contributes six more payload bits. */
        code_point = (code_point << 6) | (unsigned int)(c & 0x3F);
    }

    *p_output = code_point;
    return size;
}
#undef UTF8_MAX_BYTES
Author
zihengCat
commented
Aug 23, 2017
•
UTF-8 编码字节数 | Unicode 符号范围 | UTF-8 编码方式 |
---|---|---|
n (Bytes) | 十六进制 (Hexadecimal) | 二进制 (Binary) |
1 | 0000 0000 - 0000 007F | 0xxxxxxx |
2 | 0000 0080 - 0000 07FF | 110xxxxx 10xxxxxx |
3 | 0000 0800 - 0000 FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
4 | 0001 0000 - 001F FFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
5 | 0020 0000 - 03FF FFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
6 | 0400 0000 - 7FFF FFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment