Skip to content

Instantly share code, notes, and snippets.

@masakielastic
Last active July 9, 2025 05:59
Show Gist options
  • Save masakielastic/ac8363436d6e53318e902476baf7fc6a to your computer and use it in GitHub Desktop.
Save masakielastic/ac8363436d6e53318e902476baf7fc6a to your computer and use it in GitHub Desktop.
ICU4C で書記素クラスターを1つずつ表示する

ICU4C で書記素クラスターを1つずつ表示する

gcc -o show_graphemes show_graphemes.c $(pkg-config --cflags --libs icu-uc)
./show_graphemes
Original string: こんにちは👋é🇯🇵サンプル
----------------------------------
Iterating through grapheme clusters:
  Grapheme  1: こ
  Grapheme  2: ん
  Grapheme  3: に
  Grapheme  4: ち
  Grapheme  5: は
  Grapheme  6: 👋
  Grapheme  7: é
  Grapheme  8: 🇯🇵
  Grapheme  9: サ
  Grapheme 10: ン
  Grapheme 11: プ
  Grapheme 12: ル
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unicode/utypes.h>
#include <unicode/ubrk.h>
#include <unicode/ustring.h>
#include <unicode/uclean.h>
/*
 * Demo: split a UTF-8 string into grapheme clusters with ICU4C and print
 * each cluster on its own line.
 *
 * Returns 0 on success, or 1 if the UTF-8 -> UTF-16 conversion, the buffer
 * allocation, or the creation of the break iterator fails. A cluster that
 * cannot be converted back to UTF-8 is reported on stderr and skipped; it
 * does not change the exit status.
 */
int main(void) {
    // 1. Input string (UTF-8). Contains an emoji, an accented letter and a
    //    flag emoji in addition to plain kana.
    const char *utf8_string = "こんにちは👋é🇯🇵サンプル";

    // Everything released in the cleanup section is initialized up front so
    // that every exit path can jump there safely.
    UErrorCode status = U_ZERO_ERROR;
    UChar *uchar_text = NULL;
    UBreakIterator *bi = NULL;
    int32_t uchar_len = 0;
    int exit_code = 1;

    printf("Original string: %s\n", utf8_string);
    printf("----------------------------------\n");
    printf("Iterating through grapheme clusters:\n");

    // 2. Convert the UTF-8 input to the UChar (UTF-16) array ICU works on.
    //    First pass (preflight): a NULL destination only computes the length,
    //    so U_BUFFER_OVERFLOW_ERROR is the expected outcome here.
    u_strFromUTF8(NULL, 0, &uchar_len, utf8_string, -1, &status);
    if (status != U_BUFFER_OVERFLOW_ERROR && U_FAILURE(status)) {
        fprintf(stderr, "u_strFromUTF8 (preflight) failed: %s\n", u_errorName(status));
        goto cleanup;
    }
    status = U_ZERO_ERROR; // reset the error code before the real conversion

    // Second pass: allocate (with room for the terminator) and convert.
    uchar_text = malloc(sizeof *uchar_text * ((size_t)uchar_len + 1));
    if (!uchar_text) {
        fprintf(stderr, "Failed to allocate memory for UChar string.\n");
        goto cleanup;
    }
    u_strFromUTF8(uchar_text, uchar_len + 1, NULL, utf8_string, -1, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "u_strFromUTF8 failed: %s\n", u_errorName(status));
        goto cleanup;
    }

    // 3. Create a character (grapheme cluster) break iterator for "ja_JP".
    bi = ubrk_open(UBRK_CHARACTER, "ja_JP", uchar_text, uchar_len, &status);
    if (U_FAILURE(status)) {
        fprintf(stderr, "ubrk_open failed: %s\n", u_errorName(status));
        goto cleanup;
    }

    // 4. Walk the boundaries; [start, end) is one cluster in UChar units.
    int32_t start = ubrk_first(bi);
    int count = 1;
    for (int32_t end = ubrk_next(bi); end != UBRK_DONE; start = end, end = ubrk_next(bi)) {
        int32_t grapheme_len_uchar = end - start;

        // 5. Convert the cluster back to UTF-8 and print it.
        char utf8_buffer[128]; // large enough for typical clusters
        int32_t utf8_buffer_len = 0;
        status = U_ZERO_ERROR;
        u_strToUTF8(utf8_buffer, sizeof(utf8_buffer), &utf8_buffer_len,
                    uchar_text + start, grapheme_len_uchar, &status);
        if (U_SUCCESS(status)) {
            // U_SUCCESS also covers U_STRING_NOT_TERMINATED_WARNING (output
            // exactly fills the buffer, no NUL written), so print by explicit
            // length instead of relying on a terminator.
            printf(" Grapheme %2d: %.*s\n", count, (int)utf8_buffer_len, utf8_buffer);
        } else {
            // e.g. the cluster does not fit into utf8_buffer
            fprintf(stderr, "Could not convert grapheme to UTF-8: %s\n", u_errorName(status));
        }
        count++;
    }
    exit_code = 0;

cleanup:
    // 6. Release resources on every path.
    if (bi != NULL) {
        ubrk_close(bi);
    }
    free(uchar_text);
    // Release ICU's global resources; called once, after all ICU objects
    // owned by this program have been closed.
    u_cleanup();
    return exit_code;
}
#include <stdio.h>
#include <stdlib.h>
#include <unicode/ubrk.h>
#include <unicode/utext.h>
#include <unicode/uloc.h>
#include <unicode/ustring.h>
void print_each_grapheme(const char *str);
/* Entry point: prints the grapheme clusters of a fixed sample string. */
int main(void)
{
    /* The sample has a kanji followed by the code point U+E0101, written
     * as a universal character name. */
    print_each_grapheme("葛\U000E0101飾区");
    return 0;
}
void print_each_grapheme(const char *str)
{
UErrorCode status = U_ZERO_ERROR;
UText *ut = utext_openUTF8(NULL, str, -1, &status);
if (U_FAILURE(status)) {
fprintf(stderr, "utext_openUTF8 error: %s\n", u_errorName(status));
return;
}
UBreakIterator *bi = ubrk_open(UBRK_CHARACTER, uloc_getDefault(), NULL, 0, &status);
if (U_FAILURE(status)) {
fprintf(stderr, "ubrk_open error: %s\n", u_errorName(status));
utext_close(ut);
return;
}
ubrk_setUText(bi, ut, &status);
if (U_FAILURE(status)) {
fprintf(stderr, "ubrk_setUText error: %s\n", u_errorName(status));
ubrk_close(bi);
utext_close(ut);
return;
}
int32_t previous = ubrk_first(bi);
if (previous == UBRK_DONE) {
fprintf(stderr, "ubrk_first returned UBRK_DONE (empty string?)\n");
ubrk_close(bi);
utext_close(ut);
return;
}
int32_t current;
while ((current = ubrk_next(bi)) != UBRK_DONE) {
int32_t size = current - previous;
// UTF-8 の範囲外を保護
if (size > 0) {
printf("%.*s\n", size, str + previous);
} else {
fprintf(stderr, "Detected non-positive cluster size: %d (previous=%d, current=%d)\n", size, previous, current);
}
previous = current;
}
ubrk_close(bi);
utext_close(ut);
}
#include <stdio.h>
#include <unicode/uloc.h>
/* Prints the locale ID that uloc_getDefault() reports. */
int main(void) {
    printf("Default locale is: %s\n", uloc_getDefault());
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment