Skip to content

Instantly share code, notes, and snippets.

@Riey
Created January 2, 2021 13:08
Show Gist options
  • Save Riey/7c1724cc2c9cd10359af3b199f780606 to your computer and use it in GitHub Desktop.
Save Riey/7c1724cc2c9cd10359af3b199f780606 to your computer and use it in GitHub Desktop.
#include <cstdint>
#include <optional>
#include <string>
#include <vector>
#include <iostream>
// Non-owning, read-only view over a sequence of UTF-8 code units
// (std::u8string_view is a C++20 type).
using utf8 = std::u8string_view;
// NOTE: this alias names a *reference* to a const_iterator. A function that
// takes a `utf8_iter` parameter therefore receives the caller's iterator by
// non-const reference: it can advance that iterator in place, and the caller
// must pass an lvalue (a named iterator variable), not a temporary.
using utf8_iter = std::u8string_view::const_iterator&;
// Mask selecting the low six bits of a byte, i.e. the payload carried by a
// UTF-8 continuation byte (bit pattern 10xxxxxx).
// A typed, scoped constant instead of a preprocessor macro: it obeys normal
// name lookup and cannot textually rewrite unrelated code. The operand is
// still promoted to int in `byte & CONT_MASK`, exactly as with the literal.
constexpr uint8_t CONT_MASK = 0b0011'1111;
// Extracts the payload bits from the leading byte of a UTF-8 sequence.
//
// The mask is 0x7F shifted right by `width`, so `width` high-order marker
// bits (beyond the top bit) are stripped: width 2 keeps the low 5 bits,
// width 3 the low 4 bits, and so on.
//
// byte:  the leading byte of the sequence.
// width: how many bits to drop from the 7-bit mask; must be smaller than the
//        bit width of int, since it is used directly as a shift count.
// Returns the masked payload bits as a 32-bit value.
constexpr auto utf8_first_byte(uint8_t byte, uint32_t width) -> uint32_t {
    const uint32_t payload_mask = 0x7F >> width;
    return byte & payload_mask;
}
// Appends one continuation byte to a partially decoded code point.
//
// ch:   the bits accumulated so far.
// byte: the next byte of the sequence; only its low six bits (CONT_MASK)
//       are used, the two marker bits are discarded.
// Returns `ch` shifted left by six with the new six payload bits in the
// freed low positions.
constexpr auto utf8_acc_cont_byte(uint32_t ch, uint8_t byte) -> uint32_t {
    const uint32_t shifted = ch << 6;
    const uint32_t payload = byte & CONT_MASK;
    return shifted | payload;
}
// Reads the next code unit, or yields 0 when the input is exhausted.
//
// iter: current position; advanced by one when a code unit is consumed
//       (it is a reference to the caller's iterator), left untouched at end.
// end:  one-past-the-last position of the input.
// Returns the consumed code unit as a byte, or 0 if `iter == end`.
constexpr auto unwrap_or_0(utf8_iter iter, utf8_iter end) -> uint8_t {
    if (iter != end) {
        const uint8_t unit = *iter;
        ++iter;
        return unit;
    }
    return 0;
}
// Decodes the next code point from a UTF-8 code unit sequence.
//
// iter: current position; advanced past every code unit consumed.
// end:  one-past-the-last position of the input.
// Returns std::nullopt when `iter == end`, otherwise the decoded value.
//
// No validation is performed: the sequence length is chosen purely from the
// leading byte (< 0x80: one byte, < 0xE0: two, < 0xF0: three, otherwise
// four), continuation bytes are not checked for the 10xxxxxx pattern, and a
// sequence cut short by the end of input is completed with zero bytes.
auto next_code_point(utf8_iter iter, utf8_iter end) -> std::optional<uint32_t> {
    if (iter == end) {
        return std::nullopt;
    }

    const uint8_t lead = *iter++;

    // Single-byte (ASCII) sequence: the byte is the code point.
    if (lead < 0x80) {
        return static_cast<uint32_t>(lead);
    }

    // Multi-byte sequence. Keep the low five bits of the leading byte; that
    // is exact for two-byte leads and is narrowed further below as needed.
    const uint32_t lead_bits = utf8_first_byte(lead, 2);
    const uint8_t second = unwrap_or_0(iter, end);

    // Two-byte sequence: 5 + 6 payload bits.
    if (lead < 0xE0) {
        return utf8_acc_cont_byte(lead_bits, second);
    }

    // Three or four bytes: combine the payloads of the 2nd and 3rd bytes.
    const uint8_t third = unwrap_or_0(iter, end);
    const uint32_t tail = utf8_acc_cont_byte(second & CONT_MASK, third);

    // Three-byte sequence: leading bits sit above the 12 tail bits.
    if (lead < 0xF0) {
        return (lead_bits << 12) | tail;
    }

    // Four-byte sequence: only the low three bits of the leading byte are
    // payload; they sit above the 18 bits from the three trailing bytes.
    const uint8_t fourth = unwrap_or_0(iter, end);
    return ((lead_bits & 7) << 18) | utf8_acc_cont_byte(tail, fourth);
}
auto convert(const utf8& bytes) -> std::u32string {
auto iter = bytes.begin();
auto end = bytes.end();
auto out = std::u32string{};
while(true) {
auto next = next_code_point(iter, end);
if (!next.has_value()) {
break;
}
out.push_back(next.value());
}
return out;
}
auto main() -> int {
std::u8string_view utf8 = u8"가나다라";
auto utf32 = convert(utf8);
for (auto ch: utf32) {
std::cout << static_cast<uint32_t>(ch) << std::endl;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment