// Source: GitHub gist Riey/7c1724cc2c9cd10359af3b199f780606
// Created: January 2, 2021 13:08
// Note: this file contains hidden or bidirectional Unicode text that may be
// interpreted or compiled differently than what appears below. To review it,
// open the file in an editor that reveals hidden Unicode characters.
#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <string_view>
#include <vector>
// Non-owning view over a sequence of UTF-8 code units (char8_t).
using utf8 = std::u8string_view;
// NOTE: this alias names a *reference* to a const_iterator. A function that
// declares a `utf8_iter` parameter therefore receives the caller's iterator
// itself (a non-const lvalue is required) and can advance it in place.
using utf8_iter = std::u8string_view::const_iterator&;
// Mask selecting the low six bits of a byte, i.e. the payload bits of a UTF-8
// continuation byte (10xxxxxx). A typed constant instead of a macro so it
// obeys scoping rules and has a well-defined type.
constexpr uint8_t CONT_MASK = 0b00111111;
/// Masks a byte with `0x7F >> width` and widens the result to 32 bits.
///
/// @param byte  the byte to mask (callers in this file pass the leading byte
///              of a multi-byte UTF-8 sequence).
/// @param width how far the 0x7F mask is shifted right before being applied.
/// @return `byte & (0x7F >> width)`; 0 when `width >= 7`, because the mask
///         has no bits left by then.
constexpr auto utf8_first_byte(uint8_t byte, uint32_t width) -> uint32_t {
    // 0x7F >> 7 is already 0; handling it explicitly also avoids the
    // undefined behaviour of shifting an int by 32 or more bits.
    return width < 7 ? static_cast<uint32_t>(byte & (0x7F >> width)) : 0u;
}
constexpr auto utf8_acc_cont_byte(uint32_t ch, uint8_t byte) -> uint32_t { | |
return (ch << 6) | (byte & CONT_MASK); | |
} | |
/// Reads one byte from `iter` and advances it, or yields 0 at end of input.
///
/// @param iter current position; taken by reference (see `utf8_iter`) and
///             incremented by one when a byte is consumed.
/// @param end  one-past-the-last position of the input.
/// @return the byte that was at `iter`, or 0 if `iter == end` (in which case
///         `iter` is left untouched).
constexpr auto unwrap_or_0(utf8_iter iter, utf8_iter end) -> uint8_t {
    if (iter == end) {
        return 0;
    }
    const auto byte = static_cast<uint8_t>(*iter);
    ++iter;
    return byte;
}
/// Decodes the next code point from a UTF-8 byte sequence.
///
/// The input is NOT validated: the length of the sequence is inferred from
/// the leading byte alone, continuation bytes are masked without checking
/// their top bits, and a sequence truncated by the end of input is completed
/// with zero bytes. Feed it well-formed UTF-8 to get meaningful results.
///
/// @param iter current position; taken by reference (see `utf8_iter`) and
///             advanced past every byte that was consumed.
/// @param end  one-past-the-last position of the input.
/// @return the decoded code point, or `std::nullopt` when `iter == end`.
auto next_code_point(utf8_iter iter, utf8_iter end) -> std::optional<uint32_t> {
    if (iter == end) {
        return std::nullopt;
    }

    const auto x = static_cast<uint8_t>(*iter);
    ++iter;

    // Single-byte (ASCII) fast path.
    if (x < 0x80) {
        return static_cast<uint32_t>(x);
    }

    // Multi-byte case. Start by decoding as a two-byte sequence: keep the low
    // five bits of the leading byte and append the first continuation byte.
    const uint32_t init = utf8_first_byte(x, 2);
    const uint8_t y = unwrap_or_0(iter, end);
    uint32_t ch = utf8_acc_cont_byte(init, y);

    if (x >= 0xE0) {
        // Three-byte sequence: [x y z].
        const uint8_t z = unwrap_or_0(iter, end);
        const uint32_t y_z =
            utf8_acc_cont_byte(static_cast<uint32_t>(y & CONT_MASK), z);
        ch = (init << 12) | y_z;

        if (x >= 0xF0) {
            // Four-byte sequence: [x y z w]; only the low three bits of the
            // leading byte carry payload.
            const uint8_t w = unwrap_or_0(iter, end);
            ch = ((init & 7) << 18) | utf8_acc_cont_byte(y_z, w);
        }
    }

    return ch;
}
auto convert(const utf8& bytes) -> std::u32string { | |
auto iter = bytes.begin(); | |
auto end = bytes.end(); | |
auto out = std::u32string{}; | |
while(true) { | |
auto next = next_code_point(iter, end); | |
if (!next.has_value()) { | |
break; | |
} | |
out.push_back(next.value()); | |
} | |
return out; | |
} | |
auto main() -> int { | |
std::u8string_view utf8 = u8"가나다라"; | |
auto utf32 = convert(utf8); | |
for (auto ch: utf32) { | |
std::cout << static_cast<uint32_t>(ch) << std::endl; | |
} | |
} |
// (GitHub page footer, not part of the program: "Sign up for free to join this
// conversation on GitHub. Already have an account? Sign in to comment")