Convert between std::string and std::wstring without using deprecated functionality
#include "strconv.h" | |
// To better understand how these functions work, it is recommended to read:
// https://en.wikipedia.org/wiki/UTF-8
// https://en.wikipedia.org/wiki/UTF-16
// https://en.wikipedia.org/wiki/UTF-32
// UTF-8 encodes code points in one to four bytes, depending on the value of the code point.
// In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
//
// Code point ↔ UTF-8 conversion
// First code point   Last code point   Byte 1     Byte 2     Byte 3     Byte 4
// U+000000           U+00007F          0yyyzzzz
// U+000080           U+0007FF          110xxxyy   10yyzzzz
// U+000800           U+00FFFF          1110wwww   10xxxxyy   10yyzzzz
// U+010000           U+10FFFF          11110uvv   10vvwwww   10xxxxyy   10yyzzzz
//
// Example:
// The character alef (א), which corresponds to the Unicode code point U+05D0, is encoded in UTF-8 as follows:
// - it falls in the range 0x0080 to 0x07FF; according to the table, it is therefore encoded with two bytes (110xxxyy 10yyzzzz);
// - the hexadecimal 0x05D0 equals the binary 101-1101-0000 (xxx=101=5, yyyy=1101=D, zzzz=0000=0);
// - the eleven bits are copied in order into the positions marked x, y, and z: 110-10111 10-010000;
// - the final result is the byte pair 11010111 10010000, or 0xD7 0x90 in hexadecimal.
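
// A compile-time sanity check of the alef example above (an illustrative
// addition, not part of the original gist): it applies the 110xxxyy 10yyzzzz
// template to U+05D0 and confirms the byte pair derived in the comments.
static_assert((0xC0 | ((0x05D0 >> 6) & 0x1F)) == 0xD7 &&
              (0x80 | (0x05D0 & 0x3F)) == 0x90,
              "U+05D0 must encode as 0xD7 0x90 in UTF-8");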
// UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
// as single 16-bit code units that are numerically equal to the corresponding code points.
//
// UTF-16 encodes code points in the range U+10000 to U+10FFFF as two 16-bit code units called a surrogate pair.
// The first code unit is a high surrogate and the second is a low surrogate.
// The high surrogate is in the range U+D800 to U+DBFF, and the low surrogate is in the range U+DC00 to U+DFFF.
//
// - 0x10000 is subtracted from the code point (U), leaving a 20-bit number (U') in the hex number range 0x00000–0xFFFFF.
// - The high ten bits (in the range 0x000–0x3FF) are added to 0xD800 to give the first 16-bit code unit or high surrogate (W1),
//   which will be in the range 0xD800–0xDBFF.
// - The low ten bits (also in the range 0x000–0x3FF) are added to 0xDC00 to give the second 16-bit code unit or low surrogate (W2),
//   which will be in the range 0xDC00–0xDFFF.
//
// Illustrated visually, the distribution of U' between W1 and W2 looks like:
//
// U' = yyyyyyyyyyxxxxxxxxxx   // U - 0x10000
// W1 = 110110yyyyyyyyyy       // 0xD800 + yyyyyyyyyy
// W2 = 110111xxxxxxxxxx       // 0xDC00 + xxxxxxxxxx
//
// Examples:
// To encode U+10437 (𐐷) to UTF-16:
// - Subtract 0x10000 from the code point, leaving 0x0437.
// - For the high surrogate, shift right by 10 (divide by 0x400), then add 0xD800, resulting in 0x0001 + 0xD800 = 0xD801.
// - For the low surrogate, take the low 10 bits (remainder of dividing by 0x400), then add 0xDC00, resulting in 0x0037 + 0xDC00 = 0xDC37.
//
// To decode U+10437 (𐐷) from UTF-16:
// - Take the high surrogate (0xD801) and subtract 0xD800, then shift left by 10 (multiply by 0x400), resulting in 0x0001 × 0x400 = 0x0400.
// - Take the low surrogate (0xDC37) and subtract 0xDC00, resulting in 0x37.
// - Add these two results together (0x0437), and finally add 0x10000 to get the final code point, 0x10437.
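
// Compile-time sanity checks of the U+10437 example above (an illustrative
// addition, not part of the original gist): the arithmetic in the comments
// matches the surrogate formulas used by the functions below.
static_assert(((0x10437 - 0x10000) >> 10) + 0xD800 == 0xD801 &&
              ((0x10437 - 0x10000) & 0x3FF) + 0xDC00 == 0xDC37,
              "U+10437 must encode as the surrogate pair 0xD801 0xDC37");
static_assert(((0xD801 - 0xD800) * 0x400) + (0xDC37 - 0xDC00) + 0x10000 == 0x10437,
              "the surrogate pair 0xD801 0xDC37 must decode back to U+10437");
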
void StringConvert(const std::string& from, std::wstring& to) {
    to.clear();
    size_t i = 0;
    while (i < from.size()) {
        uint32_t codepoint = 0;
        unsigned char c = from[i];
        if (c < 0x80) { // If the byte is less than 0x80 (128), it is an ASCII character
            codepoint = c; // in that case, the value of c equals the code point
            i += 1;
        } else if ((c & 0xE0) == 0xC0) { // 0xE0 = 11100000, 0xC0 = 11000000: two-byte sequence
            if (i + 1 >= from.size()) break;
            // OR together the 5 least significant bits of c and the 6 least significant bits of from[i + 1]
            // to get the code point, with the 5 bits of c first and the 6 bits of from[i + 1] after.
            codepoint = ((c & 0x1F) << 6) | // 0x1F = 00011111, so (c & 0x1F) << 6 takes the 5 least significant bits of c and shifts them left by 6 positions
                        (from[i + 1] & 0x3F); // 0x3F = 00111111, so (from[i + 1] & 0x3F) takes the 6 least significant bits of from[i + 1]
            i += 2;
        } else if ((c & 0xF0) == 0xE0) { // 0xF0 = 11110000, 0xE0 = 11100000: three-byte sequence
            if (i + 2 >= from.size()) break;
            // OR together the 4 least significant bits of c, the 6 least significant bits of from[i + 1],
            // and the 6 least significant bits of from[i + 2] to get the code point, in that order.
            codepoint = ((c & 0x0F) << 12) | // 0x0F = 00001111, so (c & 0x0F) << 12 takes the 4 least significant bits of c and shifts them left by 12 positions
                        ((from[i + 1] & 0x3F) << 6) | // (from[i + 1] & 0x3F) << 6 takes the 6 least significant bits of from[i + 1] and shifts them left by 6 positions
                        (from[i + 2] & 0x3F); // (from[i + 2] & 0x3F) takes the 6 least significant bits of from[i + 2]
            i += 3;
        } else if ((c & 0xF8) == 0xF0) { // 0xF8 = 11111000, 0xF0 = 11110000: four-byte sequence
            if (i + 3 >= from.size()) break;
            codepoint = ((c & 0x07) << 18) | // 0x07 = 00000111, so (c & 0x07) << 18 takes the 3 least significant bits of c and shifts them left by 18 positions
                        ((from[i + 1] & 0x3F) << 12) | // ((from[i + 1] & 0x3F) << 12) takes the 6 least significant bits of from[i + 1] and shifts them left by 12 positions
                        ((from[i + 2] & 0x3F) << 6) | // ((from[i + 2] & 0x3F) << 6) takes the 6 least significant bits of from[i + 2] and shifts them left by 6 positions
                        (from[i + 3] & 0x3F);
            i += 4;
        } else {
            // Invalid UTF-8 sequence: skip this byte
            ++i;
            continue;
        }
        if constexpr (sizeof(wchar_t) >= 4) {
            // 32-bit wchar_t (e.g. Linux)
            to += static_cast<wchar_t>(codepoint);
        } else {
            // 16-bit wchar_t (e.g. Windows)
            if (codepoint <= 0xFFFF) {
                to += static_cast<wchar_t>(codepoint);
            } else {
                // Convert to a UTF-16 surrogate pair
                codepoint -= 0x10000;
                to += static_cast<wchar_t>((codepoint >> 10) + 0xD800);
                to += static_cast<wchar_t>((codepoint & 0x3FF) + 0xDC00);
            }
        }
    }
}
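
// Note (added remark): the decoder above is deliberately lenient. It does not
// verify that trailing bytes match the 10xxxxxx continuation pattern; a stricter
// variant would also check (from[i + 1] & 0xC0) == 0x80 for each continuation
// byte and reject overlong encodings and code points in the surrogate range.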
void StringConvert(const std::wstring& from, std::string& to) {
    to.clear();
    for (size_t i = 0; i < from.size(); ++i) {
        uint32_t codepoint = 0;
        wchar_t wc = from[i];
        if constexpr (sizeof(wchar_t) >= 4) {
            // 32-bit wchar_t (e.g. Linux)
            codepoint = static_cast<uint32_t>(wc);
        } else {
            // 16-bit wchar_t (e.g. Windows)
            if (wc >= 0xD800 && wc <= 0xDBFF) {
                // High surrogate
                if (i + 1 < from.size()) {
                    wchar_t wc_low = from[i + 1];
                    if (wc_low >= 0xDC00 && wc_low <= 0xDFFF) {
                        // Low surrogate
                        codepoint = ((static_cast<uint32_t>(wc - 0xD800) << 10) |
                                     (static_cast<uint32_t>(wc_low - 0xDC00))) + 0x10000;
                        ++i; // Skip the low surrogate
                    } else {
                        continue; // Unpaired high surrogate: skip it instead of emitting U+0000
                    }
                } else {
                    continue; // Truncated surrogate pair at the end of the string
                }
            } else {
                codepoint = static_cast<uint32_t>(wc);
            }
        }
        if (codepoint <= 0x7F) {
            to += static_cast<char>(codepoint);
        } else if (codepoint <= 0x7FF) {
            to += static_cast<char>(0xC0 | ((codepoint >> 6) & 0x1F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        } else if (codepoint <= 0xFFFF) {
            to += static_cast<char>(0xE0 | ((codepoint >> 12) & 0x0F));
            to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        } else if (codepoint <= 0x10FFFF) {
            to += static_cast<char>(0xF0 | ((codepoint >> 18) & 0x07));
            to += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
            to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        }
    }
}
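
// Minimal usage sketch (a hypothetical demo, not part of the gist's interface;
// the STRCONV_DEMO macro is invented for this sketch). Compile this file with
// -DSTRCONV_DEMO to run a round trip across 1-, 2-, 3- and 4-byte UTF-8 sequences.
#ifdef STRCONV_DEMO
#include <cassert>
#include <iostream>

int main() {
    // "A", U+05D0 (alef), U+20AC (euro sign), U+10437 (𐐷) as raw UTF-8 bytes
    const std::string utf8 = "A\xD7\x90\xE2\x82\xAC\xF0\x90\x90\xB7";
    std::wstring wide;
    StringConvert(utf8, wide);  // UTF-8 -> UTF-16 or UTF-32, depending on wchar_t
    std::string back;
    StringConvert(wide, back);  // ... and back again
    assert(back == utf8);       // lossless round trip for valid input
    std::cout << "round trip OK (" << wide.size() << " wide code units)\n";
}
#endif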