Skip to content

Instantly share code, notes, and snippets.

@PAMinerva
Last active January 17, 2025 18:49
Show Gist options
  • Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.
Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.
Convert between std::string and std::wstring without using deprecated functionalities
#include "strconv.h"
// Per comprendere meglio il funzionamento di queste funzioni, si consiglia di leggere:
// https://en.wikipedia.org/wiki/UTF-8
// https://en.wikipedia.org/wiki/UTF-16
// https://en.wikipedia.org/wiki/UTF-32
// UTF-8 encodes code points in one to four bytes, depending on the value of the code point.
// In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
// Code point ↔ UTF-8 conversion
// First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
// U+000000 U+00007F 0yyyzzzz
// U+000080 U+0007FF 110xxxyy 10yyzzzz
// U+000800 U+00FFFF 1110wwww 10xxxxyy 10yyzzzz
// U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
//
// Esempio:
// Il carattere alef (א), che corrisponde all'Unicode U+05D0, viene rappresentato in UTF-8 con questo procedimento:
// - ricade nell'intervallo da 0x0080 a 0x07FF. Secondo la tabella verrà rappresentato con due byte (110xxxyy 10yyzzzz);
// - l'esadecimale 0x05D0 equivale al binario 101-1101-0000 (xxx=101=5, yyyy=1101=D, zzzz=0000=0);
// - gli undici bit vengono copiati in ordine nelle posizioni marcate con x, y e z: 110-10111 10-010000;
// - il risultato finale è la coppia di byte 11010111 10010000, o in esadecimale 0xD7 0x90.
// UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
// as single 16-bit code units that are numerically equal to the corresponding code points
//
// UTF-16 encodes code points in the range U+10000 to U+10FFFF as two 16-bit code units called surrogate pair.
// The first code unit is a high surrogate and the second is a low surrogate.
// The high surrogate is in the range U+D800 to U+DBFF, and the low surrogate is in the range U+DC00 to U+DFFF.
//
// - 0x10000 is subtracted from the code point (U), leaving a 20-bit number (U') in the hex number range 0x00000–0xFFFFF.
// - The high ten bits (in the range 0x000–0x3FF) are added to 0xD800 to give the first 16-bit code unit or high surrogate (W1),
// which will be in the range 0xD800–0xDBFF.
// - The low ten bits (also in the range 0x000–0x3FF) are added to 0xDC00 to give the second 16-bit code unit or low surrogate (W2),
// which will be in the range 0xDC00–0xDFFF.
//
// Illustrated visually, the distribution of U' between W1 and W2 looks like:
//
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
// W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
// W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx
//
// Examples:
// To encode U+10437 (𐐷) to UTF-16:
// - Subtract 0x10000 from the code point, leaving 0x0437.
// - For the high surrogate, shift right by 10 (divide by 0x400), then add 0xD800, resulting in 0x0001 + 0xD800 = 0xD801.
// - For the low surrogate, take the low 10 bits (remainder of dividing by 0x400), then add 0xDC00, resulting in 0x0037 + 0xDC00 = 0xDC37.
//
// To decode U+10437 (𐐷) from UTF-16:
// - Take the high surrogate (0xD801) and subtract 0xD800, then shift left by 10 (multiply by 0x400), resulting in 0x0001 × 0x400 = 0x0400.
// - Take the low surrogate (0xDC37) and subtract 0xDC00, resulting in 0x37.
// - Add these two results together (0x0437), and finally add 0x10000 to get the final code point, 0x10437.
void StringConvert(const std::string& from, std::wstring& to) {
to.clear();
size_t i = 0;
while (i < from.size()) {
uint32_t codepoint = 0;
unsigned char c = from[i];
if (c < 0x80) { // Se il byte è minore di 0x80 (128), allora è un carattere ASCII
codepoint = c; // in quel caso, valore di c è uguale al codepoint
i += 1;
} else if ((c & 0xE0) == 0xC0) { // 0xE0 = 11100000, 0xC0 = 11000000
if (i + 1 >= from.size()) break;
// combina tramite OR i 5 bit meno significativi di c con i 6 bit meno significativi di from[i + 1] per ottenere il codepoint
// in modo che i 5 bit meno significativi di c vengano prima e i 6 bit meno significativi di from[i + 1] vengano dopo.
codepoint = ((c & 0x1F) << 6) | // 0x1F = 00011111, quindi (c & 0x1F) << 6) prende i 5 bit meno significativi di c e li sposta a sinistra di 6 posizioni
(from[i + 1] & 0x3F); // 0x3F = 00111111, quindi (from[i + 1] & 0x3F) prende i 6 bit meno significativi di from[i + 1]
i += 2;
} else if ((c & 0xF0) == 0xE0) { // 0xF0 = 11110000, 0xE0 = 11100000
if (i + 2 >= from.size()) break;
// combina tramite OR i 4 bit meno significativi di c con i 6 bit meno significativi di from[i + 1] e i 6 bit meno significativi di from[i + 2] per ottenere il codepoint
// in modo che i 4 bit meno significativi di c vengano prima, i 6 bit meno significativi di from[i + 1] vengano dopo e i 6 bit meno significativi di from[i + 2] vengano alla fine.
codepoint = ((c & 0x0F) << 12) | // 0x0F = 00001111, quindi (c & 0x0F) << 12 prende i 4 bit meno significativi di c e li sposta a sinistra di 12 posizioni
((from[i + 1] & 0x3F) << 6) | // (from[i + 1] & 0x3F) prende i 6 bit meno significativi di from[i + 1] e li sposta a sinistra di 6 posizioni
(from[i + 2] & 0x3F); // (from[i + 2] & 0x3F) prende i 6 bit meno significativi di from[i + 2]
i += 3;
} else if ((c & 0xF8) == 0xF0) { // 0xF8 = 11111000, 0xF0 = 11110000
if (i + 3 >= from.size()) break;
codepoint = ((c & 0x07) << 18) | // 0x07 = 00000111, quindi ((c & 0x07) << 18) prende i 3 bit meno significativi di c e li sposta a sinistra di 18 posizioni
((from[i + 1] & 0x3F) << 12) | // ((from[i + 1] & 0x3F) << 12) prende i 6 bit meno significativi di from[i + 1] e li sposta a sinistra di 12 posizioni
((from[i + 2] & 0x3F) << 6) | // ((from[i + 2] & 0x3F) << 6) prende i 6 bit meno significativi di from[i + 2] e li sposta a sinistra di 6 posizioni
(from[i + 3] & 0x3F);
i += 4;
} else {
// Sequenza UTF-8 non valida
++i;
continue;
}
if constexpr (sizeof(wchar_t) >= 4) {
// wchar_t a 32 bit (es. Linux)
to += static_cast<wchar_t>(codepoint);
} else {
// wchar_t a 16 bit (es. Windows)
if (codepoint <= 0xFFFF) {
to += static_cast<wchar_t>(codepoint);
} else {
// Converti in surrogati UTF-16
codepoint -= 0x10000;
to += static_cast<wchar_t>((codepoint >> 10) + 0xD800);
to += static_cast<wchar_t>((codepoint & 0x3FF) + 0xDC00);
}
}
}
}
// Encodes a wide string as UTF-8.
//
// With a 32-bit wchar_t (e.g. Linux) each element is taken as one code
// point; with a 16-bit wchar_t (e.g. Windows) well-formed UTF-16 surrogate
// pairs are combined into a single code point first.
//
// Unpaired surrogates and values outside the Unicode range (> U+10FFFF)
// are replaced with U+FFFD rather than emitting ill-formed UTF-8 (a lone
// high surrogate must not leak a stray NUL byte, and lone surrogates must
// not be CESU-8-encoded).
void StringConvert(const std::wstring& from, std::string& to) {
    to.clear();
    for (size_t i = 0; i < from.size(); ++i) {
        uint32_t codepoint = static_cast<uint32_t>(from[i]);
        if constexpr (sizeof(wchar_t) < 4) {
            // 16-bit wchar_t: try to combine a high/low surrogate pair.
            if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
                uint32_t low = 0;
                if (i + 1 < from.size())
                    low = static_cast<uint32_t>(from[i + 1]);
                if (low >= 0xDC00 && low <= 0xDFFF) {
                    // (high - 0xD800) supplies the top 10 bits, (low - 0xDC00)
                    // the bottom 10; re-add the 0x10000 bias.
                    codepoint = (((codepoint - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
                    ++i; // the low surrogate was consumed as well
                }
                // else: unpaired high surrogate — handled below
            }
        }
        // Anything still in the surrogate range here is unpaired, and values
        // above U+10FFFF are not Unicode code points: substitute U+FFFD.
        if ((codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF)
            codepoint = 0xFFFD;
        // Emit 1-4 bytes depending on the code point's magnitude.
        if (codepoint <= 0x7F) {
            to += static_cast<char>(codepoint);
        } else if (codepoint <= 0x7FF) {
            to += static_cast<char>(0xC0 | (codepoint >> 6));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        } else if (codepoint <= 0xFFFF) {
            to += static_cast<char>(0xE0 | (codepoint >> 12));
            to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        } else { // codepoint <= 0x10FFFF is guaranteed by the clamp above
            to += static_cast<char>(0xF0 | (codepoint >> 18));
            to += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
            to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment