Skip to content

Instantly share code, notes, and snippets.

@PAMinerva
Last active January 17, 2025 18:49
Show Gist options
  • Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.
Save PAMinerva/8b506f360389b80b878b84f99eefb8a7 to your computer and use it in GitHub Desktop.
Convert between std::string and std::wstring without using deprecated functionalities
#include "strconv.h"
// Per comprendere meglio il funzionamento di queste funzioni, si consiglia di leggere:
// https://en.wikipedia.org/wiki/UTF-8
// https://en.wikipedia.org/wiki/UTF-16
// https://en.wikipedia.org/wiki/UTF-32
// UTF-8 encodes code points in one to four bytes, depending on the value of the code point.
// In the following table, the characters u to z are replaced by the bits of the code point, from the positions U+uvwxyz:
// Code point ↔ UTF-8 conversion
// First code point Last code point Byte 1 Byte 2 Byte 3 Byte 4
// U+000000 U+00007F 0yyyzzzz
// U+000080 U+0007FF 110xxxyy 10yyzzzz
// U+000800 U+00FFFF 1110wwww 10xxxxyy 10yyzzzz
// U+010000 U+10FFFF 11110uvv 10vvwwww 10xxxxyy 10yyzzzz
//
// Esempio:
// Il carattere alef (א), che corrisponde all'Unicode U+05D0, viene rappresentato in UTF-8 con questo procedimento:
// - ricade nell'intervallo da 0x0080 a 0x07FF. Secondo la tabella verrà rappresentato con due byte (110xxxyy 10yyzzzz);
// - l'esadecimale 0x05D0 equivale al binario 101-1101-0000 (xxx=101=5, yyyy=1101=D, zzzz=0000=0);
// - gli undici bit vengono copiati in ordine nelle posizioni marcate con x, y e z: 110-10111 10-010000;
// - il risultato finale è la coppia di byte 11010111 10010000, o in esadecimale 0xD7 0x90.
// UTF-16 encodes code points in the range U+0000 to U+D7FF and U+E000 to U+FFFF (U+D800 to U+DFFF have a special purpose, see below)
// as single 16-bit code units that are numerically equal to the corresponding code points
//
// UTF-16 encodes code points in the range U+10000 to U+10FFFF as two 16-bit code units called surrogate pair.
// The first code unit is a high surrogate and the second is a low surrogate.
// The high surrogate is in the range U+D800 to U+DBFF, and the low surrogate is in the range U+DC00 to U+DFFF.
//
// - 0x10000 is subtracted from the code point (U), leaving a 20-bit number (U') in the hex number range 0x00000–0xFFFFF.
// - The high ten bits (in the range 0x000–0x3FF) are added to 0xD800 to give the first 16-bit code unit or high surrogate (W1),
// which will be in the range 0xD800–0xDBFF.
// - The low ten bits (also in the range 0x000–0x3FF) are added to 0xDC00 to give the second 16-bit code unit or low surrogate (W2),
// which will be in the range 0xDC00–0xDFFF.
//
// Illustrated visually, the distribution of U' between W1 and W2 looks like:
//
// U' = yyyyyyyyyyxxxxxxxxxx // U - 0x10000
// W1 = 110110yyyyyyyyyy // 0xD800 + yyyyyyyyyy
// W2 = 110111xxxxxxxxxx // 0xDC00 + xxxxxxxxxx
//
// Examples:
// To encode U+10437 (𐐷) to UTF-16:
// - Subtract 0x10000 from the code point, leaving 0x0437.
// - For the high surrogate, shift right by 10 (divide by 0x400), then add 0xD800, resulting in 0x0001 + 0xD800 = 0xD801.
// - For the low surrogate, take the low 10 bits (remainder of dividing by 0x400), then add 0xDC00, resulting in 0x0037 + 0xDC00 = 0xDC37.
//
// To decode U+10437 (𐐷) from UTF-16:
// - Take the high surrogate (0xD801) and subtract 0xD800, then shift left by 10 (multiply by 0x400), resulting in 0x0001 × 0x400 = 0x0400.
// - Take the low surrogate (0xDC37) and subtract 0xDC00, resulting in 0x37.
// - Add these two results together (0x0437), and finally add 0x10000 to get the final code point, 0x10437.
void StringConvert(const std::string& from, std::wstring& to) {
to.clear();
size_t i = 0;
while (i < from.size()) {
uint32_t codepoint = 0;
unsigned char c = from[i];
if (c < 0x80) { // Se il byte è minore di 0x80 (128), allora è un carattere ASCII
codepoint = c; // in quel caso, valore di c è uguale al codepoint
i += 1;
} else if ((c & 0xE0) == 0xC0) { // 0xE0 = 11100000, 0xC0 = 11000000
if (i + 1 >= from.size()) break;
// combina tramite OR i 5 bit meno significativi di c con i 6 bit meno significativi di from[i + 1] per ottenere il codepoint
// in modo che i 5 bit meno significativi di c vengano prima e i 6 bit meno significativi di from[i + 1] vengano dopo.
codepoint = ((c & 0x1F) << 6) | // 0x1F = 00011111, quindi (c & 0x1F) << 6) prende i 5 bit meno significativi di c e li sposta a sinistra di 6 posizioni
(from[i + 1] & 0x3F); // 0x3F = 00111111, quindi (from[i + 1] & 0x3F) prende i 6 bit meno significativi di from[i + 1]
i += 2;
} else if ((c & 0xF0) == 0xE0) { // 0xF0 = 11110000, 0xE0 = 11100000
if (i + 2 >= from.size()) break;
// combina tramite OR i 4 bit meno significativi di c con i 6 bit meno significativi di from[i + 1] e i 6 bit meno significativi di from[i + 2] per ottenere il codepoint
// in modo che i 4 bit meno significativi di c vengano prima, i 6 bit meno significativi di from[i + 1] vengano dopo e i 6 bit meno significativi di from[i + 2] vengano alla fine.
codepoint = ((c & 0x0F) << 12) | // 0x0F = 00001111, quindi (c & 0x0F) << 12 prende i 4 bit meno significativi di c e li sposta a sinistra di 12 posizioni
((from[i + 1] & 0x3F) << 6) | // (from[i + 1] & 0x3F) prende i 6 bit meno significativi di from[i + 1] e li sposta a sinistra di 6 posizioni
(from[i + 2] & 0x3F); // (from[i + 2] & 0x3F) prende i 6 bit meno significativi di from[i + 2]
i += 3;
} else if ((c & 0xF8) == 0xF0) { // 0xF8 = 11111000, 0xF0 = 11110000
if (i + 3 >= from.size()) break;
codepoint = ((c & 0x07) << 18) | // 0x07 = 00000111, quindi ((c & 0x07) << 18) prende i 3 bit meno significativi di c e li sposta a sinistra di 18 posizioni
((from[i + 1] & 0x3F) << 12) | // ((from[i + 1] & 0x3F) << 12) prende i 6 bit meno significativi di from[i + 1] e li sposta a sinistra di 12 posizioni
((from[i + 2] & 0x3F) << 6) | // ((from[i + 2] & 0x3F) << 6) prende i 6 bit meno significativi di from[i + 2] e li sposta a sinistra di 6 posizioni
(from[i + 3] & 0x3F);
i += 4;
} else {
// Sequenza UTF-8 non valida
++i;
continue;
}
if constexpr (sizeof(wchar_t) >= 4) {
// wchar_t a 32 bit (es. Linux)
to += static_cast<wchar_t>(codepoint);
} else {
// wchar_t a 16 bit (es. Windows)
if (codepoint <= 0xFFFF) {
to += static_cast<wchar_t>(codepoint);
} else {
// Converti in surrogati UTF-16
codepoint -= 0x10000;
to += static_cast<wchar_t>((codepoint >> 10) + 0xD800);
to += static_cast<wchar_t>((codepoint & 0x3FF) + 0xDC00);
}
}
}
}
// Encodes a wide string as UTF-8.
//
// With a 32-bit wchar_t (e.g. Linux) each element is taken as one code
// point; with a 16-bit wchar_t (e.g. Windows) well-formed UTF-16 surrogate
// pairs are combined into a single code point first.
//
// Unpaired surrogates and values outside the Unicode range (> U+10FFFF)
// are replaced with U+FFFD rather than emitting ill-formed UTF-8 (a lone
// high surrogate must not leak a stray NUL byte, and lone surrogates must
// not be CESU-8-encoded).
void StringConvert(const std::wstring& from, std::string& to) {
    to.clear();
    for (size_t i = 0; i < from.size(); ++i) {
        uint32_t codepoint = static_cast<uint32_t>(from[i]);
        if constexpr (sizeof(wchar_t) < 4) {
            // 16-bit wchar_t: try to combine a high/low surrogate pair.
            if (codepoint >= 0xD800 && codepoint <= 0xDBFF) {
                uint32_t low = 0;
                if (i + 1 < from.size())
                    low = static_cast<uint32_t>(from[i + 1]);
                if (low >= 0xDC00 && low <= 0xDFFF) {
                    // (high - 0xD800) supplies the top 10 bits, (low - 0xDC00)
                    // the bottom 10; re-add the 0x10000 bias.
                    codepoint = (((codepoint - 0xD800) << 10) | (low - 0xDC00)) + 0x10000;
                    ++i; // the low surrogate was consumed as well
                }
                // else: unpaired high surrogate — handled below
            }
        }
        // Anything still in the surrogate range here is unpaired, and values
        // above U+10FFFF are not Unicode code points: substitute U+FFFD.
        if ((codepoint >= 0xD800 && codepoint <= 0xDFFF) || codepoint > 0x10FFFF)
            codepoint = 0xFFFD;
        // Emit 1-4 bytes depending on the code point's magnitude.
        if (codepoint <= 0x7F) {
            to += static_cast<char>(codepoint);
        } else if (codepoint <= 0x7FF) {
            to += static_cast<char>(0xC0 | (codepoint >> 6));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        } else if (codepoint <= 0xFFFF) {
            to += static_cast<char>(0xE0 | (codepoint >> 12));
            to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        } else { // codepoint <= 0x10FFFF is guaranteed by the clamp above
            to += static_cast<char>(0xF0 | (codepoint >> 18));
            to += static_cast<char>(0x80 | ((codepoint >> 12) & 0x3F));
            to += static_cast<char>(0x80 | ((codepoint >> 6) & 0x3F));
            to += static_cast<char>(0x80 | (codepoint & 0x3F));
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment