Last active
April 8, 2023 18:44
-
-
Save prot0man/a2552cfcb9cc9aef368c415c12a6effa to your computer and use it in GitHub Desktop.
Detects whether a string has any invalid utf-8 characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Stolen from chatgpt | |
#include <stdbool.h> | |
#include <stddef.h> | |
#include <stdint.h> | |
bool has_invalid_utf8(const char *s) | |
{ | |
const uint8_t *p = (const uint8_t *)s; | |
while (*p != '\0') { | |
if (*p < 0x80) { | |
// 1-byte sequence (ASCII) | |
p++; | |
} else if ((*p & 0xE0) == 0xC0) { | |
// 2-byte sequence | |
if ((p[1] & 0xC0) != 0x80 || (p[1] & 0x3E) == 0) { | |
return true; // invalid sequence | |
} | |
p += 2; | |
} else if ((*p & 0xF0) == 0xE0) { | |
// 3-byte sequence | |
if ((p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 || | |
(*p == 0xE0 && (p[1] & 0x1F) == 0)) { | |
return true; // invalid sequence | |
} | |
p += 3; | |
} else if ((*p & 0xF8) == 0xF0) { | |
// 4-byte sequence | |
if ((p[1] & 0xC0) != 0x80 || (p[2] & 0xC0) != 0x80 || | |
(p[3] & 0xC0) != 0x80 || (*p == 0xF0 && (p[1] & 0x0F) == 0)) { | |
return true; // invalid sequence | |
} | |
p += 4; | |
} else { | |
return true; // invalid start byte | |
} | |
} | |
return false; // no invalid sequence found | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment