Skip to content

Instantly share code, notes, and snippets.

@prot0man
Last active April 8, 2023 18:44
Show Gist options
  • Save prot0man/a2552cfcb9cc9aef368c415c12a6effa to your computer and use it in GitHub Desktop.
Save prot0man/a2552cfcb9cc9aef368c415c12a6effa to your computer and use it in GitHub Desktop.
Detects whether a NUL-terminated string contains any invalid UTF-8 byte sequences
// Stolen from chatgpt
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
// Returns true when `b` has the 10xxxxxx bit pattern of a UTF-8 continuation byte.
static bool utf8_is_continuation(uint8_t b)
{
    return (b & 0xC0) == 0x80;
}

// Reports whether a NUL-terminated byte string contains malformed UTF-8.
//
// The string is checked against the well-formed byte sequences of RFC 3629
// (Unicode Table 3-7). The following are all treated as invalid:
//   - stray continuation bytes and the bytes 0xC0, 0xC1, 0xF5..0xFF
//   - overlong encodings (a code point encoded in more bytes than needed)
//   - UTF-16 surrogates U+D800..U+DFFF
//   - code points above U+10FFFF
//   - multi-byte sequences cut short by the terminating NUL
//
// s: NUL-terminated string to inspect. A NULL pointer is treated like an
//    empty string.
//
// Returns true if at least one invalid sequence is found, false otherwise.
bool has_invalid_utf8(const char *s)
{
    if (s == NULL) {
        return false;
    }

    const uint8_t *p = (const uint8_t *)s;

    while (*p != '\0') {
        const uint8_t lead = *p;

        if (lead < 0x80) {
            // 1-byte sequence (ASCII).
            p += 1;
        } else if (lead >= 0xC2 && lead <= 0xDF) {
            // 2-byte sequence. 0xC0 and 0xC1 are excluded because they could
            // only encode U+0000..U+007F (overlong).
            if (!utf8_is_continuation(p[1])) {
                return true;
            }
            p += 2;
        } else if ((lead & 0xF0) == 0xE0) {
            // 3-byte sequence. The allowed range of the second byte depends
            // on the lead byte.
            uint8_t lo = 0x80;
            uint8_t hi = 0xBF;
            if (lead == 0xE0) {
                lo = 0xA0; // below 0xA0 would be an overlong form of < U+0800
            } else if (lead == 0xED) {
                hi = 0x9F; // above 0x9F would encode a surrogate U+D800..U+DFFF
            }
            // Order matters: a NUL in p[1] fails the range test first, so
            // p[2] is never read past the terminator.
            if (p[1] < lo || p[1] > hi || !utf8_is_continuation(p[2])) {
                return true;
            }
            p += 3;
        } else if (lead >= 0xF0 && lead <= 0xF4) {
            // 4-byte sequence. 0xF5..0xF7 are excluded because they can only
            // encode values above U+10FFFF.
            uint8_t lo = 0x80;
            uint8_t hi = 0xBF;
            if (lead == 0xF0) {
                lo = 0x90; // below 0x90 would be an overlong form of < U+10000
            } else if (lead == 0xF4) {
                hi = 0x8F; // above 0x8F would exceed U+10FFFF
            }
            // Short-circuit evaluation stops at the first NUL, so no byte
            // beyond the terminator is inspected.
            if (p[1] < lo || p[1] > hi ||
                !utf8_is_continuation(p[2]) || !utf8_is_continuation(p[3])) {
                return true;
            }
            p += 4;
        } else {
            // Stray continuation byte (0x80..0xBF) or a byte that can never
            // start a sequence (0xC0, 0xC1, 0xF5..0xFF).
            return true;
        }
    }

    return false; // reached the terminator without finding a malformed sequence
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment