Last active
May 16, 2019 01:53
-
-
Save RickyCook/b569e62f0827b2ccbab448669f00e689 to your computer and use it in GitHub Desktop.
Trim valid UTF8 bytes to a given max length, ensuring valid UTF8 afterwards
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import unittest | |
# Every UTF-8 continuation byte has the bit pattern 0b10xxxxxx.
CONTINUATION_HIGH_BITS = 0b10
# Right shift that leaves only the top two bits of an 8-bit value.
CONTINUATION_SHIFT_RIGHT = 8 - CONTINUATION_HIGH_BITS.bit_length()


def is_continuation(byte):
    """ Tells whether a byte is a UTF8 continuation byte (``0b10xxxxxx``)

    :param byte: a single byte value as an ``int`` (what indexing ``bytes``
                 yields)
    :returns: ``True`` if the top two bits of ``byte`` are ``10``
    """
    return byte >> CONTINUATION_SHIFT_RIGHT == CONTINUATION_HIGH_BITS


def trim_utf8(raw, maxlen):
    """ Trims a UTF8 string to a maximum length, presenting a valid UTF8 output
    back (dropping off incomplete multi-byte sequences)

    Only a sequence that was actually split by the cut is removed; input that
    already fits, or a cut that falls exactly between two characters, is
    returned without losing a character. Empty input and ``maxlen == 0``
    return empty bytes.

    :param raw: initial ``bytes`` that you want to trim (assumed to be valid
                UTF8)
    :param maxlen: maximum length of the output
    :returns: a prefix of ``raw`` that is at most ``maxlen`` bytes long and
              ends on a character boundary
    """
    trimmed = raw[:maxlen]
    cut = len(trimmed)
    if cut == len(raw):
        # Nothing was cut off, so no sequence can have been split
        return trimmed

    # raw[cut] is the first byte that was dropped. If it is a continuation
    # byte, the cut fell inside a multi-byte sequence: walk back until the
    # first dropped byte is the start of a character, so that the sequence's
    # lead byte is dropped together with its continuations.
    while cut > 0 and is_continuation(raw[cut]):
        cut -= 1
    return raw[:cut]
class TestIt(unittest.TestCase):
    """ Checks that ``trim_utf8`` drops a whole character when the byte limit
    cuts into its encoding
    """

    def run_test(self, rawstr):
        """ Encodes ``rawstr``, trims the encoding by one byte and expects the
        result to decode to ``rawstr`` without its last character

        :param rawstr: ``str`` whose last character encodes to more than one
                       byte
        """
        rawbytes = rawstr.encode()
        # assertEqual (rather than a bare assert) keeps the check alive under
        # ``python -O`` and reports both values on failure
        self.assertEqual(
            trim_utf8(rawbytes, len(rawbytes) - 1).decode(),
            rawstr[:-1],
        )

    def test_emojis(self):
        self.run_test('😇🤭🤬')

    @unittest.expectedFailure
    def test_multi_codepoint_emoji(self):
        # Rainbow flag is a white flag followed by extra modifier code points
        # We don't handle this case; it decodes without exception
        rawstr = '🏳️🌈🏳️🌈'
        rawbytes = rawstr.encode()
        expstr = '🏳️🌈'
        self.assertEqual(
            trim_utf8(rawbytes, len(rawbytes) - 1).decode(),
            expstr,
        )

    # Following byte sequences are from
    # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    def test_first_possible_4_byte(self):
        # Note this is different to the 6-bytes; it's 4x 1 char
        self.run_test('𐀀𐀀𐀀𐀀')

    def test_first_possible_6_byte(self):
        # NOTE(review): the literal below appears to be six U+FFFD replacement
        # characters rather than a real 6-byte sequence - confirm intent
        self.run_test('������')

    def test_last_possible_6_byte(self):
        # NOTE(review): same literal as test_first_possible_6_byte - confirm
        self.run_test('������')
# Run the test suite when this file is executed as a script
if __name__ == '__main__':
    unittest.main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment