RickyCook · May 16, 2019 01:53
diff --git a/trim_utf8.py b/trim_utf8.py
 #!/usr/bin/env python3
 import unittest


 # Marker held in the top bits of every UTF-8 continuation byte (10xxxxxx)
 CONTINUATION_HIGH_BITS = 0b10
 # How far to shift an 8-bit value right so only those marker bits remain
 CONTINUATION_SHIFT_RIGHT = 8 - int.bit_length(CONTINUATION_HIGH_BITS)


 def is_continuation(byte):
    """Tell whether ``byte`` carries the UTF-8 continuation marker.

    :param byte: integer value of one byte, as produced by indexing ``bytes``
    :returns: ``True`` when the bits left after shifting right by
        ``CONTINUATION_SHIFT_RIGHT`` equal ``CONTINUATION_HIGH_BITS``
    """
    high_bits = byte >> CONTINUATION_SHIFT_RIGHT
    return high_bits == CONTINUATION_HIGH_BITS


 def trim_utf8(raw, maxlen):
    """ Trims a UTF8 string to a maximum length, presenting a valid UTF8 output
    back (dropping off incomplete multi-byte sequences)

    Only the last character of the trimmed data is inspected: it is kept when
    all of its bytes fit inside ``maxlen`` and dropped otherwise. Characters
    are handled one code point at a time, so a glyph built from several code
    points may still be cut between them.

    :param raw: initial ``bytes`` that you want to trim
    :param maxlen: maximum length of the output, in bytes
    :returns: a prefix of ``raw`` no longer than ``maxlen``; empty when no
        complete character fits
    """
    trimmed = raw[:maxlen]

    # Step back over trailing continuation bytes (exactly one leading 1 bit)
    # to find the byte that starts the final character
    lead_index = len(trimmed) - 1
    while lead_index >= 0 and _leading_one_bits(trimmed[lead_index]) == 1:
        lead_index -= 1

    # Empty input, or nothing but continuation bytes: nothing can be kept
    if lead_index < 0:
        return trimmed[:0]

    # A lead byte announces its sequence length through its leading 1 bits;
    # none at all means a single-byte (ASCII) character
    expected_len = _leading_one_bits(trimmed[lead_index]) or 1
    actual_len = len(trimmed) - lead_index

    # Keep a complete final character, drop an incomplete or malformed one
    if actual_len == expected_len:
        return trimmed
    return trimmed[:lead_index]


 def _leading_one_bits(byte):
    """Count the consecutive 1 bits at the top of an 8-bit value."""
    count = 0
    mask = 0x80
    while mask and byte & mask:
        count += 1
        mask >>= 1
    return count


 class TestIt(unittest.TestCase):
    """Checks ``trim_utf8`` by cutting one byte off the end of encoded strings."""

    def run_test(self, rawstr):
        """Trim the encoded ``rawstr`` by one byte and expect ``rawstr[:-1]``.

        :param rawstr: text whose encoded form is trimmed
        """
        rawbytes = rawstr.encode()
        trimmed = trim_utf8(rawbytes, len(rawbytes) - 1)
        # assertEqual instead of a bare ``assert``: the check still runs
        # under ``python -O`` and a failure shows both values
        self.assertEqual(trimmed.decode(), rawstr[:-1])

    def test_emojis(self):
        self.run_test('😇🤭🤬')

    @unittest.expectedFailure
    def test_multi_codepoint_emoji(self):
        # The rainbow flag is drawn from several code points joined together.
        # We don't handle this case: trimming works per code point, so the
        # output is a partial flag that still decodes without exception
        rawstr = '🏳️‍🌈🏳️‍🌈'
        rawbytes = rawstr.encode()
        expstr = '🏳️‍🌈'
        trimmed = trim_utf8(rawbytes, len(rawbytes) - 1)
        self.assertEqual(trimmed.decode(), expstr)

    # Following byte sequences are from
    # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

    def test_first_possible_4_byte(self):
        # Note this is different to the 6-bytes; it's 4x 1 char
        self.run_test('𐀀𐀀𐀀𐀀')

    # NOTE(review): the two literals below appear to hold replacement
    # characters rather than real 6-byte sequences - confirm against the
    # linked test file

    def test_first_possible_6_byte(self):
        self.run_test('������')

    def test_last_possible_6_byte(self):
        self.run_test('������')


 # Run the unittest suite only when this file is executed as a script
 if '__main__' == __name__:
    unittest.main()
	#!/usr/bin/env python3
	import unittest


	# Marker held in the top bits of every UTF-8 continuation byte (10xxxxxx)
	CONTINUATION_HIGH_BITS = 0b10
	# How far to shift an 8-bit value right so only those marker bits remain
	CONTINUATION_SHIFT_RIGHT = 8 - int.bit_length(CONTINUATION_HIGH_BITS)


	def is_continuation(byte):
	    """Tell whether ``byte`` carries the UTF-8 continuation marker.

	    :param byte: integer value of one byte, as produced by indexing ``bytes``
	    :returns: ``True`` when the bits left after shifting right by
	        ``CONTINUATION_SHIFT_RIGHT`` equal ``CONTINUATION_HIGH_BITS``
	    """
	    high_bits = byte >> CONTINUATION_SHIFT_RIGHT
	    return high_bits == CONTINUATION_HIGH_BITS


	def trim_utf8(raw, maxlen):
	    """ Trims a UTF8 string to a maximum length, presenting a valid UTF8 output
	    back (dropping off incomplete multi-byte sequences)

	    Only the last character of the trimmed data is inspected: it is kept when
	    all of its bytes fit inside ``maxlen`` and dropped otherwise. Characters
	    are handled one code point at a time, so a glyph built from several code
	    points may still be cut between them.

	    :param raw: initial ``bytes`` that you want to trim
	    :param maxlen: maximum length of the output, in bytes
	    :returns: a prefix of ``raw`` no longer than ``maxlen``; empty when no
	        complete character fits
	    """
	    trimmed = raw[:maxlen]

	    # Step back over trailing continuation bytes (exactly one leading 1 bit)
	    # to find the byte that starts the final character
	    lead_index = len(trimmed) - 1
	    while lead_index >= 0 and _leading_one_bits(trimmed[lead_index]) == 1:
	        lead_index -= 1

	    # Empty input, or nothing but continuation bytes: nothing can be kept
	    if lead_index < 0:
	        return trimmed[:0]

	    # A lead byte announces its sequence length through its leading 1 bits;
	    # none at all means a single-byte (ASCII) character
	    expected_len = _leading_one_bits(trimmed[lead_index]) or 1
	    actual_len = len(trimmed) - lead_index

	    # Keep a complete final character, drop an incomplete or malformed one
	    if actual_len == expected_len:
	        return trimmed
	    return trimmed[:lead_index]


	def _leading_one_bits(byte):
	    """Count the consecutive 1 bits at the top of an 8-bit value."""
	    count = 0
	    mask = 0x80
	    while mask and byte & mask:
	        count += 1
	        mask >>= 1
	    return count


	class TestIt(unittest.TestCase):
	    """Checks ``trim_utf8`` by cutting one byte off the end of encoded strings."""

	    def run_test(self, rawstr):
	        """Trim the encoded ``rawstr`` by one byte and expect ``rawstr[:-1]``.

	        :param rawstr: text whose encoded form is trimmed
	        """
	        rawbytes = rawstr.encode()
	        trimmed = trim_utf8(rawbytes, len(rawbytes) - 1)
	        # assertEqual instead of a bare ``assert``: the check still runs
	        # under ``python -O`` and a failure shows both values
	        self.assertEqual(trimmed.decode(), rawstr[:-1])

	    def test_emojis(self):
	        self.run_test('😇🤭🤬')

	    @unittest.expectedFailure
	    def test_multi_codepoint_emoji(self):
	        # The rainbow flag is drawn from several code points joined together.
	        # We don't handle this case: trimming works per code point, so the
	        # output is a partial flag that still decodes without exception
	        rawstr = '🏳️‍🌈🏳️‍🌈'
	        rawbytes = rawstr.encode()
	        expstr = '🏳️‍🌈'
	        trimmed = trim_utf8(rawbytes, len(rawbytes) - 1)
	        self.assertEqual(trimmed.decode(), expstr)

	    # Following byte sequences are from
	    # https://www.cl.cam.ac.uk/~mgk25/ucs/examples/UTF-8-test.txt

	    def test_first_possible_4_byte(self):
	        # Note this is different to the 6-bytes; it's 4x 1 char
	        self.run_test('𐀀𐀀𐀀𐀀')

	    # NOTE(review): the two literals below appear to hold replacement
	    # characters rather than real 6-byte sequences - confirm against the
	    # linked test file

	    def test_first_possible_6_byte(self):
	        self.run_test('��')

	    def test_last_possible_6_byte(self):
	        self.run_test('��')


	# Run the unittest suite only when this file is executed as a script
	if '__main__' == __name__:
	    unittest.main()