eao197 · June 19, 2024 13:23
diff --git a/utf8_checker_speed.cpp b/utf8_checker_speed.cpp
 #include <algorithm>
 #include <array>
 #include <chrono>
 #include <iostream>
 #include <span>
 #include <string>
 #include <string_view>
 #include <cstdint>

 namespace restinio
 {

 namespace utils
 {

 //
 // utf8_checker_t
 //

 /*!
  * @brief Incremental validator and decoder of a UTF-8 byte stream.
  *
  * Bytes are handed over one at a time through process_byte(). The object
  * remembers how far it is inside a multi-byte sequence and the code point
  * assembled so far. Overlong encodings, the range 0xD800..0xDFFF and
  * values above 0x10FFFF are rejected.
  */
 class utf8_checker_t
 {
 	//! Checker states; every name tells which byte is expected next.
 	enum class state_t
 	{
 		wait_first_byte,
 		wait_second_of_two,
 		wait_second_of_three,
 		wait_second_of_four,
 		wait_third_of_three,
 		wait_third_of_four,
 		wait_fourth_of_four,
 		invalid,
 	};

 	//! Code point that is being (or has just been) assembled.
 	/*!
 	 * The value is meaningful only when at least one byte has been
 	 * accepted by process_byte() and the checker is back in the
 	 * wait_first_byte state.
 	 */
 	std::uint32_t m_current_symbol = 0u;

 	//! What the checker expects to see next.
 	state_t m_state{ state_t::wait_first_byte };

 	//! Does the byte have the continuation form 0b10xxxxxx?
 	[[nodiscard]]
 	static constexpr bool
 	is_continuation( std::uint8_t byte ) noexcept
 	{
 		return (byte & 0xC0u) == 0x80u;
 	}

 	//! Turns the result of the final range check into the next state.
 	[[nodiscard]]
 	static constexpr state_t
 	sequence_end( bool in_range ) noexcept
 	{
 		return in_range ? state_t::wait_first_byte : state_t::invalid;
 	}

 	//! Handles a byte that opens a new sequence.
 	void
 	begin_sequence( std::uint8_t lead ) noexcept
 	{
 		if( lead <= 0x7Fu )
 		{
 			// 0xxxxxxx: a complete one-byte symbol.
 			m_current_symbol = lead;
 			m_state = state_t::wait_first_byte;
 			return;
 		}

 		if( (lead & 0xE0u) == 0xC0u )
 		{
 			// 110xxxxx: one more byte has to follow.
 			m_current_symbol = (lead & 0x1Fu);
 			m_state = state_t::wait_second_of_two;
 			return;
 		}

 		if( (lead & 0xF0u) == 0xE0u )
 		{
 			// 1110xxxx: two more bytes have to follow.
 			m_current_symbol = (lead & 0x0Fu);
 			m_state = state_t::wait_second_of_three;
 			return;
 		}

 		if( (lead & 0xF8u) == 0xF0u )
 		{
 			// 11110xxx: three more bytes have to follow.
 			m_current_symbol = (lead & 0x07u);
 			m_state = state_t::wait_second_of_four;
 			return;
 		}

 		// UTF-8 covers only 0x0000..0x10FFFF (RFC 3629), so longer forms
 		// such as 0b111110xx are never valid and need no separate check.
 		// A stray continuation byte ends up here too. The stored symbol
 		// is left untouched.
 		m_state = state_t::invalid;
 	}

 	//! Handles the second, third or fourth byte of a sequence.
 	void
 	continue_sequence( std::uint8_t byte ) noexcept
 	{
 		if( !is_continuation( byte ) )
 		{
 			// The stored symbol is left untouched in this case.
 			m_state = state_t::invalid;
 			return;
 		}

 		// Every continuation byte contributes six payload bits.
 		m_current_symbol = (m_current_symbol << 6) | (byte & 0x3Fu);

 		switch( m_state )
 		{
 			case state_t::wait_second_of_two:
 				// Values below 0x0080 are overlong in the two-byte form.
 				// Two bytes can't reach 0xD800 or 0x110000, so no other
 				// check is necessary.
 				m_state = sequence_end( m_current_symbol >= 0x0080u );
 			break;

 			case state_t::wait_second_of_three:
 				m_state = state_t::wait_third_of_three;
 			break;

 			case state_t::wait_second_of_four:
 				m_state = state_t::wait_third_of_four;
 			break;

 			case state_t::wait_third_of_three:
 				// Values below 0x0800 are overlong in the three-byte form,
 				// and the points 0xD800..0xDFFF are illegal.
 				m_state = sequence_end(
 						m_current_symbol >= 0x0800u &&
 						( m_current_symbol < 0xD800u ||
 							m_current_symbol > 0xDFFFu ) );
 			break;

 			case state_t::wait_third_of_four:
 				m_state = state_t::wait_fourth_of_four;
 			break;

 			case state_t::wait_fourth_of_four:
 				// The four-byte form is valid for 0x10000..0x10FFFF only.
 				// The lower bound also excludes 0xD800..0xDFFF.
 				m_state = sequence_end(
 						m_current_symbol >= 0x10000u &&
 						m_current_symbol < 0x110000u );
 			break;

 			case state_t::wait_first_byte:
 			case state_t::invalid:
 				// process_byte() never routes these states here.
 			break;
 		}
 	}

 public:
 	utf8_checker_t() = default;

 	/*!
 	 * Feeds the next byte of the stream into the checker.
 	 *
 	 * @note
 	 * The decoded symbol is available only when process_byte() returned
 	 * `true` and a following call to finalized() returns `true` as well:
 	 *
 	 * @code
 	 * utf8_checker_t checker;
 	 * for( const auto ch : some_string )
 	 * {
 	 * 	if( checker.process_byte( ch ) )
 	 * 	{
 	 * 		if( checker.finalized() )
 	 * 			process_unicode_symbol( checker.current_symbol() );
 	 * 	}
 	 * 	else
 	 * 	{
 	 * 		... // Invalid sequence found!
 	 * 		break;
 	 * 	}
 	 * }
 	 * @endcode
 	 *
 	 * @retval true the sequence is still valid and the next byte can be
 	 * passed to the next call to process_byte().
 	 *
 	 * @retval false the sequence is invalid and there is no sense in
 	 * calling process_byte() again (it keeps returning false until
 	 * reset() is called).
 	 */
 	[[nodiscard]]
 	bool
 	process_byte( std::uint8_t byte ) noexcept
 	{
 		switch( m_state )
 		{
 			case state_t::wait_first_byte:
 				begin_sequence( byte );
 			break;

 			case state_t::invalid:
 				// Once invalid, always invalid: the byte is ignored.
 			break;

 			default:
 				continue_sequence( byte );
 			break;
 		}

 		return m_state != state_t::invalid;
 	}

 	/*!
 	 * @return true if the checker is not in the middle of a multi-byte
 	 * sequence (and is not in the invalid state).
 	 */
 	[[nodiscard]]
 	bool
 	finalized() const noexcept
 	{
 		return m_state == state_t::wait_first_byte;
 	}

 	/*!
 	 * Brings the object back to its initial state.
 	 */
 	void
 	reset() noexcept
 	{
 		m_state = state_t::wait_first_byte;
 		m_current_symbol = 0u;
 	}

 	/*!
 	 * Get the value collected for the current symbol.
 	 *
 	 * @note
 	 * The value is the actual symbol only if:
 	 *
 	 * - some bytes were successfully fed into process_byte();
 	 * - finalized() returns `true`.
 	 */
 	[[nodiscard]]
 	std::uint32_t
 	current_symbol() const noexcept { return m_current_symbol; }
 };

 } /* namespace utils */

 } /* namespace restinio */

 namespace decode_2009
 {

 // Values of the automaton state that callers have to recognise:
 // utf8_accept means that a complete symbol has just been decoded,
 // utf8_reject means that the input is not valid.
 constexpr std::uint32_t utf8_accept = 0;
 constexpr std::uint32_t utf8_reject = 1;

 // Combined lookup table used by decode().
 //
 // Entries 0..255 give the character class of an input byte. The entries
 // after them hold the transitions: 16 entries per state, indexed by the
 // character class.
 static const std::uint8_t utf8d[] = {
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
 	0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
 	7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
 	8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
 	0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
 	0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
 	0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
 	1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
 	1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
 	1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
 	1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
 };

 // Feeds one byte into the automaton.
 //
 // state - in/out: the current state; has to be utf8_accept before the
 //         first byte of the input.
 // codep - in/out: the code point under construction; it holds a complete
 //         symbol whenever the function returns utf8_accept.
 // byte  - the input byte; it indexes the table directly, so the value
 //         has to be in the range 0..255.
 //
 // Returns the new state (the same value that is stored into *state).
 inline std::uint32_t
 decode( std::uint32_t * state, std::uint32_t * codep, std::uint32_t byte )
 {
 	const std::uint32_t char_class = utf8d[ byte ];
 	const std::uint32_t previous = *state;

 	if( utf8_accept == previous )
 	{
 		// First byte of a symbol: the class doubles as the number of
 		// high bits that have to be masked off.
 		*codep = byte & (0xffu >> char_class);
 	}
 	else
 	{
 		// Continuation: append the six payload bits.
 		*codep = (*codep << 6) | (byte & 0x3fu);
 	}

 	const std::uint32_t next = utf8d[ 256u + previous * 16u + char_class ];
 	*state = next;
 	return next;
 }

 } /* namespace decode_2009 */

 namespace decode_2010
 {

 // Values of the automaton state that callers have to recognise:
 // utf8_accept means that a complete symbol has just been decoded,
 // utf8_reject means that the input is not valid.
 constexpr std::uint32_t utf8_accept = 0;
 constexpr std::uint32_t utf8_reject = 12;

 // Combined lookup table used by decode().
 static const std::uint8_t utf8d[] = {
 	// Part one (entries 0..255): the character class of every input byte.
 	// The classes keep the transition table small and also serve as
 	// shift counts for the first-byte bit mask.
 	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
 	 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
 	 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
 	 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
 	10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

 	// Part two: the transition table. A state is stored as the offset of
 	// its row (a multiple of 12), so "state + class" selects the entry.
 	 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
 	12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
 	12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
 	12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
 	12,36,12,12,12,12,12,12,12,12,12,12,
 };

 // Feeds one byte into the automaton.
 //
 // state - in/out: the current state; has to be utf8_accept before the
 //         first byte of the input.
 // codep - in/out: the code point under construction; it holds a complete
 //         symbol whenever the function returns utf8_accept.
 // byte  - the input byte; it indexes the table directly, so the value
 //         has to be in the range 0..255.
 //
 // Returns the new state (the same value that is stored into *state).
 inline std::uint32_t
 decode( std::uint32_t * state, std::uint32_t * codep, std::uint32_t byte )
 {
 	const std::uint32_t char_class = utf8d[ byte ];
 	const std::uint32_t previous = *state;

 	if( utf8_accept == previous )
 	{
 		// First byte of a symbol: the class doubles as the number of
 		// high bits that have to be masked off.
 		*codep = byte & (0xffu >> char_class);
 	}
 	else
 	{
 		// Continuation: append the six payload bits.
 		*codep = (*codep << 6) | (byte & 0x3fu);
 	}

 	const std::uint32_t next = utf8d[ 256u + previous + char_class ];
 	*state = next;
 	return next;
 }

 } /* namespace decode_2010 */

 // Validates `str` as UTF-8 with restinio::utils::utf8_checker_t.
 //
 // str - the bytes to check.
 // out - accumulator: the value of every completely decoded symbol is
 //       added to it (gives the benchmark an observable result).
 //
 // Returns true only if every byte was accepted AND the input does not
 // stop in the middle of a multi-byte sequence.
 bool
 check_validity_with_restinio(std::string_view str, std::uint32_t & out)
 {
 	restinio::utils::utf8_checker_t checker;
 	for( const char ch : str )
 	{
 		// char may be signed; hand the raw byte value over explicitly.
 		if( !checker.process_byte( static_cast<std::uint8_t>( ch ) ) )
 			return false;

 		if( checker.finalized() )
 			out += checker.current_symbol();
 	}

 	// A truncated trailing sequence leaves the checker waiting for more
 	// bytes; such input is not valid UTF-8.
 	return checker.finalized();
 }

 // Validates `str` as UTF-8 with decode_2009::decode().
 //
 // str - the bytes to check.
 // out - accumulator: the value of every completely decoded symbol is
 //       added to it (gives the benchmark an observable result).
 //
 // Returns true only if no byte was rejected AND the input does not stop
 // in the middle of a multi-byte sequence.
 bool
 check_validity_with_decode_2009(std::string_view str, std::uint32_t & out)
 {
 	std::uint32_t state = decode_2009::utf8_accept;
 	std::uint32_t code_point = 0u;
 	for( const char ch : str )
 	{
 		// Go through unsigned char so that bytes >= 0x80 don't turn into
 		// huge table indexes on platforms where char is signed.
 		const auto byte = static_cast<std::uint32_t>(
 				static_cast<unsigned char>( ch ) );

 		const std::uint32_t next =
 				decode_2009::decode( &state, &code_point, byte );
 		if( decode_2009::utf8_reject == next )
 			return false;
 		if( decode_2009::utf8_accept == next )
 			out += code_point;
 	}

 	// Any state other than "accept" here means a truncated sequence.
 	return decode_2009::utf8_accept == state;
 }

 // Validates `str` as UTF-8 with decode_2010::decode().
 //
 // str - the bytes to check.
 // out - accumulator: the value of every completely decoded symbol is
 //       added to it (gives the benchmark an observable result).
 //
 // Returns true only if no byte was rejected AND the input does not stop
 // in the middle of a multi-byte sequence.
 bool
 check_validity_with_decode_2010(std::string_view str, std::uint32_t & out)
 {
 	std::uint32_t state = decode_2010::utf8_accept;
 	std::uint32_t code_point = 0u;
 	for( const char ch : str )
 	{
 		// Go through unsigned char so that bytes >= 0x80 don't turn into
 		// huge table indexes on platforms where char is signed.
 		const auto byte = static_cast<std::uint32_t>(
 				static_cast<unsigned char>( ch ) );

 		const std::uint32_t next =
 				decode_2010::decode( &state, &code_point, byte );
 		if( decode_2010::utf8_reject == next )
 			return false;
 		if( decode_2010::utf8_accept == next )
 			out += code_point;
 	}

 	// Any state other than "accept" here means a truncated sequence.
 	return decode_2010::utf8_accept == state;
 }

 // Calls checker( str, out ) `loops` times, resetting `out` to zero before
 // every call.
 //
 // Returns the boolean result and the `out` value of the last call, or
 // { false, 0 } when `loops` is zero.
 template< typename Checker >
 std::pair< bool, std::uint32_t >
 checking_loop(
 	Checker && checker,
 	unsigned int loops,
 	std::string_view str )
 {
 	std::pair< bool, std::uint32_t > last_run{ false, 0u };

 	while( loops != 0u )
 	{
 		--loops;
 		// Each pass starts from a clean accumulator.
 		last_run.second = 0u;
 		last_run.first = checker( str, last_run.second );
 	}

 	return last_run;
 }

 // Scope timer: remembers the moment of construction and, on destruction,
 // prints "*** <name>: <elapsed>us *** " to std::cout.
 class duration_meter
 {
 	using clock_type = std::chrono::high_resolution_clock;

 	// Label printed in front of the measured time. The pointer is stored
 	// as is, so the string has to outlive the meter.
 	const char * m_label;
 	// Time point taken in the constructor.
 	const clock_type::time_point m_begin;

 public:
 	duration_meter( const char * name )
 		: m_label{ name }
 		, m_begin{ clock_type::now() }
 	{}

 	~duration_meter()
 	{
 		// Take the end time first so that printing isn't measured.
 		const auto elapsed = clock_type::now() - m_begin;
 		const auto micros = std::chrono::duration_cast<
 				std::chrono::microseconds >( elapsed ).count();

 		std::cout << "*** " << m_label << ": " << micros
 			<< "us *** " << std::endl;
 	}
 };

 template<typename Lambda>
 decltype(auto)
 measure( const char * name, Lambda && lambda )
 {
 	duration_meter meter{ name };
 	return lambda();
 }

 // Benchmark driver: runs all three checkers over the same sample text,
 // prints the time taken by each of them and then their results.
 int main()
 {
 	// How many times every checker walks over the sample text.
 	constexpr unsigned int repetitions = 100'000u;

 	// Sample input: mostly Cyrillic text mixed with ASCII and two emoji.
 	const std::string_view text{
 		"В последний раз статья, целиком посвященная открытому проекту RESTinio, вышла "
 		"на Хабре в декабре 2020-го года, без малого три года назад. Это был рассказ "
 		"о релизе версии 0.6.13. По сути, это был последний релиз, в котором в "
 		"RESTinio появилось что-то новое и важное. Потом были только небольшие "
 		"корректирующие релизы, исправляющие ошибки или адаптирующие RESTinio к "
 		"свежим версиям зависимостей. "
 		" "
 		"И вот спустя три года нам удалось выпустить новое существенное обновление. А "
 		"посему есть повод поговорить о том, что было удалено/добавлено/изменено в этой "
 		"версии. Ну и о причинах паузы в развитии и перспективах проекта вообще. "
 		" "
 		"Кому интересно, милости прошу под кат. "
 		" "
 		"Для тех же, кто про данную разработку слышит в первый раз: это наша попытка "
 		"сделать встраиваемый в C++ приложения HTTP(S)/WebSocket сервер, который бы "
 		"обладал и большой гибкостью, и нормальной производительностью, освобождал бы "
 		"пользователя от рутины, но не прятал бы абсолютно все детали 'под капот', и "
 		"удовлетворял бы нашим представлениям о том, как подобные вещи должны "
 		"выглядеть... "
 		" "
 		"Вроде бы получилось. Мне кажется, что раз уж RESTinio сумел набрать тысячу "
 		"звезд на GitHub, результат понравился и пригодился не только нам. Впрочем, это "
 		"уже совсем другая история. Давайте вернемся к рассказу об изменениях в версии "
 		"0.7.0 и к тому, почему этих изменений пришлось ждать так долго... "
 		" "
 		"Что нового в 0.7.0 "
 		"Переход на C++17 "
 		"В версии 0.7.0 мы перешли с C++14 на C++17. Вероятно, это не самое лучшее из "
 		"наших решений, ведь кто-то все еще вынужден оставаться на C++14 не имея "
 		"возможности обновиться до C++17, однако мы для себя больше не видели смысла "
 		"держаться за C++14. "
 		" "
 		"Выгода от перехода на C++17 заключалась прежде всего в том, что удалось "
 		"избавиться от таких зависимостей, как optional-lite, string_view-lite и "
 		"variant-lite, т.к. теперь это все доступно в стандартной библиотеке. Так что "
 		"остается сказать большое спасибо Martin Moene за его труд по написанию и "
 		"сопровождению этих библиотек, они нам здорово помогали в течении шести лет, но "
 		"дальше мы пойдем с stdlib 🙂 "
 		" "
 		"Хотя осталась зависимость от expected-lite, но с ней придется жить еще долго. "
 		"Если уж мы на 17-ые плюсы перебрались только в 2023-ем, то перехода на C++23 "
 		"нужно будет подождать еще лет пять-шесть, а то и девять-десять 😆 "
 		" "
 		"Выгода от 17-го стандарта проявилась еще и в том, что в ряде мест мы смогли "
 		"выбросить сложные (и не очень) шаблонные конструкции в пользу простых if "
 		"constexpr и fold expressions. "
 		" "
 		"Так что дальше пойдем уже в рамках C++17. Если кого-то это расстраивает, то уж "
 		"простите за откровенность, но за поддержку C++14 нам никто не платит. "
 		" "
 		"Переход на llhttp, Catch2 v3 и modern CMake "
 		"Изначально RESTinio использовал nodejs/http-parser в качестве парсера "
 		"HTTP-запросов. Но несколько лет назад его развитие и поддержка прекратились. "
 		"Посему в версии 0.7.0 мы переехали на nodejs/llhttp. Собственно, этот переезд и "
 		"был главной мотивацией для выпуска версии 0.7.0. "
 		" "
 		"Заодно мы обновили у себя Catch2. Эта библиотека начиная с версии 3.0 уже не "
 		"является header-only и требует компиляции. "
 	};

 	// Each measure() call prints its own timing line as soon as it ends.
 	const auto restinio_result = measure( "   restinio", [&]() {
 			return checking_loop(
 					check_validity_with_restinio, repetitions, text );
 		} );

 	const auto decode_2009_result = measure( "decode_2009", [&]() {
 			return checking_loop(
 					check_validity_with_decode_2009, repetitions, text );
 		} );

 	const auto decode_2010_result = measure( "decode_2010", [&]() {
 			return checking_loop(
 					check_validity_with_decode_2010, repetitions, text );
 		} );

 	// Prints "<validity> <checksum>" for one checker. Printing the values
 	// also keeps the measured work observable.
 	const auto print_result =
 		[]( const std::pair< bool, std::uint32_t > & outcome ) {
 			std::cout << outcome.first << " " << outcome.second << std::endl;
 		};

 	print_result( restinio_result );
 	print_result( decode_2009_result );
 	print_result( decode_2010_result );
 }
	#include <algorithm>
	#include <array>
	#include <chrono>
	#include <iostream>
	#include <span>
	#include <string>
	#include <string_view>
	#include <cstdint>

	namespace restinio
	{

	namespace utils
	{

	//
	// utf8_checker_t
	//

	/*!
	 * @brief Incremental validator and decoder of a UTF-8 byte stream.
	 *
	 * Bytes are handed over one at a time through process_byte(). The object
	 * remembers how far it is inside a multi-byte sequence and the code point
	 * assembled so far. Overlong encodings, the range 0xD800..0xDFFF and
	 * values above 0x10FFFF are rejected.
	 */
	class utf8_checker_t
	{
		//! Checker states; every name tells which byte is expected next.
		enum class state_t
		{
			wait_first_byte,
			wait_second_of_two,
			wait_second_of_three,
			wait_second_of_four,
			wait_third_of_three,
			wait_third_of_four,
			wait_fourth_of_four,
			invalid,
		};

		//! Code point that is being (or has just been) assembled.
		/*!
		 * The value is meaningful only when at least one byte has been
		 * accepted by process_byte() and the checker is back in the
		 * wait_first_byte state.
		 */
		std::uint32_t m_current_symbol = 0u;

		//! What the checker expects to see next.
		state_t m_state{ state_t::wait_first_byte };

		//! Does the byte have the continuation form 0b10xxxxxx?
		[[nodiscard]]
		static constexpr bool
		is_continuation( std::uint8_t byte ) noexcept
		{
			return (byte & 0xC0u) == 0x80u;
		}

		//! Turns the result of the final range check into the next state.
		[[nodiscard]]
		static constexpr state_t
		sequence_end( bool in_range ) noexcept
		{
			return in_range ? state_t::wait_first_byte : state_t::invalid;
		}

		//! Handles a byte that opens a new sequence.
		void
		begin_sequence( std::uint8_t lead ) noexcept
		{
			if( lead <= 0x7Fu )
			{
				// 0xxxxxxx: a complete one-byte symbol.
				m_current_symbol = lead;
				m_state = state_t::wait_first_byte;
				return;
			}

			if( (lead & 0xE0u) == 0xC0u )
			{
				// 110xxxxx: one more byte has to follow.
				m_current_symbol = (lead & 0x1Fu);
				m_state = state_t::wait_second_of_two;
				return;
			}

			if( (lead & 0xF0u) == 0xE0u )
			{
				// 1110xxxx: two more bytes have to follow.
				m_current_symbol = (lead & 0x0Fu);
				m_state = state_t::wait_second_of_three;
				return;
			}

			if( (lead & 0xF8u) == 0xF0u )
			{
				// 11110xxx: three more bytes have to follow.
				m_current_symbol = (lead & 0x07u);
				m_state = state_t::wait_second_of_four;
				return;
			}

			// UTF-8 covers only 0x0000..0x10FFFF (RFC 3629), so longer forms
			// such as 0b111110xx are never valid and need no separate check.
			// A stray continuation byte ends up here too. The stored symbol
			// is left untouched.
			m_state = state_t::invalid;
		}

		//! Handles the second, third or fourth byte of a sequence.
		void
		continue_sequence( std::uint8_t byte ) noexcept
		{
			if( !is_continuation( byte ) )
			{
				// The stored symbol is left untouched in this case.
				m_state = state_t::invalid;
				return;
			}

			// Every continuation byte contributes six payload bits.
			m_current_symbol = (m_current_symbol << 6) | (byte & 0x3Fu);

			switch( m_state )
			{
				case state_t::wait_second_of_two:
					// Values below 0x0080 are overlong in the two-byte form.
					// Two bytes can't reach 0xD800 or 0x110000, so no other
					// check is necessary.
					m_state = sequence_end( m_current_symbol >= 0x0080u );
				break;

				case state_t::wait_second_of_three:
					m_state = state_t::wait_third_of_three;
				break;

				case state_t::wait_second_of_four:
					m_state = state_t::wait_third_of_four;
				break;

				case state_t::wait_third_of_three:
					// Values below 0x0800 are overlong in the three-byte form,
					// and the points 0xD800..0xDFFF are illegal.
					m_state = sequence_end(
							m_current_symbol >= 0x0800u &&
							( m_current_symbol < 0xD800u ||
								m_current_symbol > 0xDFFFu ) );
				break;

				case state_t::wait_third_of_four:
					m_state = state_t::wait_fourth_of_four;
				break;

				case state_t::wait_fourth_of_four:
					// The four-byte form is valid for 0x10000..0x10FFFF only.
					// The lower bound also excludes 0xD800..0xDFFF.
					m_state = sequence_end(
							m_current_symbol >= 0x10000u &&
							m_current_symbol < 0x110000u );
				break;

				case state_t::wait_first_byte:
				case state_t::invalid:
					// process_byte() never routes these states here.
				break;
			}
		}

	public:
		utf8_checker_t() = default;

		/*!
		 * Feeds the next byte of the stream into the checker.
		 *
		 * @note
		 * The decoded symbol is available only when process_byte() returned
		 * `true` and a following call to finalized() returns `true` as well:
		 *
		 * @code
		 * utf8_checker_t checker;
		 * for( const auto ch : some_string )
		 * {
		 * 	if( checker.process_byte( ch ) )
		 * 	{
		 * 		if( checker.finalized() )
		 * 			process_unicode_symbol( checker.current_symbol() );
		 * 	}
		 * 	else
		 * 	{
		 * 		... // Invalid sequence found!
		 * 		break;
		 * 	}
		 * }
		 * @endcode
		 *
		 * @retval true the sequence is still valid and the next byte can be
		 * passed to the next call to process_byte().
		 *
		 * @retval false the sequence is invalid and there is no sense in
		 * calling process_byte() again (it keeps returning false until
		 * reset() is called).
		 */
		[[nodiscard]]
		bool
		process_byte( std::uint8_t byte ) noexcept
		{
			switch( m_state )
			{
				case state_t::wait_first_byte:
					begin_sequence( byte );
				break;

				case state_t::invalid:
					// Once invalid, always invalid: the byte is ignored.
				break;

				default:
					continue_sequence( byte );
				break;
			}

			return m_state != state_t::invalid;
		}

		/*!
		 * @return true if the checker is not in the middle of a multi-byte
		 * sequence (and is not in the invalid state).
		 */
		[[nodiscard]]
		bool
		finalized() const noexcept
		{
			return m_state == state_t::wait_first_byte;
		}

		/*!
		 * Brings the object back to its initial state.
		 */
		void
		reset() noexcept
		{
			m_state = state_t::wait_first_byte;
			m_current_symbol = 0u;
		}

		/*!
		 * Get the value collected for the current symbol.
		 *
		 * @note
		 * The value is the actual symbol only if:
		 *
		 * - some bytes were successfully fed into process_byte();
		 * - finalized() returns `true`.
		 */
		[[nodiscard]]
		std::uint32_t
		current_symbol() const noexcept { return m_current_symbol; }
	};

	} /* namespace utils */

	} /* namespace restinio */

	namespace decode_2009
	{

	// Values of the automaton state that callers have to recognise:
	// utf8_accept means that a complete symbol has just been decoded,
	// utf8_reject means that the input is not valid.
	constexpr std::uint32_t utf8_accept = 0;
	constexpr std::uint32_t utf8_reject = 1;

	// Combined lookup table used by decode().
	//
	// Entries 0..255 give the character class of an input byte. The entries
	// after them hold the transitions: 16 entries per state, indexed by the
	// character class.
	static const std::uint8_t utf8d[] = {
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 00..1f
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 20..3f
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 40..5f
		0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, // 60..7f
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9, // 80..9f
		7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, // a0..bf
		8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2, // c0..df
		0xa,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x3,0x4,0x3,0x3, // e0..ef
		0xb,0x6,0x6,0x6,0x5,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8,0x8, // f0..ff
		0x0,0x1,0x2,0x3,0x5,0x8,0x7,0x1,0x1,0x1,0x4,0x6,0x1,0x1,0x1,0x1, // s0..s0
		1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,0,1,1,1,1,1,0,1,0,1,1,1,1,1,1, // s1..s2
		1,2,1,1,1,1,1,2,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1, // s3..s4
		1,2,1,1,1,1,1,1,1,2,1,1,1,1,1,1,1,1,1,1,1,1,1,3,1,3,1,1,1,1,1,1, // s5..s6
		1,3,1,1,1,1,1,3,1,3,1,1,1,1,1,1,1,3,1,1,1,1,1,1,1,1,1,1,1,1,1,1, // s7..s8
	};

	// Feeds one byte into the automaton.
	//
	// state - in/out: the current state; has to be utf8_accept before the
	//         first byte of the input.
	// codep - in/out: the code point under construction; it holds a complete
	//         symbol whenever the function returns utf8_accept.
	// byte  - the input byte; it indexes the table directly, so the value
	//         has to be in the range 0..255.
	//
	// Returns the new state (the same value that is stored into *state).
	inline std::uint32_t
	decode( std::uint32_t * state, std::uint32_t * codep, std::uint32_t byte )
	{
		const std::uint32_t char_class = utf8d[ byte ];
		const std::uint32_t previous = *state;

		if( utf8_accept == previous )
		{
			// First byte of a symbol: the class doubles as the number of
			// high bits that have to be masked off.
			*codep = byte & (0xffu >> char_class);
		}
		else
		{
			// Continuation: append the six payload bits.
			*codep = (*codep << 6) | (byte & 0x3fu);
		}

		const std::uint32_t next = utf8d[ 256u + previous * 16u + char_class ];
		*state = next;
		return next;
	}

	} /* namespace decode_2009 */

	namespace decode_2010
	{

	// Values of the automaton state that callers have to recognise:
	// utf8_accept means that a complete symbol has just been decoded,
	// utf8_reject means that the input is not valid.
	constexpr std::uint32_t utf8_accept = 0;
	constexpr std::uint32_t utf8_reject = 12;

	// Combined lookup table used by decode().
	static const std::uint8_t utf8d[] = {
		// Part one (entries 0..255): the character class of every input byte.
		// The classes keep the transition table small and also serve as
		// shift counts for the first-byte bit mask.
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
		 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
		 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
		 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
		10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,

		// Part two: the transition table. A state is stored as the offset of
		// its row (a multiple of 12), so "state + class" selects the entry.
		 0,12,24,36,60,96,84,12,12,12,48,72, 12,12,12,12,12,12,12,12,12,12,12,12,
		12, 0,12,12,12,12,12, 0,12, 0,12,12, 12,24,12,12,12,12,12,24,12,24,12,12,
		12,12,12,12,12,12,12,24,12,12,12,12, 12,24,12,12,12,12,12,12,12,24,12,12,
		12,12,12,12,12,12,12,36,12,36,12,12, 12,36,12,12,12,12,12,36,12,36,12,12,
		12,36,12,12,12,12,12,12,12,12,12,12,
	};

	// Feeds one byte into the automaton.
	//
	// state - in/out: the current state; has to be utf8_accept before the
	//         first byte of the input.
	// codep - in/out: the code point under construction; it holds a complete
	//         symbol whenever the function returns utf8_accept.
	// byte  - the input byte; it indexes the table directly, so the value
	//         has to be in the range 0..255.
	//
	// Returns the new state (the same value that is stored into *state).
	inline std::uint32_t
	decode( std::uint32_t * state, std::uint32_t * codep, std::uint32_t byte )
	{
		const std::uint32_t char_class = utf8d[ byte ];
		const std::uint32_t previous = *state;

		if( utf8_accept == previous )
		{
			// First byte of a symbol: the class doubles as the number of
			// high bits that have to be masked off.
			*codep = byte & (0xffu >> char_class);
		}
		else
		{
			// Continuation: append the six payload bits.
			*codep = (*codep << 6) | (byte & 0x3fu);
		}

		const std::uint32_t next = utf8d[ 256u + previous + char_class ];
		*state = next;
		return next;
	}

	} /* namespace decode_2010 */

	// Validates `str` as UTF-8 with restinio::utils::utf8_checker_t.
	//
	// str - the bytes to check.
	// out - accumulator: the value of every completely decoded symbol is
	//       added to it (gives the benchmark an observable result).
	//
	// Returns true only if every byte was accepted AND the input does not
	// stop in the middle of a multi-byte sequence.
	bool
	check_validity_with_restinio(std::string_view str, std::uint32_t & out)
	{
		restinio::utils::utf8_checker_t checker;
		for( const char ch : str )
		{
			// char may be signed; hand the raw byte value over explicitly.
			if( !checker.process_byte( static_cast<std::uint8_t>( ch ) ) )
				return false;

			if( checker.finalized() )
				out += checker.current_symbol();
		}

		// A truncated trailing sequence leaves the checker waiting for more
		// bytes; such input is not valid UTF-8.
		return checker.finalized();
	}

	// Validates `str` as UTF-8 with decode_2009::decode().
	//
	// str - the bytes to check.
	// out - accumulator: the value of every completely decoded symbol is
	//       added to it (gives the benchmark an observable result).
	//
	// Returns true only if no byte was rejected AND the input does not stop
	// in the middle of a multi-byte sequence.
	bool
	check_validity_with_decode_2009(std::string_view str, std::uint32_t & out)
	{
		std::uint32_t state = decode_2009::utf8_accept;
		std::uint32_t code_point = 0u;
		for( const char ch : str )
		{
			// Go through unsigned char so that bytes >= 0x80 don't turn into
			// huge table indexes on platforms where char is signed.
			const auto byte = static_cast<std::uint32_t>(
					static_cast<unsigned char>( ch ) );

			const std::uint32_t next =
					decode_2009::decode( &state, &code_point, byte );
			if( decode_2009::utf8_reject == next )
				return false;
			if( decode_2009::utf8_accept == next )
				out += code_point;
		}

		// Any state other than "accept" here means a truncated sequence.
		return decode_2009::utf8_accept == state;
	}

	// Validates `str` as UTF-8 with decode_2010::decode().
	//
	// str - the bytes to check.
	// out - accumulator: the value of every completely decoded symbol is
	//       added to it (gives the benchmark an observable result).
	//
	// Returns true only if no byte was rejected AND the input does not stop
	// in the middle of a multi-byte sequence.
	bool
	check_validity_with_decode_2010(std::string_view str, std::uint32_t & out)
	{
		std::uint32_t state = decode_2010::utf8_accept;
		std::uint32_t code_point = 0u;
		for( const char ch : str )
		{
			// Go through unsigned char so that bytes >= 0x80 don't turn into
			// huge table indexes on platforms where char is signed.
			const auto byte = static_cast<std::uint32_t>(
					static_cast<unsigned char>( ch ) );

			const std::uint32_t next =
					decode_2010::decode( &state, &code_point, byte );
			if( decode_2010::utf8_reject == next )
				return false;
			if( decode_2010::utf8_accept == next )
				out += code_point;
		}

		// Any state other than "accept" here means a truncated sequence.
		return decode_2010::utf8_accept == state;
	}

	// Calls checker( str, out ) `loops` times, resetting `out` to zero before
	// every call.
	//
	// Returns the boolean result and the `out` value of the last call, or
	// { false, 0 } when `loops` is zero.
	template< typename Checker >
	std::pair< bool, std::uint32_t >
	checking_loop(
		Checker && checker,
		unsigned int loops,
		std::string_view str )
	{
		std::pair< bool, std::uint32_t > last_run{ false, 0u };

		while( loops != 0u )
		{
			--loops;
			// Each pass starts from a clean accumulator.
			last_run.second = 0u;
			last_run.first = checker( str, last_run.second );
		}

		return last_run;
	}

	// Scope timer: remembers the moment of construction and, on destruction,
	// prints "*** <name>: <elapsed>us *** " to std::cout.
	class duration_meter
	{
		using clock_type = std::chrono::high_resolution_clock;

		// Label printed in front of the measured time. The pointer is stored
		// as is, so the string has to outlive the meter.
		const char * m_label;
		// Time point taken in the constructor.
		const clock_type::time_point m_begin;

	public:
		duration_meter( const char * name )
			: m_label{ name }
			, m_begin{ clock_type::now() }
		{}

		~duration_meter()
		{
			// Take the end time first so that printing isn't measured.
			const auto elapsed = clock_type::now() - m_begin;
			const auto micros = std::chrono::duration_cast<
					std::chrono::microseconds >( elapsed ).count();

			std::cout << "*** " << m_label << ": " << micros
				<< "us *** " << std::endl;
		}
	};

	template<typename Lambda>
	decltype(auto)
	measure( const char * name, Lambda && lambda )
	{
	duration_meter meter{ name };
	return lambda();
	}

	// Benchmark driver: runs all three checkers over the same sample text,
	// prints the time taken by each of them and then their results.
	int main()
	{
		// How many times every checker walks over the sample text.
		constexpr unsigned int repetitions = 100'000u;

		// Sample input: mostly Cyrillic text mixed with ASCII and two emoji.
		const std::string_view text{
			"В последний раз статья, целиком посвященная открытому проекту RESTinio, вышла "
			"на Хабре в декабре 2020-го года, без малого три года назад. Это был рассказ "
			"о релизе версии 0.6.13. По сути, это был последний релиз, в котором в "
			"RESTinio появилось что-то новое и важное. Потом были только небольшие "
			"корректирующие релизы, исправляющие ошибки или адаптирующие RESTinio к "
			"свежим версиям зависимостей. "
			" "
			"И вот спустя три года нам удалось выпустить новое существенное обновление. А "
			"посему есть повод поговорить о том, что было удалено/добавлено/изменено в этой "
			"версии. Ну и о причинах паузы в развитии и перспективах проекта вообще. "
			" "
			"Кому интересно, милости прошу под кат. "
			" "
			"Для тех же, кто про данную разработку слышит в первый раз: это наша попытка "
			"сделать встраиваемый в C++ приложения HTTP(S)/WebSocket сервер, который бы "
			"обладал и большой гибкостью, и нормальной производительностью, освобождал бы "
			"пользователя от рутины, но не прятал бы абсолютно все детали 'под капот', и "
			"удовлетворял бы нашим представлениям о том, как подобные вещи должны "
			"выглядеть... "
			" "
			"Вроде бы получилось. Мне кажется, что раз уж RESTinio сумел набрать тысячу "
			"звезд на GitHub, результат понравился и пригодился не только нам. Впрочем, это "
			"уже совсем другая история. Давайте вернемся к рассказу об изменениях в версии "
			"0.7.0 и к тому, почему этих изменений пришлось ждать так долго... "
			" "
			"Что нового в 0.7.0 "
			"Переход на C++17 "
			"В версии 0.7.0 мы перешли с C++14 на C++17. Вероятно, это не самое лучшее из "
			"наших решений, ведь кто-то все еще вынужден оставаться на C++14 не имея "
			"возможности обновиться до C++17, однако мы для себя больше не видели смысла "
			"держаться за C++14. "
			" "
			"Выгода от перехода на C++17 заключалась прежде всего в том, что удалось "
			"избавиться от таких зависимостей, как optional-lite, string_view-lite и "
			"variant-lite, т.к. теперь это все доступно в стандартной библиотеке. Так что "
			"остается сказать большое спасибо Martin Moene за его труд по написанию и "
			"сопровождению этих библиотек, они нам здорово помогали в течении шести лет, но "
			"дальше мы пойдем с stdlib 🙂 "
			" "
			"Хотя осталась зависимость от expected-lite, но с ней придется жить еще долго. "
			"Если уж мы на 17-ые плюсы перебрались только в 2023-ем, то перехода на C++23 "
			"нужно будет подождать еще лет пять-шесть, а то и девять-десять 😆 "
			" "
			"Выгода от 17-го стандарта проявилась еще и в том, что в ряде мест мы смогли "
			"выбросить сложные (и не очень) шаблонные конструкции в пользу простых if "
			"constexpr и fold expressions. "
			" "
			"Так что дальше пойдем уже в рамках C++17. Если кого-то это расстраивает, то уж "
			"простите за откровенность, но за поддержку C++14 нам никто не платит. "
			" "
			"Переход на llhttp, Catch2 v3 и modern CMake "
			"Изначально RESTinio использовал nodejs/http-parser в качестве парсера "
			"HTTP-запросов. Но несколько лет назад его развитие и поддержка прекратились. "
			"Посему в версии 0.7.0 мы переехали на nodejs/llhttp. Собственно, этот переезд и "
			"был главной мотивацией для выпуска версии 0.7.0. "
			" "
			"Заодно мы обновили у себя Catch2. Эта библиотека начиная с версии 3.0 уже не "
			"является header-only и требует компиляции. "
		};

		// Each measure() call prints its own timing line as soon as it ends.
		const auto restinio_result = measure( " restinio", [&]() {
				return checking_loop(
						check_validity_with_restinio, repetitions, text );
			} );

		const auto decode_2009_result = measure( "decode_2009", [&]() {
				return checking_loop(
						check_validity_with_decode_2009, repetitions, text );
			} );

		const auto decode_2010_result = measure( "decode_2010", [&]() {
				return checking_loop(
						check_validity_with_decode_2010, repetitions, text );
			} );

		// Prints "<validity> <checksum>" for one checker. Printing the values
		// also keeps the measured work observable.
		const auto print_result =
			[]( const std::pair< bool, std::uint32_t > & outcome ) {
				std::cout << outcome.first << " " << outcome.second << std::endl;
			};

		print_result( restinio_result );
		print_result( decode_2009_result );
		print_result( decode_2010_result );
	}