From b19e43ae9976fe1067e8216af8e8bf6cd8fb6ce2 Mon Sep 17 00:00:00 2001
From: frosch
Date: Tue, 1 Apr 2025 14:41:39 +0200
Subject: [PATCH] Add: Utf8View and iterator.

---
 src/core/CMakeLists.txt        |   2 +
 src/core/utf8.cpp              |  92 ++++++++++++++++++++++
 src/core/utf8.hpp              | 121 ++++++++++++++++++++++++++++
 src/settingsgen/CMakeLists.txt |   1 +
 src/strgen/CMakeLists.txt      |   1 +
 src/tests/CMakeLists.txt       |   1 +
 src/tests/utf8.cpp             | 140 +++++++++++++++++++++++++++++++++
 7 files changed, 358 insertions(+)
 create mode 100644 src/core/utf8.cpp
 create mode 100644 src/core/utf8.hpp
 create mode 100644 src/tests/utf8.cpp

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 85ed8222a5..7177ad3224 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -24,4 +24,6 @@ add_files(
     smallstack_type.hpp
     container_func.hpp
     strong_typedef_type.hpp
+    utf8.cpp
+    utf8.hpp
 )
diff --git a/src/core/utf8.cpp b/src/core/utf8.cpp
new file mode 100644
index 0000000000..8a4157bfd5
--- /dev/null
+++ b/src/core/utf8.cpp
@@ -0,0 +1,92 @@
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file utf8.cpp Handling of UTF-8 encoded data.
+ */
+
+#include "../stdafx.h"
+#include "utf8.hpp"
+#include "../safeguards.h"
+
+/**
+ * Encode a character to UTF-8.
+ * @param c Character
+ * @return Binary data and length. Length is zero, if "c" is out of range.
+ */
+[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c)
+{
+	std::pair<char[4], size_t> result{}; /* Value-initialised: zeroed buffer and a length of zero. */
+	auto &[buf, len] = result;
+	if (c < 0x80) { /* Single byte character: 0xxxxxxx */
+		buf[len++] = c;
+	} else if (c < 0x800) { /* Double byte character: 110xxxxx 10xxxxxx */
+		buf[len++] = 0xC0 + GB(c, 6, 5);
+		buf[len++] = 0x80 + GB(c, 0, 6);
+	} else if (c < 0x10000) { /* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
+		buf[len++] = 0xE0 + GB(c, 12, 4);
+		buf[len++] = 0x80 + GB(c, 6, 6);
+		buf[len++] = 0x80 + GB(c, 0, 6);
+	} else if (c < 0x110000) { /* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+		buf[len++] = 0xF0 + GB(c, 18, 3);
+		buf[len++] = 0x80 + GB(c, 12, 6);
+		buf[len++] = 0x80 + GB(c, 6, 6);
+		buf[len++] = 0x80 + GB(c, 0, 6);
+	}
+	return result; /* "len" is still zero, if "c" matched none of the ranges above. */
+}
+
+/**
+ * Decode a character from UTF-8.
+ * @param buf Binary data. Only the sequence at the start of "buf" is decoded; following bytes are not inspected.
+ * @return Length and character. Length is zero, if the input data is invalid: empty, truncated, wrong continuation byte, overlong encoding, or above U+10FFFF.
+ */
+[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf)
+{
+	if (buf.size() >= 1 && !HasBit(buf[0], 7)) {
+		/* Single byte character: 0xxxxxxx */
+		char32_t c = buf[0];
+		return {1, c};
+	} else if (buf.size() >= 2 && GB(buf[0], 5, 3) == 6) {
+		if (IsUtf8Part(buf[1])) {
+			/* Double byte character: 110xxxxx 10xxxxxx */
+			char32_t c = GB(buf[0], 0, 5) << 6 | GB(buf[1], 0, 6);
+			if (c >= 0x80) return {2, c}; /* Smaller values are overlong encodings. */
+		}
+	} else if (buf.size() >= 3 && GB(buf[0], 4, 4) == 14) {
+		if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2])) {
+			/* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
+			char32_t c = GB(buf[0], 0, 4) << 12 | GB(buf[1], 0, 6) << 6 | GB(buf[2], 0, 6);
+			if (c >= 0x800) return {3, c}; /* Smaller values are overlong encodings. */
+		}
+	} else if (buf.size() >= 4 && GB(buf[0], 3, 5) == 30) {
+		if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2]) && IsUtf8Part(buf[3])) {
+			/* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+			char32_t c = GB(buf[0], 0, 3) << 18 | GB(buf[1], 0, 6) << 12 | GB(buf[2], 0, 6) << 6 | GB(buf[3], 0, 6);
+			if (c >= 0x10000 && c <= 0x10FFFF) return {4, c}; /* Neither overlong, nor beyond the last codepoint. */
+		}
+	}
+	return {}; /* Length zero signals invalid input. */
+}
+
+/**
+ * Create iterator pointing at codepoint, which occupies the byte position "offset".
+ * "offset" does not need to point at the first byte of the UTF-8 sequence,
+ * the iterator will still address the correct position of the first byte.
+ * @param offset Byte offset into view. Must not be larger than the size of the view.
+ * @return Iterator pointing at the start of the codepoint, which byte "offset" is part of; end(), if "offset" is the size of the view.
+ */
+Utf8View::iterator Utf8View::GetIterAtByte(size_t offset) const
+{
+	assert(offset <= this->src.size());
+	if (offset >= this->src.size()) return this->end();
+
+	/* Sanitize iterator to point to the start of a codepoint */
+	auto it = iterator(this->src, offset + 1); /* One byte past "offset", so stepping back lands on "offset" or before it. */
+	--it;
+	return it;
+}
diff --git a/src/core/utf8.hpp b/src/core/utf8.hpp
new file mode 100644
index 0000000000..db1354ef24
--- /dev/null
+++ b/src/core/utf8.hpp
@@ -0,0 +1,121 @@
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file utf8.hpp Handling of UTF-8 encoded data.
+ */
+
+#ifndef UTF8_HPP
+#define UTF8_HPP
+
+#include <iterator>
+#include "../string_func.h"
+
+[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c);
+[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf);
+
+/**
+ * Constant span of UTF-8 encoded data.
+ */
+class Utf8View {
+	std::string_view src; ///< The viewed data; the view does not own it.
+public:
+	Utf8View() = default;
+	Utf8View(std::string_view src) : src(src) {}
+
+	/**
+	 * Bidirectional input iterator over codepoints.
+	 *
+	 * If invalid encodings are present:
+	 * - the iterator treats a byte and all continuation bytes directly following it as one element, and
+	 * - dereferencing returns a placeholder char '?'.
+	 */
+	class iterator {
+		std::string_view src; ///< The data being iterated.
+		size_t position = 0; ///< Byte offset into "src".
+	public:
+		using value_type = char32_t;
+		using difference_type = std::ptrdiff_t;
+		using iterator_category = std::bidirectional_iterator_tag;
+		using pointer = void;
+		using reference = void;
+
+		iterator() = default;
+		iterator(std::string_view src, size_t position) : src(src), position(position) {}
+
+		size_t GetByteOffset() const
+		{
+			return this->position;
+		}
+
+		bool operator==(const iterator &rhs) const
+		{
+			assert(this->src.data() == rhs.src.data()); /* Only iterators over the same data may be compared. */
+			return this->position == rhs.position;
+		}
+
+		std::strong_ordering operator<=>(const iterator &rhs) const
+		{
+			assert(this->src.data() == rhs.src.data()); /* Only iterators over the same data may be compared. */
+			return this->position <=> rhs.position;
+		}
+
+		char32_t operator*() const
+		{
+			assert(this->position < this->src.size()); /* The end iterator must not be dereferenced. */
+			auto [len, c] = DecodeUtf8(this->src.substr(this->position));
+			return len > 0 ? c : '?'; /* A length of zero means the data at this position is invalid. */
+		}
+
+		iterator& operator++()
+		{
+			auto size = this->src.size();
+			assert(this->position < size); /* The end iterator must not be incremented. */
+			do {
+				++this->position;
+			} while (this->position < size && IsUtf8Part(this->src[this->position])); /* Always advance one byte, then skip continuation bytes. */
+			return *this;
+		}
+
+		iterator operator++(int)
+		{
+			iterator result = *this;
+			++*this;
+			return result;
+		}
+
+		iterator& operator--()
+		{
+			assert(this->position > 0); /* The begin iterator must not be decremented. */
+			do {
+				--this->position;
+			} while (this->position > 0 && IsUtf8Part(this->src[this->position])); /* Always step back one byte, then skip continuation bytes. */
+			return *this;
+		}
+
+		iterator operator--(int)
+		{
+			iterator result = *this;
+			--*this;
+			return result;
+		}
+	};
+
+	iterator begin() const
+	{
+		return iterator(this->src, 0);
+	}
+
+	iterator end() const
+	{
+		return iterator(this->src, this->src.size());
+	}
+
+	iterator GetIterAtByte(size_t offset) const;
+};
+
+#endif /* UTF8_HPP */
diff --git a/src/settingsgen/CMakeLists.txt b/src/settingsgen/CMakeLists.txt
index 49e65c0971..f088a2a642 100644
--- a/src/settingsgen/CMakeLists.txt
+++ b/src/settingsgen/CMakeLists.txt
@@ -10,6 +10,7 @@ if (NOT HOST_BINARY_DIR)
             ../error.cpp
             ../ini_load.cpp
             ../string.cpp
+            ../core/utf8.cpp
     )
     add_definitions(-DSETTINGSGEN)
     add_executable(settingsgen ${sourcefiles})
diff --git a/src/strgen/CMakeLists.txt b/src/strgen/CMakeLists.txt
index 581f60eeb0..5d67a744e7 100644
--- a/src/strgen/CMakeLists.txt
+++ b/src/strgen/CMakeLists.txt
@@ -12,6 +12,7 @@ if (NOT HOST_BINARY_DIR)
             ../misc/getoptdata.cpp
             ../error.cpp
             ../string.cpp
+            ../core/utf8.cpp
     )
     add_definitions(-DSTRGEN)
     add_executable(strgen ${sourcefiles})
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index c4cbfa4b2f..bd1876297e 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -12,4 +12,5 @@ add_test_files(
     test_network_crypto.cpp
     test_script_admin.cpp
     test_window_desc.cpp
+    utf8.cpp
 )
diff --git a/src/tests/utf8.cpp b/src/tests/utf8.cpp
new file mode 100644
index 0000000000..c8ff3a191a
--- /dev/null
+++ b/src/tests/utf8.cpp
@@ -0,0 +1,140 @@
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file utf8.cpp Test functionality from core/utf8. */
+
+#include "../stdafx.h"
+
+#include "../3rdparty/catch2/catch.hpp"
+
+#include "../core/utf8.hpp"
+
+#include "../safeguards.h"
+
+using namespace std::literals;
+
+TEST_CASE("Utf8View - empty")
+{
+	Utf8View view; /* Default constructed view: begin and end coincide at offset zero. */
+	auto begin = view.begin();
+	auto end = view.end();
+	CHECK(begin == end);
+	CHECK(begin.GetByteOffset() == 0);
+}
+
+TEST_CASE("Utf8View - invalid")
+{
+	Utf8View view("\u1234\x80\x80""a\xFF\x80\x80\x80\x80\x80""b\xF0"); /* U+1234 plus two stray continuation bytes, 'a', invalid byte 0xFF plus five continuation bytes, 'b', truncated 4 byte sequence. */
+	auto begin = view.begin();
+	auto end = view.end();
+	CHECK(begin < end);
+	auto it = begin;
+	CHECK(it == begin);
+	CHECK(it.GetByteOffset() == 0);
+	CHECK(*it == 0x1234);
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 5); /* The two stray continuation bytes are skipped together with U+1234. */
+	CHECK(*it == 'a');
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == '?'); /* 0xFF is not a valid start byte. */
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 12); /* 0xFF and its five continuation bytes count as one element. */
+	CHECK(*it == 'b');
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 13);
+	CHECK(*it == '?'); /* 0xF0 announces 4 bytes, but the data ends here. */
+	++it;
+	CHECK(it.GetByteOffset() == 14);
+	CHECK(begin < it);
+	CHECK(it == end);
+	--it; /* Walk the same elements backwards. */
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 13);
+	CHECK(*it == '?');
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 12);
+	CHECK(*it == 'b');
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == '?');
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'a');
+	--it;
+	CHECK(it == begin);
+	CHECK(it.GetByteOffset() == 0);
+	CHECK(*it == 0x1234);
+}
+
+TEST_CASE("Utf8View - iterate")
+{
+	Utf8View view("\u1234a\0b\U00012345"sv); /* U+1234 (3 bytes), 'a', NUL, 'b', U+12345 (4 bytes); the "sv" literal keeps the embedded NUL. */
+	auto begin = view.begin();
+	auto end = view.end();
+	CHECK(begin < end);
+	auto it = begin;
+	CHECK(it == begin);
+	CHECK(it.GetByteOffset() == 0);
+	CHECK(*it == 0x1234);
+	CHECK(it == view.GetIterAtByte(0)); /* Every byte of a sequence maps to the start of that sequence. */
+	CHECK(it == view.GetIterAtByte(1));
+	CHECK(it == view.GetIterAtByte(2));
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 3);
+	CHECK(*it == 'a');
+	CHECK(it == view.GetIterAtByte(3));
+	++it;
+	CHECK(it.GetByteOffset() == 4);
+	CHECK(*it == 0); /* The embedded NUL is an ordinary codepoint. */
+	CHECK(it == view.GetIterAtByte(4));
+	++it;
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'b');
+	CHECK(it == view.GetIterAtByte(5));
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == 0x00012345);
+	CHECK(it == view.GetIterAtByte(6));
+	CHECK(it == view.GetIterAtByte(7));
+	CHECK(it == view.GetIterAtByte(8));
+	CHECK(it == view.GetIterAtByte(9));
+	++it;
+	CHECK(begin < it);
+	CHECK(it.GetByteOffset() == 10);
+	CHECK(it == end);
+	CHECK(it == view.GetIterAtByte(10)); /* The size of the view maps to end(). */
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == 0x00012345);
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'b');
+}