From b19e43ae9976fe1067e8216af8e8bf6cd8fb6ce2 Mon Sep 17 00:00:00 2001
From: frosch <frosch@openttd.org>
Date: Tue, 1 Apr 2025 14:41:39 +0200
Subject: [PATCH] Add: Utf8View and iterator.

---
 src/core/CMakeLists.txt        |   2 +
 src/core/utf8.cpp              |  92 ++++++++++++++++++++++
 src/core/utf8.hpp              | 121 ++++++++++++++++++++++++++++
 src/settingsgen/CMakeLists.txt |   1 +
 src/strgen/CMakeLists.txt      |   1 +
 src/tests/CMakeLists.txt       |   1 +
 src/tests/utf8.cpp             | 140 +++++++++++++++++++++++++++++++++
 7 files changed, 358 insertions(+)
 create mode 100644 src/core/utf8.cpp
 create mode 100644 src/core/utf8.hpp
 create mode 100644 src/tests/utf8.cpp

diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt
index 85ed8222a5..7177ad3224 100644
--- a/src/core/CMakeLists.txt
+++ b/src/core/CMakeLists.txt
@@ -24,4 +24,6 @@ add_files(
     smallstack_type.hpp
     container_func.hpp
     strong_typedef_type.hpp
+    utf8.cpp
+    utf8.hpp
 )
diff --git a/src/core/utf8.cpp b/src/core/utf8.cpp
new file mode 100644
index 0000000000..8a4157bfd5
--- /dev/null
+++ b/src/core/utf8.cpp
@@ -0,0 +1,92 @@
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file utf8.cpp Handling of UTF-8 encoded data.
+ */
+
+#include "../stdafx.h"
+#include "utf8.hpp"
+#include "../safeguards.h"
+
+/**
+ * Encode a character to UTF-8.
+ * @param c Character; only Unicode scalar values can be encoded.
+ * @return Binary data and length. Length is zero, if "c" is out of range, i.e. a UTF-16 surrogate (U+D800..U+DFFF) or above U+10FFFF.
+ */
+[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c)
+{
+	std::pair<char[4], size_t> result{}; /* Value-initialised: zeroed buffer and a length of zero. */
+	auto &[buf, len] = result;
+	if (c < 0x80) {
+		buf[len++] = c;
+	} else if (c < 0x800) {
+		buf[len++] = 0xC0 + GB(c,  6, 5);
+		buf[len++] = 0x80 + GB(c,  0, 6);
+	} else if (c < 0xD800 || (c >= 0xE000 && c < 0x10000)) {
+		buf[len++] = 0xE0 + GB(c, 12, 4);
+		buf[len++] = 0x80 + GB(c,  6, 6);
+		buf[len++] = 0x80 + GB(c,  0, 6);
+	} else if (c >= 0x10000 && c < 0x110000) {
+		buf[len++] = 0xF0 + GB(c, 18, 3);
+		buf[len++] = 0x80 + GB(c, 12, 6);
+		buf[len++] = 0x80 + GB(c,  6, 6);
+		buf[len++] = 0x80 + GB(c,  0, 6);
+	} /* Surrogates and values above U+10FFFF match no branch: nothing is written and the length stays zero. */
+	return result;
+}
+
+/**
+ * Decode the character at the start of a UTF-8 buffer; bytes behind that character are not looked at.
+ * @param buf Binary data.
+ * @return Length and character. Length is zero, if the input data is invalid: empty, truncated, malformed, overlong, a UTF-16 surrogate or above U+10FFFF.
+ */
+[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf)
+{
+	if (buf.size() >= 1 && !HasBit(buf[0], 7)) {
+		/* Single byte character: 0xxxxxxx */
+		char32_t c = buf[0];
+		return {1, c};
+	} else if (buf.size() >= 2 && GB(buf[0], 5, 3) == 6) {
+		if (IsUtf8Part(buf[1])) {
+			/* Double byte character: 110xxxxx 10xxxxxx */
+			char32_t c = GB(buf[0], 0, 5) << 6 | GB(buf[1], 0, 6);
+			if (c >= 0x80) return {2, c}; /* Smaller values are overlong encodings. */
+		}
+	} else if (buf.size() >= 3 && GB(buf[0], 4, 4) == 14) {
+		if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2])) {
+			/* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
+			char32_t c = GB(buf[0], 0, 4) << 12 | GB(buf[1], 0, 6) << 6 | GB(buf[2], 0, 6);
+			if (c >= 0x800 && (c < 0xD800 || c > 0xDFFF)) return {3, c}; /* Neither overlong, nor a UTF-16 surrogate. */
+		}
+	} else if (buf.size() >= 4 && GB(buf[0], 3, 5) == 30) {
+		if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2]) && IsUtf8Part(buf[3])) {
+			/* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
+			char32_t c = GB(buf[0], 0, 3) << 18 | GB(buf[1], 0, 6) << 12 | GB(buf[2], 0, 6) << 6 | GB(buf[3], 0, 6);
+			if (c >= 0x10000 && c <= 0x10FFFF) return {4, c}; /* Neither overlong, nor beyond the Unicode range. */
+		}
+	}
+	return {}; /* Invalid input: length and character are both zero. */
+}
+
+/**
+ * Get an iterator for the codepoint that contains the byte at position "offset".
+ * "offset" may refer to any byte of the UTF-8 sequence, not just to its first one;
+ * the returned iterator is always moved back to the first byte of that sequence.
+ * @param offset Byte offset into view; must not be larger than the size of the view.
+ * @return Iterator at the start of the codepoint covering "offset", or end() if "offset" is at the end of the view.
+ */
+Utf8View::iterator Utf8View::GetIterAtByte(size_t offset) const
+{
+	const size_t size = this->src.size();
+	assert(offset <= size);
+	if (offset >= size) return this->end();
+
+	/* Start one byte further and step back once: the decrement stops at the first byte that is no continuation byte. */
+	iterator it(this->src, offset + 1);
+	return --it;
+}
diff --git a/src/core/utf8.hpp b/src/core/utf8.hpp
new file mode 100644
index 0000000000..db1354ef24
--- /dev/null
+++ b/src/core/utf8.hpp
@@ -0,0 +1,121 @@
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/**
+ * @file utf8.hpp Handling of UTF-8 encoded data.
+ */
+
+#ifndef UTF8_HPP
+#define UTF8_HPP
+
+#include <iterator>
+#include "../string_func.h"
+
+[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c);
+[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf);
+
+/**
+ * Constant, non-owning span of UTF-8 encoded data that is traversed codepoint by codepoint.
+ */
+class Utf8View {
+	std::string_view src; ///< The encoded bytes this view refers to.
+public:
+	Utf8View() = default;
+	Utf8View(std::string_view src) : src(src) {}
+
+	/**
+	 * Bidirectional input iterator over codepoints.
+	 *
+	 * If invalid encodings are present:
+	 * - stepping passes over every byte IsUtf8Part() accepts, so surplus continuation bytes are skipped, and
+	 * - dereferencing returns a placeholder char '?'.
+	 */
+	class iterator {
+		std::string_view src; ///< Data that is iterated over.
+		size_t position = 0; ///< Byte offset into src the iterator currently refers to.
+	public:
+		using iterator_category = std::bidirectional_iterator_tag;
+		using value_type = char32_t;
+		using difference_type = std::ptrdiff_t;
+		using pointer = void;
+		using reference = void;
+
+		iterator() = default;
+		iterator(std::string_view src, size_t position) : src(src), position(position) {}
+
+		size_t GetByteOffset() const
+		{
+			return this->position;
+		}
+
+		bool operator==(const iterator &rhs) const
+		{
+			assert(this->src.data() == rhs.src.data()); /* Only iterators over the same data may be compared. */
+			return this->GetByteOffset() == rhs.GetByteOffset();
+		}
+
+		std::strong_ordering operator<=>(const iterator &rhs) const
+		{
+			assert(this->src.data() == rhs.src.data()); /* Only iterators over the same data may be compared. */
+			return this->GetByteOffset() <=> rhs.GetByteOffset();
+		}
+
+		char32_t operator*() const
+		{
+			assert(this->position < this->src.size());
+			const auto [length, codepoint] = DecodeUtf8(this->src.substr(this->position));
+			return length == 0 ? '?' : codepoint; /* A length of zero means the data could not be decoded. */
+		}
+
+		iterator& operator++()
+		{
+			const size_t size = this->src.size();
+			assert(this->position < size);
+			/* Leave the current byte, then pass over all continuation bytes that follow it. */
+			++this->position;
+			while (this->position < size && IsUtf8Part(this->src[this->position])) ++this->position;
+			return *this;
+		}
+
+		iterator operator++(int)
+		{
+			iterator previous(*this);
+			++(*this);
+			return previous;
+		}
+
+		iterator& operator--()
+		{
+			assert(this->position > 0);
+			/* Walk back to the nearest byte that is no continuation byte, or to the very first byte. */
+			--this->position;
+			while (this->position > 0 && IsUtf8Part(this->src[this->position])) --this->position;
+			return *this;
+		}
+
+		iterator operator--(int)
+		{
+			iterator previous(*this);
+			--(*this);
+			return previous;
+		}
+	};
+
+	iterator begin() const
+	{
+		return {this->src, 0};
+	}
+
+	iterator end() const
+	{
+		return {this->src, this->src.size()};
+	}
+
+	iterator GetIterAtByte(size_t offset) const;
+};
+
+#endif /* UTF8_HPP */
diff --git a/src/settingsgen/CMakeLists.txt b/src/settingsgen/CMakeLists.txt
index 49e65c0971..f088a2a642 100644
--- a/src/settingsgen/CMakeLists.txt
+++ b/src/settingsgen/CMakeLists.txt
@@ -10,6 +10,7 @@ if (NOT HOST_BINARY_DIR)
             ../error.cpp
             ../ini_load.cpp
             ../string.cpp
+            ../core/utf8.cpp
     )
     add_definitions(-DSETTINGSGEN)
     add_executable(settingsgen ${sourcefiles})
diff --git a/src/strgen/CMakeLists.txt b/src/strgen/CMakeLists.txt
index 581f60eeb0..5d67a744e7 100644
--- a/src/strgen/CMakeLists.txt
+++ b/src/strgen/CMakeLists.txt
@@ -12,6 +12,7 @@ if (NOT HOST_BINARY_DIR)
             ../misc/getoptdata.cpp
             ../error.cpp
             ../string.cpp
+            ../core/utf8.cpp
     )
     add_definitions(-DSTRGEN)
     add_executable(strgen ${sourcefiles})
diff --git a/src/tests/CMakeLists.txt b/src/tests/CMakeLists.txt
index c4cbfa4b2f..bd1876297e 100644
--- a/src/tests/CMakeLists.txt
+++ b/src/tests/CMakeLists.txt
@@ -12,4 +12,5 @@ add_test_files(
     test_network_crypto.cpp
     test_script_admin.cpp
     test_window_desc.cpp
+    utf8.cpp
 )
diff --git a/src/tests/utf8.cpp b/src/tests/utf8.cpp
new file mode 100644
index 0000000000..c8ff3a191a
--- /dev/null
+++ b/src/tests/utf8.cpp
@@ -0,0 +1,140 @@
+/*
+ * This file is part of OpenTTD.
+ * OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
+ * OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+ * See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
+ */
+
+/** @file utf8.cpp Test functionality from core/utf8. */
+
+#include "../stdafx.h"
+
+#include "../3rdparty/catch2/catch.hpp"
+
+#include "../core/utf8.hpp"
+
+#include "../safeguards.h"
+
+using namespace std::literals;
+
+TEST_CASE("Utf8View - empty")
+{
+	Utf8View empty_view; /* Default-constructed, so there is nothing to iterate over. */
+	const auto first = empty_view.begin();
+	const auto last = empty_view.end();
+	CHECK(first == last);
+	CHECK(first.GetByteOffset() == 0);
+}
+
+TEST_CASE("Utf8View - invalid")
+{
+	Utf8View view("\u1234\x80\x80""a\xFF\x80\x80\x80\x80\x80""b\xF0"); /* The offsets below assume U+1234 is stored as 3 bytes (UTF-8 literal), giving 14 bytes in total. */
+	auto begin = view.begin();
+	auto end = view.end();
+	CHECK(begin < end);
+	auto it = begin;
+	CHECK(it == begin);
+	CHECK(it.GetByteOffset() == 0);
+	CHECK(*it == 0x1234);
+	++it; /* Also passes over the two stray \x80 bytes behind U+1234. */
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'a');
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == '?'); /* \xFF matches none of the lead byte patterns DecodeUtf8 accepts. */
+	++it; /* Passes over \xFF together with the five \x80 bytes behind it. */
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 12);
+	CHECK(*it == 'b');
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 13);
+	CHECK(*it == '?'); /* \xF0 is the lead byte of a 4 byte sequence, but the data ends right after it. */
+	++it;
+	CHECK(it.GetByteOffset() == 14);
+	CHECK(begin < it);
+	CHECK(it == end);
+	--it; /* From here on the same positions are visited in reverse order. */
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 13);
+	CHECK(*it == '?');
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 12);
+	CHECK(*it == 'b');
+	--it; /* Steps back over the five \x80 bytes onto \xFF. */
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == '?');
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'a');
+	--it; /* Steps back over the stray \x80 bytes and the whole U+1234 sequence. */
+	CHECK(it == begin);
+	CHECK(it.GetByteOffset() == 0);
+	CHECK(*it == 0x1234);
+}
+
+TEST_CASE("Utf8View - iterate")
+{
+	Utf8View view("\u1234a\0b\U00012345"sv); /* The "sv" literal keeps the embedded NUL; the offsets below assume 3 + 1 + 1 + 1 + 4 = 10 bytes (UTF-8 literal). */
+	auto begin = view.begin();
+	auto end = view.end();
+	CHECK(begin < end);
+	auto it = begin;
+	CHECK(it == begin);
+	CHECK(it.GetByteOffset() == 0);
+	CHECK(*it == 0x1234);
+	CHECK(it == view.GetIterAtByte(0));
+	CHECK(it == view.GetIterAtByte(1)); /* Offsets inside a sequence resolve to the first byte of that sequence. */
+	CHECK(it == view.GetIterAtByte(2));
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 3);
+	CHECK(*it == 'a');
+	CHECK(it == view.GetIterAtByte(3));
+	++it;
+	CHECK(it.GetByteOffset() == 4);
+	CHECK(*it == 0); /* The embedded NUL is iterated like any other codepoint. */
+	CHECK(it == view.GetIterAtByte(4));
+	++it;
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'b');
+	CHECK(it == view.GetIterAtByte(5));
+	++it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == 0x00012345);
+	CHECK(it == view.GetIterAtByte(6));
+	CHECK(it == view.GetIterAtByte(7)); /* All four bytes of U+12345 map to offset 6. */
+	CHECK(it == view.GetIterAtByte(8));
+	CHECK(it == view.GetIterAtByte(9));
+	++it;
+	CHECK(begin < it);
+	CHECK(it.GetByteOffset() == 10);
+	CHECK(it == end);
+	CHECK(it == view.GetIterAtByte(10)); /* An offset equal to the size of the view yields end(). */
+	--it; /* Stepping back from end() lands on the first byte of the last codepoint. */
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 6);
+	CHECK(*it == 0x00012345);
+	--it;
+	CHECK(begin < it);
+	CHECK(it < end);
+	CHECK(it.GetByteOffset() == 5);
+	CHECK(*it == 'b');
+}