mirror of https://github.com/OpenTTD/OpenTTD
Add: Utf8View and iterator.
parent
e6a0cf75a9
commit
b19e43ae99
|
@ -24,4 +24,6 @@ add_files(
|
|||
smallstack_type.hpp
|
||||
container_func.hpp
|
||||
strong_typedef_type.hpp
|
||||
utf8.cpp
|
||||
utf8.hpp
|
||||
)
|
||||
|
|
|
@ -0,0 +1,92 @@
|
|||
/*
|
||||
* This file is part of OpenTTD.
|
||||
* OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
|
||||
* OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file utf8.cpp Handling of UTF-8 encoded data.
|
||||
*/
|
||||
|
||||
#include "../stdafx.h"
|
||||
#include "utf8.hpp"
|
||||
#include "../safeguards.h"
|
||||
|
||||
/**
|
||||
* Encode a character to UTF-8.
|
||||
* @param c Character
|
||||
* @return Binary data and length. Length is zero, if "c" is out of range.
|
||||
*/
|
||||
[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c)
|
||||
{
|
||||
std::pair<char[4], size_t> result{};
|
||||
auto &[buf, len] = result;
|
||||
if (c < 0x80) {
|
||||
buf[len++] = c;
|
||||
} else if (c < 0x800) {
|
||||
buf[len++] = 0xC0 + GB(c, 6, 5);
|
||||
buf[len++] = 0x80 + GB(c, 0, 6);
|
||||
} else if (c < 0x10000) {
|
||||
buf[len++] = 0xE0 + GB(c, 12, 4);
|
||||
buf[len++] = 0x80 + GB(c, 6, 6);
|
||||
buf[len++] = 0x80 + GB(c, 0, 6);
|
||||
} else if (c < 0x110000) {
|
||||
buf[len++] = 0xF0 + GB(c, 18, 3);
|
||||
buf[len++] = 0x80 + GB(c, 12, 6);
|
||||
buf[len++] = 0x80 + GB(c, 6, 6);
|
||||
buf[len++] = 0x80 + GB(c, 0, 6);
|
||||
}
|
||||
return result;
|
||||
}
|
||||
|
||||
/**
|
||||
* Decode a character from UTF-8.
|
||||
* @param buf Binary data.
|
||||
* @return Length and character. Length is zero, if the input data is invalid.
|
||||
*/
|
||||
[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf)
|
||||
{
|
||||
if (buf.size() >= 1 && !HasBit(buf[0], 7)) {
|
||||
/* Single byte character: 0xxxxxxx */
|
||||
char32_t c = buf[0];
|
||||
return {1, c};
|
||||
} else if (buf.size() >= 2 && GB(buf[0], 5, 3) == 6) {
|
||||
if (IsUtf8Part(buf[1])) {
|
||||
/* Double byte character: 110xxxxx 10xxxxxx */
|
||||
char32_t c = GB(buf[0], 0, 5) << 6 | GB(buf[1], 0, 6);
|
||||
if (c >= 0x80) return {2, c};
|
||||
}
|
||||
} else if (buf.size() >= 3 && GB(buf[0], 4, 4) == 14) {
|
||||
if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2])) {
|
||||
/* Triple byte character: 1110xxxx 10xxxxxx 10xxxxxx */
|
||||
char32_t c = GB(buf[0], 0, 4) << 12 | GB(buf[1], 0, 6) << 6 | GB(buf[2], 0, 6);
|
||||
if (c >= 0x800) return {3, c};
|
||||
}
|
||||
} else if (buf.size() >= 4 && GB(buf[0], 3, 5) == 30) {
|
||||
if (IsUtf8Part(buf[1]) && IsUtf8Part(buf[2]) && IsUtf8Part(buf[3])) {
|
||||
/* 4 byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */
|
||||
char32_t c = GB(buf[0], 0, 3) << 18 | GB(buf[1], 0, 6) << 12 | GB(buf[2], 0, 6) << 6 | GB(buf[3], 0, 6);
|
||||
if (c >= 0x10000 && c <= 0x10FFFF) return {4, c};
|
||||
}
|
||||
}
|
||||
return {};
|
||||
}
|
||||
|
||||
/**
|
||||
* Create iterator pointing at codepoint, which occupies the byte position "offset".
|
||||
* "offset" does not need to point at the first byte of the UTF-8 sequence,
|
||||
* the iterator will still address the correct position of the first byte.
|
||||
* @param offset Byte offset into view.
|
||||
* @return Iterator pointing at start of codepoint, of which "offset" is part of.
|
||||
*/
|
||||
Utf8View::iterator Utf8View::GetIterAtByte(size_t offset) const
|
||||
{
|
||||
assert(offset <= this->src.size());
|
||||
if (offset >= this->src.size()) return this->end();
|
||||
|
||||
/* Sanitize iterator to point to the start of a codepoint */
|
||||
auto it = iterator(this->src, offset + 1);
|
||||
--it;
|
||||
return it;
|
||||
}
|
|
@ -0,0 +1,121 @@
|
|||
/*
|
||||
* This file is part of OpenTTD.
|
||||
* OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
|
||||
* OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/**
|
||||
* @file utf8.hpp Handling of UTF-8 encoded data.
|
||||
*/
|
||||
|
||||
#ifndef UTF8_HPP
|
||||
#define UTF8_HPP
|
||||
|
||||
#include <iterator>
|
||||
#include "../string_func.h"
|
||||
|
||||
[[nodiscard]] std::pair<char[4], size_t> EncodeUtf8(char32_t c);
|
||||
[[nodiscard]] std::pair<size_t, char32_t> DecodeUtf8(std::string_view buf);
|
||||
|
||||
/**
|
||||
* Constant span of UTF-8 encoded data.
|
||||
*/
|
||||
class Utf8View {
|
||||
std::string_view src;
|
||||
public:
|
||||
Utf8View() = default;
|
||||
Utf8View(std::string_view src) : src(src) {}
|
||||
|
||||
/**
|
||||
* Bidirectional input iterator over codepoints.
|
||||
*
|
||||
* If invalid encodings are present:
|
||||
* - the iterator will skip overlong encodings, and
|
||||
* - dereferencing returns a placeholder char '?'.
|
||||
*/
|
||||
class iterator {
|
||||
std::string_view src;
|
||||
size_t position = 0;
|
||||
public:
|
||||
using value_type = char32_t;
|
||||
using difference_type = std::ptrdiff_t;
|
||||
using iterator_category = std::bidirectional_iterator_tag;
|
||||
using pointer = void;
|
||||
using reference = void;
|
||||
|
||||
iterator() = default;
|
||||
iterator(std::string_view src, size_t position) : src(src), position(position) {}
|
||||
|
||||
size_t GetByteOffset() const
|
||||
{
|
||||
return this->position;
|
||||
}
|
||||
|
||||
bool operator==(const iterator &rhs) const
|
||||
{
|
||||
assert(this->src.data() == rhs.src.data());
|
||||
return this->position == rhs.position;
|
||||
}
|
||||
|
||||
std::strong_ordering operator<=>(const iterator &rhs) const
|
||||
{
|
||||
assert(this->src.data() == rhs.src.data());
|
||||
return this->position <=> rhs.position;
|
||||
}
|
||||
|
||||
char32_t operator*() const
|
||||
{
|
||||
assert(this->position < this->src.size());
|
||||
auto [len, c] = DecodeUtf8(this->src.substr(this->position));
|
||||
return len > 0 ? c : '?';
|
||||
}
|
||||
|
||||
iterator& operator++()
|
||||
{
|
||||
auto size = this->src.size();
|
||||
assert(this->position < size);
|
||||
do {
|
||||
++this->position;
|
||||
} while (this->position < size && IsUtf8Part(this->src[this->position]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
iterator operator++(int)
|
||||
{
|
||||
iterator result = *this;
|
||||
++*this;
|
||||
return result;
|
||||
}
|
||||
|
||||
iterator& operator--()
|
||||
{
|
||||
assert(this->position > 0);
|
||||
do {
|
||||
--this->position;
|
||||
} while (this->position > 0 && IsUtf8Part(this->src[this->position]));
|
||||
return *this;
|
||||
}
|
||||
|
||||
iterator operator--(int)
|
||||
{
|
||||
iterator result = *this;
|
||||
--*this;
|
||||
return result;
|
||||
}
|
||||
};
|
||||
|
||||
iterator begin() const
|
||||
{
|
||||
return iterator(this->src, 0);
|
||||
}
|
||||
|
||||
iterator end() const
|
||||
{
|
||||
return iterator(this->src, this->src.size());
|
||||
}
|
||||
|
||||
iterator GetIterAtByte(size_t offset) const;
|
||||
};
|
||||
|
||||
#endif /* UTF8_HPP */
|
|
@ -10,6 +10,7 @@ if (NOT HOST_BINARY_DIR)
|
|||
../error.cpp
|
||||
../ini_load.cpp
|
||||
../string.cpp
|
||||
../core/utf8.cpp
|
||||
)
|
||||
add_definitions(-DSETTINGSGEN)
|
||||
add_executable(settingsgen ${sourcefiles})
|
||||
|
|
|
@ -12,6 +12,7 @@ if (NOT HOST_BINARY_DIR)
|
|||
../misc/getoptdata.cpp
|
||||
../error.cpp
|
||||
../string.cpp
|
||||
../core/utf8.cpp
|
||||
)
|
||||
add_definitions(-DSTRGEN)
|
||||
add_executable(strgen ${sourcefiles})
|
||||
|
|
|
@ -12,4 +12,5 @@ add_test_files(
|
|||
test_network_crypto.cpp
|
||||
test_script_admin.cpp
|
||||
test_window_desc.cpp
|
||||
utf8.cpp
|
||||
)
|
||||
|
|
|
@ -0,0 +1,140 @@
|
|||
/*
|
||||
* This file is part of OpenTTD.
|
||||
* OpenTTD is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software Foundation, version 2.
|
||||
* OpenTTD is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
|
||||
* See the GNU General Public License for more details. You should have received a copy of the GNU General Public License along with OpenTTD. If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
/** @file utf8.cpp Test functionality from core/utf8. */
|
||||
|
||||
#include "../stdafx.h"
|
||||
|
||||
#include "../3rdparty/catch2/catch.hpp"
|
||||
|
||||
#include "../core/utf8.hpp"
|
||||
|
||||
#include "../safeguards.h"
|
||||
|
||||
using namespace std::literals;
|
||||
|
||||
TEST_CASE("Utf8View - empty")
{
	/* A default constructed view holds no bytes: begin and end must coincide at offset zero. */
	Utf8View empty_view;
	const auto first = empty_view.begin();
	const auto last = empty_view.end();

	CHECK(first == last);
	CHECK(first.GetByteOffset() == 0);
}
|
||||
|
||||
TEST_CASE("Utf8View - invalid")
{
	/*
	 * Expected stops while iterating: U+1234 at offset 0 (the two stray 0x80 bytes after it are passed over),
	 * 'a' at 5, an undecodable 0xFF with trailing 0x80 bytes at 6, 'b' at 12 and a lone 0xF0 at 13.
	 */
	Utf8View view("\u1234\x80\x80""a\xFF\x80\x80\x80\x80\x80""b\xF0");
	const auto first = view.begin();
	const auto last = view.end();
	CHECK(first < last);

	/* Forward pass. */
	auto cursor = first;
	CHECK(cursor == first);
	CHECK(cursor.GetByteOffset() == 0);
	CHECK(*cursor == 0x1234);

	++cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 5);
	CHECK(*cursor == 'a');

	++cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 6);
	CHECK(*cursor == '?');

	++cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 12);
	CHECK(*cursor == 'b');

	++cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 13);
	CHECK(*cursor == '?');

	/* One more step reaches the end of the data. */
	++cursor;
	CHECK(cursor.GetByteOffset() == 14);
	CHECK(first < cursor);
	CHECK(cursor == last);

	/* Backward pass must visit the very same stops in reverse order. */
	--cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 13);
	CHECK(*cursor == '?');

	--cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 12);
	CHECK(*cursor == 'b');

	--cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 6);
	CHECK(*cursor == '?');

	--cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 5);
	CHECK(*cursor == 'a');

	--cursor;
	CHECK(cursor == first);
	CHECK(cursor.GetByteOffset() == 0);
	CHECK(*cursor == 0x1234);
}
|
||||
|
||||
TEST_CASE("Utf8View - iterate")
{
	/*
	 * Expected stops while iterating: U+1234 at offset 0, 'a' at 3, an embedded NUL at 4, 'b' at 5
	 * and U+12345 at 6, with the end of the data at 10. The "sv" suffix keeps the embedded NUL in the view.
	 */
	Utf8View view("\u1234a\0b\U00012345"sv);
	const auto first = view.begin();
	const auto last = view.end();
	CHECK(first < last);

	/* Forward pass; every byte of a sequence must map back to the start of that sequence. */
	auto cursor = first;
	CHECK(cursor == first);
	CHECK(cursor.GetByteOffset() == 0);
	CHECK(*cursor == 0x1234);
	for (size_t byte = 0; byte <= 2; ++byte) {
		CHECK(cursor == view.GetIterAtByte(byte));
	}

	++cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 3);
	CHECK(*cursor == 'a');
	CHECK(cursor == view.GetIterAtByte(3));

	++cursor;
	CHECK(cursor.GetByteOffset() == 4);
	CHECK(*cursor == 0);
	CHECK(cursor == view.GetIterAtByte(4));

	++cursor;
	CHECK(cursor.GetByteOffset() == 5);
	CHECK(*cursor == 'b');
	CHECK(cursor == view.GetIterAtByte(5));

	++cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 6);
	CHECK(*cursor == 0x00012345);
	for (size_t byte = 6; byte <= 9; ++byte) {
		CHECK(cursor == view.GetIterAtByte(byte));
	}

	/* One more step reaches the end of the data. */
	++cursor;
	CHECK(first < cursor);
	CHECK(cursor.GetByteOffset() == 10);
	CHECK(cursor == last);
	CHECK(cursor == view.GetIterAtByte(10));

	/* Stepping back from the end lands on the starts of the preceding codepoints. */
	--cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 6);
	CHECK(*cursor == 0x00012345);

	--cursor;
	CHECK(first < cursor);
	CHECK(cursor < last);
	CHECK(cursor.GetByteOffset() == 5);
	CHECK(*cursor == 'b');
}
|
Loading…
Reference in New Issue