1
0
Fork 0

Codechange: Use Utf8View::iterator in StringIterator.

pull/13960/head
frosch 2025-04-01 14:58:16 +02:00 committed by frosch
parent b19e43ae99
commit 83401ad5e2
7 changed files with 53 additions and 79 deletions

View File

@ -11,6 +11,7 @@
#include "string_osx.h"
#include "../../string_func.h"
#include "../../strings_func.h"
#include "../../core/utf8.hpp"
#include "../../table/control_codes.h"
#include "../../fontcache.h"
#include "../../zoom_func.h"
@ -368,10 +369,8 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
}
/* virtual */ void OSXStringIterator::SetString(const char *s)
/* virtual */ void OSXStringIterator::SetString(std::string_view s)
{
const char *string_base = s;
this->utf16_to_utf8.clear();
this->str_info.clear();
this->cur_pos = 0;
@ -379,10 +378,10 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
/* CoreText operates on UTF-16, thus we have to convert the input string.
* To be able to return proper offsets, we have to create a mapping at the same time. */
std::vector<UniChar> utf16_str; ///< UTF-16 copy of the string.
while (*s != '\0') {
size_t idx = s - string_base;
char32_t c = Utf8Consume(&s);
Utf8View view(s);
for (auto it = view.begin(), end = view.end(); it != end; ++it) {
size_t idx = it.GetByteOffset();
char32_t c = *it;
if (c < 0x10000) {
utf16_str.push_back((UniChar)c);
} else {
@ -393,7 +392,7 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
}
this->utf16_to_utf8.push_back(idx);
}
this->utf16_to_utf8.push_back(s - string_base);
this->utf16_to_utf8.push_back(s.size());
/* Query CoreText for word and cluster break information. */
this->str_info.resize(utf16_to_utf8.size());

View File

@ -27,7 +27,7 @@ class OSXStringIterator : public StringIterator {
size_t cur_pos; ///< Current iteration position.
public:
void SetString(const char *s) override;
void SetString(std::string_view s) override;
size_t SetCurPosition(size_t pos) override;
size_t Next(IterType what) override;
size_t Prev(IterType what) override;

View File

@ -13,6 +13,7 @@
#include "../../language.h"
#include "../../strings_func.h"
#include "../../string_func.h"
#include "../../core/utf8.hpp"
#include "../../table/control_codes.h"
#include "../../zoom_func.h"
#include "win32.h"
@ -516,10 +517,8 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
}
/* virtual */ void UniscribeStringIterator::SetString(const char *s)
/* virtual */ void UniscribeStringIterator::SetString(std::string_view s)
{
const char *string_base = s;
this->utf16_to_utf8.clear();
this->str_info.clear();
this->cur_pos = 0;
@ -527,10 +526,10 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
/* Uniscribe operates on UTF-16, thus we have to convert the input string.
* To be able to return proper offsets, we have to create a mapping at the same time. */
std::vector<wchar_t> utf16_str; ///< UTF-16 copy of the string.
while (*s != '\0') {
size_t idx = s - string_base;
char32_t c = Utf8Consume(&s);
Utf8View view(s);
for (auto it = view.begin(), end = view.end(); it != end; ++it) {
size_t idx = it.GetByteOffset();
char32_t c = *it;
if (c < 0x10000) {
utf16_str.push_back((wchar_t)c);
} else {
@ -541,7 +540,7 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
}
this->utf16_to_utf8.push_back(idx);
}
this->utf16_to_utf8.push_back(s - string_base);
this->utf16_to_utf8.push_back(s.size());
/* Query Uniscribe for word and cluster break information. */
this->str_info.resize(utf16_to_utf8.size());

View File

@ -77,7 +77,7 @@ class UniscribeStringIterator : public StringIterator {
size_t cur_pos; ///< Current iteration position.
public:
void SetString(const char *s) override;
void SetString(std::string_view s) override;
size_t SetCurPosition(size_t pos) override;
size_t Next(IterType what) override;
size_t Prev(IterType what) override;

View File

@ -13,6 +13,7 @@
#include "error_func.h"
#include "string_func.h"
#include "string_base.h"
#include "core/utf8.hpp"
#include "table/control_codes.h"
@ -826,10 +827,8 @@ public:
delete this->word_itr;
}
void SetString(const char *s) override
void SetString(std::string_view s) override
{
const char *string_base = s;
/* Unfortunately current ICU versions only provide rudimentary support
* for word break iterators (especially for CJK languages) in combination
* with UTF-8 input. As a work around we have to convert the input to
@ -837,10 +836,10 @@ public:
this->utf16_str.clear();
this->utf16_to_utf8.clear();
while (*s != '\0') {
size_t idx = s - string_base;
char32_t c = Utf8Consume(&s);
Utf8View view(s);
for (auto it = view.begin(), end = view.end(); it != end; ++it) {
size_t idx = it.GetByteOffset();
char32_t c = *it;
if (c < 0x10000) {
this->utf16_str.push_back((UChar)c);
} else {
@ -852,7 +851,7 @@ public:
this->utf16_to_utf8.push_back(idx);
}
this->utf16_str.push_back('\0');
this->utf16_to_utf8.push_back(s - string_base);
this->utf16_to_utf8.push_back(s.size());
UText text = UTEXT_INITIALIZER;
UErrorCode status = U_ZERO_ERROR;
@ -956,60 +955,43 @@ public:
/** Fallback simple string iterator. */
class DefaultStringIterator : public StringIterator
{
const char *string; ///< Current string.
size_t len; ///< String length.
size_t cur_pos; ///< Current iteration position.
Utf8View string; ///< Current string.
Utf8View::iterator cur_pos; ///< Current iteration position.
public:
DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
{
}
void SetString(const char *s) override
void SetString(std::string_view s) override
{
this->string = s;
this->len = strlen(s);
this->cur_pos = 0;
this->cur_pos = this->string.begin();
}
size_t SetCurPosition(size_t pos) override
{
assert(this->string != nullptr && pos <= this->len);
/* Sanitize in case we get a position inside an UTF-8 sequence. */
while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
return this->cur_pos = pos;
this->cur_pos = this->string.GetIterAtByte(pos);
return this->cur_pos.GetByteOffset();
}
size_t Next(IterType what) override
{
assert(this->string != nullptr);
const auto end = this->string.end();
/* Already at the end? */
if (this->cur_pos >= this->len) return END;
if (this->cur_pos >= end) return END;
switch (what) {
case ITER_CHARACTER: {
char32_t c;
this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
return this->cur_pos;
}
case ITER_CHARACTER:
++this->cur_pos;
return this->cur_pos.GetByteOffset();
case ITER_WORD: {
char32_t c;
case ITER_WORD:
/* Consume current word. */
size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos < this->len && !IsWhitespace(c)) {
this->cur_pos += offs;
offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos != end && !IsWhitespace(*this->cur_pos)) {
++this->cur_pos;
}
/* Consume whitespace to the next word. */
while (this->cur_pos < this->len && IsWhitespace(c)) {
this->cur_pos += offs;
offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos != end && IsWhitespace(*this->cur_pos)) {
++this->cur_pos;
}
return this->cur_pos;
}
return this->cur_pos.GetByteOffset();
default:
NOT_REACHED();
@ -1020,33 +1002,27 @@ public:
size_t Prev(IterType what) override
{
assert(this->string != nullptr);
const auto begin = this->string.begin();
/* Already at the beginning? */
if (this->cur_pos == 0) return END;
if (this->cur_pos == begin) return END;
switch (what) {
case ITER_CHARACTER:
return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
--this->cur_pos;
return this->cur_pos.GetByteOffset();
case ITER_WORD: {
const char *s = this->string + this->cur_pos;
char32_t c;
case ITER_WORD:
/* Consume preceding whitespace. */
do {
s = Utf8PrevChar(s);
Utf8Decode(&c, s);
} while (s > this->string && IsWhitespace(c));
--this->cur_pos;
} while (this->cur_pos != begin && IsWhitespace(*this->cur_pos));
/* Consume preceding word. */
while (s > this->string && !IsWhitespace(c)) {
s = Utf8PrevChar(s);
Utf8Decode(&c, s);
while (this->cur_pos != begin && !IsWhitespace(*this->cur_pos)) {
--this->cur_pos;
}
/* Move caret back to the beginning of the word. */
if (IsWhitespace(c)) Utf8Consume(&s);
return this->cur_pos = s - this->string;
}
if (IsWhitespace(*this->cur_pos)) ++this->cur_pos;
return this->cur_pos.GetByteOffset();
default:
NOT_REACHED();

View File

@ -35,7 +35,7 @@ public:
* changed. The cursor is reset to the start of the string.
* @param s New string.
*/
virtual void SetString(const char *s) = 0;
virtual void SetString(std::string_view s) = 0;
/**
* Change the current string cursor.

View File

@ -291,7 +291,7 @@ const char *Textbuf::GetText() const
/**
 * Update the character iter after the text has changed.
 * The current buffer is handed to the iterator again, and the caret is then
 * re-applied through the iterator so that it ends up on a position the iterator accepts.
 */
void Textbuf::UpdateStringIter()
{
this->char_iter->SetString(this->buf.c_str());
this->char_iter->SetString(this->buf);
/* Use the position reported back by the iterator, not the one that was passed in. */
size_t pos = this->char_iter->SetCurPosition(this->caretpos);
/* When the iterator reports END, place the caret at the start of the text. */
this->caretpos = pos == StringIterator::END ? 0 : (uint16_t)pos;
}