Codechange: Use Utf8View::iterator in StringIterator.

2025-08-29 17:39:09 +00:00 · 2025-04-01 14:58:16 +02:00
parent b19e43ae99
commit 83401ad5e2
7 changed files with 53 additions and 79 deletions
--- a/src/os/macosx/string_osx.cpp
+++ b/src/os/macosx/string_osx.cpp
@@ -11,6 +11,7 @@
 #include "string_osx.h"
 #include "../../string_func.h"
 #include "../../strings_func.h"
+#include "../../core/utf8.hpp"
 #include "../../table/control_codes.h"
 #include "../../fontcache.h"
 #include "../../zoom_func.h"
@@ -368,10 +369,8 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
 }


-/* virtual */ void OSXStringIterator::SetString(const char *s)
+/* virtual */ void OSXStringIterator::SetString(std::string_view s)
 {
-	const char *string_base = s;
-
 	this->utf16_to_utf8.clear();
 	this->str_info.clear();
 	this->cur_pos = 0;
@@ -379,10 +378,10 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
 	/* CoreText operates on UTF-16, thus we have to convert the input string.
 	 * To be able to return proper offsets, we have to create a mapping at the same time. */
 	std::vector<UniChar> utf16_str;     ///< UTF-16 copy of the string.
-	while (*s != '\0') {
-		size_t idx = s - string_base;
-
-		char32_t c = Utf8Consume(&s);
+	Utf8View view(s);
+	for (auto it = view.begin(), end = view.end(); it != end; ++it) {
+		size_t idx = it.GetByteOffset();
+		char32_t c = *it;
 		if (c < 0x10000) {
 			utf16_str.push_back((UniChar)c);
 		} else {
@@ -393,7 +392,7 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
 		}
 		this->utf16_to_utf8.push_back(idx);
 	}
-	this->utf16_to_utf8.push_back(s - string_base);
+	this->utf16_to_utf8.push_back(s.size());

 	/* Query CoreText for word and cluster break information. */
 	this->str_info.resize(utf16_to_utf8.size());
--- a/src/os/macosx/string_osx.h
+++ b/src/os/macosx/string_osx.h
@@ -27,7 +27,7 @@ class OSXStringIterator : public StringIterator {
 	size_t cur_pos; ///< Current iteration position.

 public:
-	void SetString(const char *s) override;
+	void SetString(std::string_view s) override;
 	size_t SetCurPosition(size_t pos) override;
 	size_t Next(IterType what) override;
 	size_t Prev(IterType what) override;
--- a/src/os/windows/string_uniscribe.cpp
+++ b/src/os/windows/string_uniscribe.cpp
@@ -13,6 +13,7 @@
 #include "../../language.h"
 #include "../../strings_func.h"
 #include "../../string_func.h"
+#include "../../core/utf8.hpp"
 #include "../../table/control_codes.h"
 #include "../../zoom_func.h"
 #include "win32.h"
@@ -516,10 +517,8 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
 }


-/* virtual */ void UniscribeStringIterator::SetString(const char *s)
+/* virtual */ void UniscribeStringIterator::SetString(std::string_view s)
 {
-	const char *string_base = s;
-
 	this->utf16_to_utf8.clear();
 	this->str_info.clear();
 	this->cur_pos = 0;
@@ -527,10 +526,10 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
 	/* Uniscribe operates on UTF-16, thus we have to convert the input string.
 	 * To be able to return proper offsets, we have to create a mapping at the same time. */
 	std::vector<wchar_t> utf16_str;     ///< UTF-16 copy of the string.
-	while (*s != '\0') {
-		size_t idx = s - string_base;
-
-		char32_t c = Utf8Consume(&s);
+	Utf8View view(s);
+	for (auto it = view.begin(), end = view.end(); it != end; ++it) {
+		size_t idx = it.GetByteOffset();
+		char32_t c = *it;
 		if (c < 0x10000) {
 			utf16_str.push_back((wchar_t)c);
 		} else {
@@ -541,7 +540,7 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
 		}
 		this->utf16_to_utf8.push_back(idx);
 	}
-	this->utf16_to_utf8.push_back(s - string_base);
+	this->utf16_to_utf8.push_back(s.size());

 	/* Query Uniscribe for word and cluster break information. */
 	this->str_info.resize(utf16_to_utf8.size());
--- a/src/os/windows/string_uniscribe.h
+++ b/src/os/windows/string_uniscribe.h
@@ -77,7 +77,7 @@ class UniscribeStringIterator : public StringIterator {
 	size_t cur_pos; ///< Current iteration position.

 public:
-	void SetString(const char *s) override;
+	void SetString(std::string_view s) override;
 	size_t SetCurPosition(size_t pos) override;
 	size_t Next(IterType what) override;
 	size_t Prev(IterType what) override;
--- a/src/string.cpp
+++ b/src/string.cpp
@@ -13,6 +13,7 @@
 #include "error_func.h"
 #include "string_func.h"
 #include "string_base.h"
+#include "core/utf8.hpp"

 #include "table/control_codes.h"

@@ -826,10 +827,8 @@ public:
 		delete this->word_itr;
 	}

-	void SetString(const char *s) override
+	void SetString(std::string_view s) override
 	{
-		const char *string_base = s;
-
 		/* Unfortunately current ICU versions only provide rudimentary support
 		 * for word break iterators (especially for CJK languages) in combination
 		 * with UTF-8 input. As a work around we have to convert the input to
@@ -837,10 +836,10 @@ public:
 		this->utf16_str.clear();
 		this->utf16_to_utf8.clear();

-		while (*s != '\0') {
-			size_t idx = s - string_base;
-
-			char32_t c = Utf8Consume(&s);
+		Utf8View view(s);
+		for (auto it = view.begin(), end = view.end(); it != end; ++it) {
+			size_t idx = it.GetByteOffset();
+			char32_t c = *it;
 			if (c < 0x10000) {
 				this->utf16_str.push_back((UChar)c);
 			} else {
@@ -852,7 +851,7 @@ public:
 			this->utf16_to_utf8.push_back(idx);
 		}
 		this->utf16_str.push_back('\0');
-		this->utf16_to_utf8.push_back(s - string_base);
+		this->utf16_to_utf8.push_back(s.size());

 		UText text = UTEXT_INITIALIZER;
 		UErrorCode status = U_ZERO_ERROR;
@@ -956,60 +955,43 @@ public:
 /** Fallback simple string iterator. */
 class DefaultStringIterator : public StringIterator
 {
-	const char *string; ///< Current string.
-	size_t len;         ///< String length.
-	size_t cur_pos;     ///< Current iteration position.
+	Utf8View string; ///< Current string.
+	Utf8View::iterator cur_pos; ///< Current iteration position.

 public:
-	DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
-	{
-	}
-
-	void SetString(const char *s) override
+	void SetString(std::string_view s) override
 	{
 		this->string = s;
-		this->len = strlen(s);
-		this->cur_pos = 0;
+		this->cur_pos = this->string.begin();
 	}

 	size_t SetCurPosition(size_t pos) override
 	{
-		assert(this->string != nullptr && pos <= this->len);
-		/* Sanitize in case we get a position inside an UTF-8 sequence. */
-		while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
-		return this->cur_pos = pos;
+		this->cur_pos = this->string.GetIterAtByte(pos);
+		return this->cur_pos.GetByteOffset();
 	}

 	size_t Next(IterType what) override
 	{
-		assert(this->string != nullptr);
-
+		const auto end = this->string.end();
 		/* Already at the end? */
-		if (this->cur_pos >= this->len) return END;
+		if (this->cur_pos >= end) return END;

 		switch (what) {
-			case ITER_CHARACTER: {
-				char32_t c;
-				this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
-				return this->cur_pos;
-			}
+			case ITER_CHARACTER:
+				++this->cur_pos;
+				return this->cur_pos.GetByteOffset();

-			case ITER_WORD: {
-				char32_t c;
+			case ITER_WORD:
 				/* Consume current word. */
-				size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
-				while (this->cur_pos < this->len && !IsWhitespace(c)) {
-					this->cur_pos += offs;
-					offs = Utf8Decode(&c, this->string + this->cur_pos);
+				while (this->cur_pos != end && !IsWhitespace(*this->cur_pos)) {
+					++this->cur_pos;
 				}
 				/* Consume whitespace to the next word. */
-				while (this->cur_pos < this->len && IsWhitespace(c)) {
-					this->cur_pos += offs;
-					offs = Utf8Decode(&c, this->string + this->cur_pos);
+				while (this->cur_pos != end && IsWhitespace(*this->cur_pos)) {
+					++this->cur_pos;
 				}
-
-				return this->cur_pos;
-			}
+				return this->cur_pos.GetByteOffset();

 			default:
 				NOT_REACHED();
@@ -1020,33 +1002,27 @@ public:

 	size_t Prev(IterType what) override
 	{
-		assert(this->string != nullptr);
-
+		const auto begin = this->string.begin();
 		/* Already at the beginning? */
-		if (this->cur_pos == 0) return END;
+		if (this->cur_pos == begin) return END;

 		switch (what) {
 			case ITER_CHARACTER:
-				return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+				--this->cur_pos;
+				return this->cur_pos.GetByteOffset();

-			case ITER_WORD: {
-				const char *s = this->string + this->cur_pos;
-				char32_t c;
+			case ITER_WORD:
 				/* Consume preceding whitespace. */
 				do {
-					s = Utf8PrevChar(s);
-					Utf8Decode(&c, s);
-				} while (s > this->string && IsWhitespace(c));
+					--this->cur_pos;
+				} while (this->cur_pos != begin && IsWhitespace(*this->cur_pos));
 				/* Consume preceding word. */
-				while (s > this->string && !IsWhitespace(c)) {
-					s = Utf8PrevChar(s);
-					Utf8Decode(&c, s);
+				while (this->cur_pos != begin && !IsWhitespace(*this->cur_pos)) {
+					--this->cur_pos;
 				}
 				/* Move caret back to the beginning of the word. */
-				if (IsWhitespace(c)) Utf8Consume(&s);
-
-				return this->cur_pos = s - this->string;
-			}
+				if (IsWhitespace(*this->cur_pos)) ++this->cur_pos;
+				return this->cur_pos.GetByteOffset();

 			default:
 				NOT_REACHED();
--- a/src/string_base.h
+++ b/src/string_base.h
@@ -35,7 +35,7 @@ public:
 	 * changed. The cursor is reset to the start of the string.
 	 * @param s New string.
 	 */
-	virtual void SetString(const char *s) = 0;
+	virtual void SetString(std::string_view s) = 0;

 	/**
 	 * Change the current string cursor.
--- a/src/textbuf.cpp
+++ b/src/textbuf.cpp
@@ -291,7 +291,7 @@ const char *Textbuf::GetText() const
 /** Update the character iter after the text has changed. */
 void Textbuf::UpdateStringIter()
 {
-	this->char_iter->SetString(this->buf.c_str());
+	this->char_iter->SetString(this->buf);
 	size_t pos = this->char_iter->SetCurPosition(this->caretpos);
 	this->caretpos = pos == StringIterator::END ? 0 : (uint16_t)pos;
 }