Codechange: Use Utf8View::iterator in StringIterator.

2025-04-01 14:58:16 +02:00 · 2025-04-01 14:58:16 +02:00 · 83401ad5e2
parent b19e43ae99
commit 83401ad5e2
7 changed files with 53 additions and 79 deletions
--- a/src/os/macosx/string_osx.cpp
+++ b/src/os/macosx/string_osx.cpp
@ -11,6 +11,7 @@
 #include "string_osx.h"
 #include "../../string_func.h"
 #include "../../strings_func.h"
 #include "../../core/utf8.hpp"
 #include "../../table/control_codes.h"
 #include "../../fontcache.h"
 #include "../../zoom_func.h"
@ -368,10 +369,8 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
 }
-/* virtual */ void OSXStringIterator::SetString(const char *s)
+/* virtual */ void OSXStringIterator::SetString(std::string_view s)
 {
 	const char *string_base = s;
 	this->utf16_to_utf8.clear();
 	this->str_info.clear();
 	this->cur_pos = 0;
@ -379,10 +378,10 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
 	/* CoreText operates on UTF-16, thus we have to convert the input string.
 	 * To be able to return proper offsets, we have to create a mapping at the same time. */
 	std::vector<UniChar> utf16_str;     ///< UTF-16 copy of the string.
-	while (*s != '\0') {
+	Utf8View view(s);
-		size_t idx = s - string_base;
+	for (auto it = view.begin(), end = view.end(); it != end; ++it) {
-
+		size_t idx = it.GetByteOffset();
-		char32_t c = Utf8Consume(&s);
+		char32_t c = *it;
 		if (c < 0x10000) {
 			utf16_str.push_back((UniChar)c);
 		} else {
@ -393,7 +392,7 @@ int MacOSStringContains(const std::string_view str, const std::string_view value
 		}
 		this->utf16_to_utf8.push_back(idx);
 	}
-	this->utf16_to_utf8.push_back(s - string_base);
+	this->utf16_to_utf8.push_back(s.size());
 	/* Query CoreText for word and cluster break information. */
 	this->str_info.resize(utf16_to_utf8.size());
--- a/src/os/macosx/string_osx.h
+++ b/src/os/macosx/string_osx.h
@ -27,7 +27,7 @@ class OSXStringIterator : public StringIterator {
 	size_t cur_pos; ///< Current iteration position.
 public:
-	void SetString(const char *s) override;
+	void SetString(std::string_view s) override;
 	size_t SetCurPosition(size_t pos) override;
 	size_t Next(IterType what) override;
 	size_t Prev(IterType what) override;
--- a/src/os/windows/string_uniscribe.cpp
+++ b/src/os/windows/string_uniscribe.cpp
@ -13,6 +13,7 @@
 #include "../../language.h"
 #include "../../strings_func.h"
 #include "../../string_func.h"
 #include "../../core/utf8.hpp"
 #include "../../table/control_codes.h"
 #include "../../zoom_func.h"
 #include "win32.h"
@ -516,10 +517,8 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
 }
-/* virtual */ void UniscribeStringIterator::SetString(const char *s)
+/* virtual */ void UniscribeStringIterator::SetString(std::string_view s)
 {
 	const char *string_base = s;
 	this->utf16_to_utf8.clear();
 	this->str_info.clear();
 	this->cur_pos = 0;
@ -527,10 +526,10 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
 	/* Uniscribe operates on UTF-16, thus we have to convert the input string.
 	 * To be able to return proper offsets, we have to create a mapping at the same time. */
 	std::vector<wchar_t> utf16_str;     ///< UTF-16 copy of the string.
-	while (*s != '\0') {
+	Utf8View view(s);
-		size_t idx = s - string_base;
+	for (auto it = view.begin(), end = view.end(); it != end; ++it) {
-
+		size_t idx = it.GetByteOffset();
-		char32_t c = Utf8Consume(&s);
+		char32_t c = *it;
 		if (c < 0x10000) {
 			utf16_str.push_back((wchar_t)c);
 		} else {
@ -541,7 +540,7 @@ std::span<const int> UniscribeParagraphLayout::UniscribeVisualRun::GetGlyphToCha
 		}
 		this->utf16_to_utf8.push_back(idx);
 	}
-	this->utf16_to_utf8.push_back(s - string_base);
+	this->utf16_to_utf8.push_back(s.size());
 	/* Query Uniscribe for word and cluster break information. */
 	this->str_info.resize(utf16_to_utf8.size());
--- a/src/os/windows/string_uniscribe.h
+++ b/src/os/windows/string_uniscribe.h
@ -77,7 +77,7 @@ class UniscribeStringIterator : public StringIterator {
 	size_t cur_pos; ///< Current iteration position.
 public:
-	void SetString(const char *s) override;
+	void SetString(std::string_view s) override;
 	size_t SetCurPosition(size_t pos) override;
 	size_t Next(IterType what) override;
 	size_t Prev(IterType what) override;
--- a/src/string.cpp
+++ b/src/string.cpp
@ -13,6 +13,7 @@
 #include "error_func.h"
 #include "string_func.h"
 #include "string_base.h"
 #include "core/utf8.hpp"
 #include "table/control_codes.h"
@ -826,10 +827,8 @@ public:
 		delete this->word_itr;
 	}
-	void SetString(const char *s) override
+	void SetString(std::string_view s) override
 	{
 		const char *string_base = s;
 		/* Unfortunately current ICU versions only provide rudimentary support
 		 * for word break iterators (especially for CJK languages) in combination
 		 * with UTF-8 input. As a work around we have to convert the input to
@ -837,10 +836,10 @@ public:
 		this->utf16_str.clear();
 		this->utf16_to_utf8.clear();
-		while (*s != '\0') {
+		Utf8View view(s);
-			size_t idx = s - string_base;
+		for (auto it = view.begin(), end = view.end(); it != end; ++it) {
-
+			size_t idx = it.GetByteOffset();
-			char32_t c = Utf8Consume(&s);
+			char32_t c = *it;
 			if (c < 0x10000) {
 				this->utf16_str.push_back((UChar)c);
 			} else {
@ -852,7 +851,7 @@ public:
 			this->utf16_to_utf8.push_back(idx);
 		}
 		this->utf16_str.push_back('\0');
-		this->utf16_to_utf8.push_back(s - string_base);
+		this->utf16_to_utf8.push_back(s.size());
 		UText text = UTEXT_INITIALIZER;
 		UErrorCode status = U_ZERO_ERROR;
@ -956,60 +955,43 @@ public:
 /** Fallback simple string iterator. */
 class DefaultStringIterator : public StringIterator
 {
-	const char *string; ///< Current string.
+	Utf8View string; ///< Current string.
-	size_t len;         ///< String length.
+	Utf8View::iterator cur_pos; ///< Current iteration position.
 	size_t cur_pos;     ///< Current iteration position.
 public:
-	DefaultStringIterator() : string(nullptr), len(0), cur_pos(0)
+	void SetString(std::string_view s) override
 	{
 	}
 	void SetString(const char *s) override
 	{
 		this->string = s;
-		this->len = strlen(s);
+		this->cur_pos = this->string.begin();
 		this->cur_pos = 0;
 	}
 	size_t SetCurPosition(size_t pos) override
 	{
-		assert(this->string != nullptr && pos <= this->len);
+		this->cur_pos = this->string.GetIterAtByte(pos);
-		/* Sanitize in case we get a position inside an UTF-8 sequence. */
+		return this->cur_pos.GetByteOffset();
 		while (pos > 0 && IsUtf8Part(this->string[pos])) pos--;
 		return this->cur_pos = pos;
 	}
 	size_t Next(IterType what) override
 	{
-		assert(this->string != nullptr);
+		const auto end = this->string.end();
 		/* Already at the end? */
-		if (this->cur_pos >= this->len) return END;
+		if (this->cur_pos >= end) return END;
 		switch (what) {
-			case ITER_CHARACTER: {
+			case ITER_CHARACTER:
-				char32_t c;
+				++this->cur_pos;
-				this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
+				return this->cur_pos.GetByteOffset();
 				return this->cur_pos;
 			}
-			case ITER_WORD: {
+			case ITER_WORD:
 				char32_t c;
 				/* Consume current word. */
-				size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
+				while (this->cur_pos != end && !IsWhitespace(*this->cur_pos)) {
-				while (this->cur_pos < this->len && !IsWhitespace(c)) {
+					++this->cur_pos;
 					this->cur_pos += offs;
 					offs = Utf8Decode(&c, this->string + this->cur_pos);
 				}
 				/* Consume whitespace to the next word. */
-				while (this->cur_pos < this->len && IsWhitespace(c)) {
+				while (this->cur_pos != end && IsWhitespace(*this->cur_pos)) {
-					this->cur_pos += offs;
+					++this->cur_pos;
 					offs = Utf8Decode(&c, this->string + this->cur_pos);
 				}
-
+				return this->cur_pos.GetByteOffset();
 				return this->cur_pos;
 			}
 			default:
 				NOT_REACHED();
@ -1020,33 +1002,27 @@ public:
 	size_t Prev(IterType what) override
 	{
-		assert(this->string != nullptr);
+		const auto begin = this->string.begin();
 		/* Already at the beginning? */
-		if (this->cur_pos == 0) return END;
+		if (this->cur_pos == begin) return END;
 		switch (what) {
 			case ITER_CHARACTER:
-				return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
+				--this->cur_pos;
 				return this->cur_pos.GetByteOffset();
-			case ITER_WORD: {
+			case ITER_WORD:
 				const char *s = this->string + this->cur_pos;
 				char32_t c;
 				/* Consume preceding whitespace. */
 				do {
-					s = Utf8PrevChar(s);
+					--this->cur_pos;
-					Utf8Decode(&c, s);
+				} while (this->cur_pos != begin && IsWhitespace(*this->cur_pos));
 				} while (s > this->string && IsWhitespace(c));
 				/* Consume preceding word. */
-				while (s > this->string && !IsWhitespace(c)) {
+				while (this->cur_pos != begin && !IsWhitespace(*this->cur_pos)) {
-					s = Utf8PrevChar(s);
+					--this->cur_pos;
 					Utf8Decode(&c, s);
 				}
 				/* Move caret back to the beginning of the word. */
-				if (IsWhitespace(c)) Utf8Consume(&s);
+				if (IsWhitespace(*this->cur_pos)) ++this->cur_pos;
-
+				return this->cur_pos.GetByteOffset();
 				return this->cur_pos = s - this->string;
 			}
 			default:
 				NOT_REACHED();
--- a/src/string_base.h
+++ b/src/string_base.h
@ -35,7 +35,7 @@ public:
 	 * changed. The cursor is reset to the start of the string.
 	 * @param s New string.
 	 */
-	virtual void SetString(const char *s) = 0;
+	virtual void SetString(std::string_view s) = 0;
 	/**
 	 * Change the current string cursor.
--- a/src/textbuf.cpp
+++ b/src/textbuf.cpp
@ -291,7 +291,7 @@ const char *Textbuf::GetText() const
 /** Update the character iter after the text has changed. */
 void Textbuf::UpdateStringIter()
 {
-	this->char_iter->SetString(this->buf.c_str());
+	this->char_iter->SetString(this->buf);
 	size_t pos = this->char_iter->SetCurPosition(this->caretpos);
 	this->caretpos = pos == StringIterator::END ? 0 : (uint16_t)pos;
 }