(svn r25653) -Add: Caret movement by words for CJK languages.

2013-08-05 20:35:31 +00:00 · 2013-08-05 20:35:31 +00:00 · 76367f6bf1
parent e7dc14b25a
commit 76367f6bf1
5 changed files with 220 additions and 123 deletions
--- a/src/string.cpp
+++ b/src/string.cpp
@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
 /** String iterator using ICU as a backend. */
 class IcuStringIterator : public StringIterator
 {
 	icu::BreakIterator *char_itr; ///< ICU iterator for characters.
 	icu::BreakIterator *word_itr; ///< ICU iterator for words.
 	const char *string;           ///< Iteration string in UTF-8.
 	SmallVector<UChar, 32> utf16_str;      ///< UTF-16 copy of the string.
 	SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code unit position to byte index in the UTF-8 source string.
 public:
 	/**
 	 * Create the ICU character and word break iterators for the current
 	 * language (falling back to "en" when no language is set) and seed both
 	 * buffers with a single terminating entry.
 	 */
-	IcuStringIterator() : char_itr(NULL)
+	IcuStringIterator() : char_itr(NULL), word_itr(NULL)
 	{
 		/* NOTE(review): status is never inspected after the create calls;
 		 * confirm whether a failed creation can leave an iterator NULL. */
 		UErrorCode status = U_ZERO_ERROR;
 		this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
 		this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
 		*this->utf16_str.Append() = '\0';
 		*this->utf16_to_utf8.Append() = 0;
 	}
 	/** Release both ICU break iterators. */
 	virtual ~IcuStringIterator()
 	{
 		delete this->char_itr;
 		delete this->word_itr;
 	}
 	/**
 	 * Set a new iteration string and reset both break iterators to its start.
 	 * @param s NUL-terminated UTF-8 string; only the pointer is stored, the text is not copied.
 	 */
 	virtual void SetString(const char *s)
 	{
 		this->string = s;
 		/* Unfortunately current ICU versions only provide rudimentary support
 		 * for word break iterators (especially for CJK languages) in combination
 		 * with UTF-8 input. As a work around we have to convert the input to
 		 * UTF-16 and create a mapping back to UTF-8 character indices. */
 		this->utf16_str.Clear();
 		this->utf16_to_utf8.Clear();
 		while (*s != '\0') {
 			/* Byte offset of the character that is about to be consumed. */
 			size_t idx = s - this->string;
 			WChar c = Utf8Consume(&s);
 			if (c <	0x10000) {
 				*this->utf16_str.Append() = (UChar)c;
 			} else {
 				/* Make a surrogate pair. */
 				*this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
 				*this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
 				/* Extra mapping entry so both halves of the pair map to the same UTF-8 index. */
 				*this->utf16_to_utf8.Append() = idx;
 			}
 			*this->utf16_to_utf8.Append() = idx;
 		}
 		/* Terminate the UTF-16 copy and map its end position to the UTF-8 length. */
 		*this->utf16_str.Append() = '\0';
 		*this->utf16_to_utf8.Append() = s - this->string;
 		/* NOTE(review): text is not handed to utext_close() in this function;
 		 * confirm whether that is required for a stack-initialised UText. */
 		UText text = UTEXT_INITIALIZER;
 		UErrorCode status = U_ZERO_ERROR;
-		utext_openUTF8(&text, s, -1, &status);
+		utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status);
 		this->char_itr->setText(&text, status);
 		this->word_itr->setText(&text, status);
 		this->char_itr->first();
 		this->word_itr->first();
 	}
 	/**
 	 * Move the character iterator to the given position.
 	 * @param pos Byte index into the UTF-8 string.
 	 * @return Byte index the character iterator ended up at.
 	 */
 	virtual size_t SetCurPosition(size_t pos)
 	{
 		/* Convert incoming position to an UTF-16 string index.
 		 * If no mapping entry equals pos, the index stays 0. */
 		uint utf16_pos = 0;
 		for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
 			if (this->utf16_to_utf8[i] == pos) {
 				utf16_pos = i;
 				break;
 			}
 		}
 		/* isBoundary has the documented side-effect of setting the current
 		 * position to the first valid boundary equal to or greater than
 		 * the passed value. */
-		this->char_itr->isBoundary((int32_t)pos);
+		this->char_itr->isBoundary(utf16_pos);
-		return this->char_itr->current();
+		return this->utf16_to_utf8[this->char_itr->current()];
 	}
-	virtual size_t Next()
+	virtual size_t Next(IterType what)
 	{
-		int32_t pos = this->char_itr->next();
+		int32_t pos;
-		return pos == icu::BreakIterator::DONE ? END : pos;
+		switch (what) {
 			case ITER_CHARACTER:
 				pos = this->char_itr->next();
 				break;
 			case ITER_WORD:
 				/* Start the word search from wherever the character iterator currently is. */
 				pos = this->word_itr->following(this->char_itr->current());
 				/* The ICU word iterator considers both the start and the end of a word a valid
 				 * break point, but we only want word starts. Move to the next location in
 				 * case the new position points to whitespace. */
 				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
 				/* Bring the character iterator to the new word position.
 				 * NOTE(review): pos may be DONE here; confirm isBoundary tolerates that value. */
 				this->char_itr->isBoundary(pos);
 				break;
 			default:
 				NOT_REACHED();
 		}
-	virtual size_t Prev()
+		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
 	}
 	/**
 	 * Move the cursor back by one character or to the preceding word start.
 	 * @param what Unit to move by.
 	 * @return New byte index in the UTF-8 string, or END when ICU reports DONE.
 	 */
 	virtual size_t Prev(IterType what)
 	{
-		int32_t pos = this->char_itr->previous();
+		int32_t pos;
-		return pos == icu::BreakIterator::DONE ? END : pos;
+		switch (what) {
 			case ITER_CHARACTER:
 				pos = this->char_itr->previous();
 				break;
 			case ITER_WORD:
 				/* Start the word search from wherever the character iterator currently is. */
 				pos = this->word_itr->preceding(this->char_itr->current());
 				/* The ICU word iterator considers both the start and the end of a word a valid
 				 * break point, but we only want word starts. Move to the previous location in
 				 * case the new position points to whitespace. */
 				while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
 				/* Bring the character iterator to the new word position.
 				 * NOTE(review): pos may be DONE here; confirm isBoundary tolerates that value. */
 				this->char_itr->isBoundary(pos);
 				break;
 			default:
 				NOT_REACHED();
 		}
 		return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
 	}
 };
@ -742,26 +824,79 @@ public:
 		return this->cur_pos = pos;
 	}
-	virtual size_t Next()
+	virtual size_t Next(IterType what)
 	{
 		/* Advance the cursor by one character or to the start of the next
 		 * whitespace-delimited word, depending on 'what'. Returns the new byte
 		 * position, or END when the cursor is already at the end. */
 		assert(this->string != NULL);
 		/* Already at the end? */
 		if (this->cur_pos >= this->len) return END;
 		switch (what) {
 			case ITER_CHARACTER: {
 				WChar c;
 				/* Utf8Decode's return value is used as the byte length of the decoded character. */
 				this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
 				return this->cur_pos;
 			}
-	virtual size_t Prev()
+			case ITER_WORD: {
 				WChar c;
 				/* Consume current word. */
 				size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
 				while (this->cur_pos < this->len && !IsWhitespace(c)) {
 					this->cur_pos += offs;
 					/* Look ahead at the character under the new position. */
 					offs = Utf8Decode(&c, this->string + this->cur_pos);
 				}
 				/* Consume whitespace to the next word. */
 				while (this->cur_pos < this->len && IsWhitespace(c)) {
 					this->cur_pos += offs;
 					offs = Utf8Decode(&c, this->string + this->cur_pos);
 				}
 				return this->cur_pos;
 			}
 			default:
 				NOT_REACHED();
 		}
 		/* Only reached if NOT_REACHED() returns. */
 		return END;
 	}
 	/**
 	 * Move the cursor back by one character or to the start of the preceding
 	 * whitespace-delimited word.
 	 * @param what Unit to move by.
 	 * @return New cursor position (in bytes) or END if the cursor is already at the start of the string.
 	 */
 	virtual size_t Prev(IterType what)
 	{
 		assert(this->string != NULL);
 		/* Already at the beginning? */
 		if (this->cur_pos == 0) return END;
 		switch (what) {
 			case ITER_CHARACTER:
 				return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
 			case ITER_WORD: {
 				const char *s = this->string + this->cur_pos;
 				WChar c;
 				/* Consume preceding whitespace. */
 				do {
 					s = Utf8PrevChar(s);
 					Utf8Decode(&c, s);
 				} while (s > this->string && IsWhitespace(c));
 				/* The loop above only stops on whitespace when it has reached the
 				 * start of the string, i.e. nothing but whitespace precedes the
 				 * caret. There is no word to stop at, so go to the very beginning
 				 * instead of stepping forward again and getting stuck at byte 1. */
 				if (IsWhitespace(c)) return this->cur_pos = 0;
 				/* Consume preceding word. */
 				while (s > this->string && !IsWhitespace(c)) {
 					s = Utf8PrevChar(s);
 					Utf8Decode(&c, s);
 				}
 				/* Move caret back to the beginning of the word: if we stopped on the
 				 * whitespace in front of it, step over that one character again. */
 				if (IsWhitespace(c)) Utf8Consume(&s);
 				return this->cur_pos = s - this->string;
 			}
 			default:
 				NOT_REACHED();
 		}
 		/* Only reached if NOT_REACHED() returns. */
 		return END;
 	}
 };
--- a/src/string_base.h
+++ b/src/string_base.h
@ -15,6 +15,12 @@
 /** Class for iterating over different kind of parts of a string. */
 class StringIterator {
 public:
 	/** Type of the iterator. */
 	enum IterType {
 		ITER_CHARACTER, ///< Iterate over characters (or more exactly grapheme clusters).
 		ITER_WORD,      ///< Iterate over words.
 	};
 	/** Sentinel to indicate end-of-iteration. */
 	static const size_t END = SIZE_MAX;
@ -45,13 +51,13 @@ public:
 	 * Advance the cursor by one iteration unit.
 	 * @return New cursor position (in bytes) or #END if the cursor is already at the end of the string.
 	 */
-	virtual size_t Next() = 0;
+	virtual size_t Next(IterType what = ITER_CHARACTER) = 0;
 	/**
 	 * Move the cursor back by one iteration unit.
 	 * @return New cursor position (in bytes) or #END if the cursor is already at the start of the string.
 	 */
-	virtual size_t Prev() = 0;
+	virtual size_t Prev(IterType what = ITER_CHARACTER) = 0;
 protected:
 	StringIterator() {}
--- a/src/string_func.h
+++ b/src/string_func.h
@ -90,7 +90,6 @@ static inline WChar Utf8Consume(const char **s)
 	return c;
 }
 /**
 * Return the length of a UTF-8 encoded character.
 * @param c Unicode character.
@ -156,6 +155,51 @@ static inline const char *Utf8PrevChar(const char *s)
 size_t Utf8StringLength(const char *s);
 /**
 * Test whether a UTF-16 code unit is a lead (high) surrogate.
 * @param c The code unit to test.
 * @return True if \a c lies in the range 0xD800..0xDBFF.
 */
 static inline bool Utf16IsLeadSurrogate(uint c)
 {
 	/* All lead surrogates share the same bits above the low ten. */
 	return (c & ~0x3FFu) == 0xD800u;
 }
 /**
 * Test whether a UTF-16 code unit is a trail (low) surrogate.
 * @param c The code unit to test.
 * @return True if \a c lies in the range 0xDC00..0xDFFF.
 */
 static inline bool Utf16IsTrailSurrogate(uint c)
 {
 	/* All trail surrogates share the same bits above the low ten. */
 	return (c & ~0x3FFu) == 0xDC00u;
 }
 /**
 * Combine the two halves of an UTF-16 surrogate pair into one Unicode character.
 * @param lead Lead surrogate code unit.
 * @param trail Trail surrogate code unit.
 * @return Decoded Unicode character.
 */
 static inline WChar Utf16DecodeSurrogate(uint lead, uint trail)
 {
 	/* The lead surrogate supplies the upper ten bits, the trail surrogate the lower ten. */
 	const uint high_bits = (lead - 0xD800) << 10;
 	const uint low_bits = trail - 0xDC00;
 	return 0x10000 + (high_bits | low_bits);
 }
 /**
 * Decode a single character from UTF-16 input.
 * @param c Pointer to one or two UTF-16 code units; the second unit is only read when the first is a lead surrogate.
 * @return Decoded Unicode character.
 */
 static inline WChar Utf16DecodeChar(const uint16 *c)
 {
 	/* Anything that is not a lead surrogate is returned as it is. */
 	if (!Utf16IsLeadSurrogate(c[0])) return c[0];
 	return Utf16DecodeSurrogate(c[0], c[1]);
 }
 /**
 * Is the given character a text direction character.
 * @param c The character to test.
--- a/src/textbuf.cpp
+++ b/src/textbuf.cpp
@ -219,70 +219,12 @@ bool Textbuf::InsertClipboard()
 	return true;
 }
 /**
 * Tell whether there is room to move the caret to the left.
 * @return true unless the caret is at position 0.
 */
 bool Textbuf::CanMoveCaretLeft()
 {
 	const bool at_start = (this->caretpos == 0);
 	return !at_start;
 }
 /**
 * Move the caret one step to the left.
 * @pre Textbuf::CanMoveCaretLeft returns true.
 * @return The character under the caret after the move.
 */
 WChar Textbuf::MoveCaretLeft()
 {
 	assert(this->CanMoveCaretLeft());
 	/* An iterator that reports END is treated as "go to the very start". */
 	const size_t moved = this->char_iter->Prev();
 	this->caretpos = static_cast<uint16>(moved == StringIterator::END ? 0 : moved);
 	this->UpdateCaretPosition();
 	WChar under_caret;
 	Utf8Decode(&under_caret, this->buf + this->caretpos);
 	return under_caret;
 }
 /**
 * Tell whether there is room to move the caret to the right.
 * @return true while the caret is in front of the last byte of the buffer contents.
 */
 bool Textbuf::CanMoveCaretRight()
 {
 	return this->bytes - 1 > this->caretpos;
 }
 /**
 * Move the caret one step to the right.
 * @pre Textbuf::CanMoveCaretRight returns true.
 * @return The character under the caret after the move.
 */
 WChar Textbuf::MoveCaretRight()
 {
 	assert(this->CanMoveCaretRight());
 	/* An iterator that reports END is treated as "go to the last byte". */
 	const size_t moved = this->char_iter->Next();
 	this->caretpos = static_cast<uint16>(moved == StringIterator::END ? this->bytes - 1 : moved);
 	this->UpdateCaretPosition();
 	WChar under_caret;
 	Utf8Decode(&under_caret, this->buf + this->caretpos);
 	return under_caret;
 }
 /**
 * Update the character iter after the text has changed.
 * Hands the buffer to the iterator again and re-applies the caret position,
 * taking over whatever position the iterator reports back.
 */
 void Textbuf::UpdateStringIter()
 {
 	this->char_iter->SetString(this->buf);
-	this->caretpos = (uint16)this->char_iter->SetCurPosition(this->caretpos);
+	size_t pos = this->char_iter->SetCurPosition(this->caretpos);
 	/* An END result from the iterator resets the caret to the start of the buffer. */
 	this->caretpos = pos == StringIterator::END ? 0 : (uint16)pos;
 }
 /** Update pixel width of the text. */
@ -307,64 +249,38 @@ bool Textbuf::MovePos(uint16 keycode)
 {
 	switch (keycode) {
 		case WKC_LEFT:
 			if (this->CanMoveCaretLeft()) {
 				this->MoveCaretLeft();
 				return true;
 			}
 			break;
 		case WKC_CTRL | WKC_LEFT: {
-			if (!this->CanMoveCaretLeft()) break;
+			if (this->caretpos == 0) break;
-			/* Unconditionally move one char to the left. */
+			size_t pos = this->char_iter->Prev(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
-			WChar c = this->MoveCaretLeft();
+			if (pos == StringIterator::END) return true;
-			/* Consume left whitespaces. */
+
-			while (IsWhitespace(c)) {
+			this->caretpos = (uint16)pos;
-				if (!this->CanMoveCaretLeft()) return true;
+			this->UpdateCaretPosition();
 				c = this->MoveCaretLeft();
 			}
 			/* Consume left word. */
 			while (!IsWhitespace(c)) {
 				if (!this->CanMoveCaretLeft()) return true;
 				c = this->MoveCaretLeft();
 			}
 			/* Place caret at the beginning of the left word. */
 			this->MoveCaretRight();
 			return true;
 		}
 		case WKC_RIGHT:
 			if (this->CanMoveCaretRight()) {
 				this->MoveCaretRight();
 				return true;
 			}
 			break;
 		case WKC_CTRL | WKC_RIGHT: {
-			if (!this->CanMoveCaretRight()) break;
+			if (this->caretpos >= this->bytes - 1) break;
-			/* Unconditionally move one char to the right. */
+			size_t pos = this->char_iter->Next(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
-			WChar c = this->MoveCaretRight();
+			if (pos == StringIterator::END) return true;
-			/* Continue to consume current word. */
+
-			while (!IsWhitespace(c)) {
+			this->caretpos = (uint16)pos;
-				if (!this->CanMoveCaretRight()) return true;
+			this->UpdateCaretPosition();
 				c = this->MoveCaretRight();
 			}
 			/* Consume right whitespaces. */
 			while (IsWhitespace(c)) {
 				if (!this->CanMoveCaretRight()) return true;
 				c = this->MoveCaretRight();
 			}
 			return true;
 		}
 		case WKC_HOME:
 			this->caretpos = 0;
 			this->char_iter->SetCurPosition(this->caretpos);
 			this->UpdateCaretPosition();
 			return true;
 		case WKC_END:
 			this->caretpos = this->bytes - 1;
 			this->char_iter->SetCurPosition(this->caretpos);
 			this->UpdateCaretPosition();
 			return true;
--- a/src/textbuf_type.h
+++ b/src/textbuf_type.h
@ -67,10 +67,6 @@ private:
 	bool CanDelChar(bool backspace);
 	WChar GetNextDelChar(bool backspace);
 	void DelChar(bool backspace);
 	bool CanMoveCaretLeft();
 	WChar MoveCaretLeft();
 	bool CanMoveCaretRight();
 	WChar MoveCaretRight();
 	void UpdateStringIter();
 	void UpdateWidth();