1
0
Fork 0

(svn r25653) -Add: Caret movement by words for CJK languages.

release/1.4
michi_cc 2013-08-05 20:35:31 +00:00
parent e7dc14b25a
commit 76367f6bf1
5 changed files with 220 additions and 123 deletions

View File

@ -661,50 +661,132 @@ int strnatcmp(const char *s1, const char *s2, bool ignore_garbage_at_front)
class IcuStringIterator : public StringIterator class IcuStringIterator : public StringIterator
{ {
icu::BreakIterator *char_itr; ///< ICU iterator for characters. icu::BreakIterator *char_itr; ///< ICU iterator for characters.
icu::BreakIterator *word_itr; ///< ICU iterator for words.
const char *string; ///< Iteration string in UTF-8. const char *string; ///< Iteration string in UTF-8.
SmallVector<UChar, 32> utf16_str; ///< UTF-16 copy of the string.
SmallVector<size_t, 32> utf16_to_utf8; ///< Mapping from UTF-16 code point position to index in the UTF-8 source string.
public: public:
IcuStringIterator() : char_itr(NULL) IcuStringIterator() : char_itr(NULL), word_itr(NULL)
{ {
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status); this->char_itr = icu::BreakIterator::createCharacterInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
this->word_itr = icu::BreakIterator::createWordInstance(icu::Locale(_current_language != NULL ? _current_language->isocode : "en"), status);
*this->utf16_str.Append() = '\0';
*this->utf16_to_utf8.Append() = 0;
} }
virtual ~IcuStringIterator() virtual ~IcuStringIterator()
{ {
delete this->char_itr; delete this->char_itr;
delete this->word_itr;
} }
virtual void SetString(const char *s) virtual void SetString(const char *s)
{ {
this->string = s; this->string = s;
/* Unfortunately current ICU versions only provide rudimentary support
* for word break iterators (especially for CJK languages) in combination
* with UTF-8 input. As a work around we have to convert the input to
* UTF-16 and create a mapping back to UTF-8 character indices. */
this->utf16_str.Clear();
this->utf16_to_utf8.Clear();
while (*s != '\0') {
size_t idx = s - this->string;
WChar c = Utf8Consume(&s);
if (c < 0x10000) {
*this->utf16_str.Append() = (UChar)c;
} else {
/* Make a surrogate pair. */
*this->utf16_str.Append() = (UChar)(0xD800 + ((c - 0x10000) >> 10));
*this->utf16_str.Append() = (UChar)(0xDC00 + ((c - 0x10000) & 0x3FF));
*this->utf16_to_utf8.Append() = idx;
}
*this->utf16_to_utf8.Append() = idx;
}
*this->utf16_str.Append() = '\0';
*this->utf16_to_utf8.Append() = s - this->string;
UText text = UTEXT_INITIALIZER; UText text = UTEXT_INITIALIZER;
UErrorCode status = U_ZERO_ERROR; UErrorCode status = U_ZERO_ERROR;
utext_openUTF8(&text, s, -1, &status); utext_openUChars(&text, this->utf16_str.Begin(), this->utf16_str.Length() - 1, &status);
this->char_itr->setText(&text, status); this->char_itr->setText(&text, status);
this->word_itr->setText(&text, status);
this->char_itr->first(); this->char_itr->first();
this->word_itr->first();
} }
virtual size_t SetCurPosition(size_t pos) virtual size_t SetCurPosition(size_t pos)
{ {
/* Convert incoming position to an UTF-16 string index. */
uint utf16_pos = 0;
for (uint i = 0; i < this->utf16_to_utf8.Length(); i++) {
if (this->utf16_to_utf8[i] == pos) {
utf16_pos = i;
break;
}
}
/* isBoundary has the documented side-effect of setting the current /* isBoundary has the documented side-effect of setting the current
* position to the first valid boundary equal to or greater than * position to the first valid boundary equal to or greater than
* the passed value. */ * the passed value. */
this->char_itr->isBoundary((int32_t)pos); this->char_itr->isBoundary(utf16_pos);
return this->char_itr->current(); return this->utf16_to_utf8[this->char_itr->current()];
} }
virtual size_t Next() virtual size_t Next(IterType what)
{ {
int32_t pos = this->char_itr->next(); int32_t pos;
return pos == icu::BreakIterator::DONE ? END : pos; switch (what) {
case ITER_CHARACTER:
pos = this->char_itr->next();
break;
case ITER_WORD:
pos = this->word_itr->following(this->char_itr->current());
/* The ICU word iterator considers both the start and the end of a word a valid
* break point, but we only want word starts. Move to the next location in
* case the new position points to whitespace. */
while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->next();
this->char_itr->isBoundary(pos);
break;
default:
NOT_REACHED();
} }
virtual size_t Prev() return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
}
virtual size_t Prev(IterType what)
{ {
int32_t pos = this->char_itr->previous(); int32_t pos;
return pos == icu::BreakIterator::DONE ? END : pos; switch (what) {
case ITER_CHARACTER:
pos = this->char_itr->previous();
break;
case ITER_WORD:
pos = this->word_itr->preceding(this->char_itr->current());
/* The ICU word iterator considers both the start and the end of a word a valid
* break point, but we only want word starts. Move to the previous location in
* case the new position points to whitespace. */
while (pos != icu::BreakIterator::DONE && IsWhitespace(Utf16DecodeChar((const uint16 *)&this->utf16_str[pos]))) pos = this->word_itr->previous();
this->char_itr->isBoundary(pos);
break;
default:
NOT_REACHED();
}
return pos == icu::BreakIterator::DONE ? END : this->utf16_to_utf8[pos];
} }
}; };
@ -742,26 +824,79 @@ public:
return this->cur_pos = pos; return this->cur_pos = pos;
} }
virtual size_t Next() virtual size_t Next(IterType what)
{ {
assert(this->string != NULL); assert(this->string != NULL);
/* Already at the end? */ /* Already at the end? */
if (this->cur_pos >= this->len) return END; if (this->cur_pos >= this->len) return END;
switch (what) {
case ITER_CHARACTER: {
WChar c; WChar c;
this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos); this->cur_pos += Utf8Decode(&c, this->string + this->cur_pos);
return this->cur_pos; return this->cur_pos;
} }
virtual size_t Prev() case ITER_WORD: {
WChar c;
/* Consume current word. */
size_t offs = Utf8Decode(&c, this->string + this->cur_pos);
while (this->cur_pos < this->len && !IsWhitespace(c)) {
this->cur_pos += offs;
offs = Utf8Decode(&c, this->string + this->cur_pos);
}
/* Consume whitespace to the next word. */
while (this->cur_pos < this->len && IsWhitespace(c)) {
this->cur_pos += offs;
offs = Utf8Decode(&c, this->string + this->cur_pos);
}
return this->cur_pos;
}
default:
NOT_REACHED();
}
return END;
}
virtual size_t Prev(IterType what)
{ {
assert(this->string != NULL); assert(this->string != NULL);
/* Already at the beginning? */ /* Already at the beginning? */
if (this->cur_pos == 0) return END; if (this->cur_pos == 0) return END;
switch (what) {
case ITER_CHARACTER:
return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string; return this->cur_pos = Utf8PrevChar(this->string + this->cur_pos) - this->string;
case ITER_WORD: {
const char *s = this->string + this->cur_pos;
WChar c;
/* Consume preceding whitespace. */
do {
s = Utf8PrevChar(s);
Utf8Decode(&c, s);
} while (s > this->string && IsWhitespace(c));
/* Consume preceding word. */
while (s > this->string && !IsWhitespace(c)) {
s = Utf8PrevChar(s);
Utf8Decode(&c, s);
}
/* Move caret back to the beginning of the word. */
if (IsWhitespace(c)) Utf8Consume(&s);
return this->cur_pos = s - this->string;
}
default:
NOT_REACHED();
}
return END;
} }
}; };

View File

@ -15,6 +15,12 @@
/** Class for iterating over different kind of parts of a string. */ /** Class for iterating over different kind of parts of a string. */
class StringIterator { class StringIterator {
public: public:
/** Type of the iterator. */
enum IterType {
ITER_CHARACTER, ///< Iterate over characters (or more exactly grapheme clusters).
ITER_WORD, ///< Iterate over words.
};
/** Sentinel to indicate end-of-iteration. */ /** Sentinel to indicate end-of-iteration. */
static const size_t END = SIZE_MAX; static const size_t END = SIZE_MAX;
@ -45,13 +51,13 @@ public:
* Advance the cursor by one iteration unit. * Advance the cursor by one iteration unit.
* @return New cursor position (in bytes) or #END if the cursor is already at the end of the string. * @return New cursor position (in bytes) or #END if the cursor is already at the end of the string.
*/ */
virtual size_t Next() = 0; virtual size_t Next(IterType what = ITER_CHARACTER) = 0;
/** /**
* Move the cursor back by one iteration unit. * Move the cursor back by one iteration unit.
* @return New cursor position (in bytes) or #END if the cursor is already at the start of the string. * @return New cursor position (in bytes) or #END if the cursor is already at the start of the string.
*/ */
virtual size_t Prev() = 0; virtual size_t Prev(IterType what = ITER_CHARACTER) = 0;
protected: protected:
StringIterator() {} StringIterator() {}

View File

@ -90,7 +90,6 @@ static inline WChar Utf8Consume(const char **s)
return c; return c;
} }
/** /**
* Return the length of a UTF-8 encoded character. * Return the length of a UTF-8 encoded character.
* @param c Unicode character. * @param c Unicode character.
@ -156,6 +155,51 @@ static inline const char *Utf8PrevChar(const char *s)
size_t Utf8StringLength(const char *s); size_t Utf8StringLength(const char *s);
/**
 * Test whether a UTF-16 code unit is a lead (high) surrogate.
 * @param c The code unit to test.
 * @return True if and only if \a c lies in the range 0xD800..0xDBFF.
 */
static inline bool Utf16IsLeadSurrogate(uint c)
{
	/* Every lead surrogate has the form 0xD800 + (10 payload bits), so
	 * masking away the low 10 bits must leave exactly 0xD800. */
	return (c & ~0x3FFu) == 0xD800u;
}
/**
 * Test whether a UTF-16 code unit is a trail (low) surrogate.
 * @param c The code unit to test.
 * @return True if and only if \a c lies in the range 0xDC00..0xDFFF.
 */
static inline bool Utf16IsTrailSurrogate(uint c)
{
	/* Every trail surrogate has the form 0xDC00 + (10 payload bits), so
	 * masking away the low 10 bits must leave exactly 0xDC00. */
	return (c & ~0x3FFu) == 0xDC00u;
}
/**
 * Combine a UTF-16 surrogate pair into the Unicode character it encodes.
 * No validation is performed; the result is only meaningful when \a lead
 * and \a trail really are a lead and a trail surrogate respectively.
 * @param lead Lead surrogate code unit.
 * @param trail Trail surrogate code unit.
 * @return Decoded Unicode character.
 */
static inline WChar Utf16DecodeSurrogate(uint lead, uint trail)
{
	/* The lead surrogate carries the upper 10 payload bits, the trail surrogate the lower 10. */
	const uint upper_bits = (lead - 0xD800) << 10;
	const uint lower_bits = trail - 0xDC00;

	/* Surrogate pairs encode the characters starting at U+10000. */
	return 0x10000 + (upper_bits | lower_bits);
}
/**
 * Decode a UTF-16 character.
 *
 * A lead surrogate that is directly followed by a trail surrogate is decoded
 * as a surrogate pair. Any other code unit, including an unpaired surrogate,
 * is returned unchanged.
 *
 * @param c Pointer to one or two UTF-16 code units. \a c[1] is only read when
 *          \a c[0] is a lead surrogate, so it must be readable in that case
 *          (e.g. because the buffer is NUL-terminated).
 * @return Decoded Unicode character.
 */
static inline WChar Utf16DecodeChar(const uint16 *c)
{
	/* Only combine the two code units when they really form a pair; feeding
	 * a non-trail value (like a terminating NUL) into the pair decoder would
	 * yield a bogus character. */
	if (Utf16IsLeadSurrogate(c[0]) && Utf16IsTrailSurrogate(c[1])) {
		return Utf16DecodeSurrogate(c[0], c[1]);
	}

	return c[0];
}
/** /**
* Is the given character a text direction character. * Is the given character a text direction character.
* @param c The character to test. * @param c The character to test.

View File

@ -219,70 +219,12 @@ bool Textbuf::InsertClipboard()
return true; return true;
} }
/**
* Checks if it is possible to move caret to the left
* @return true if the caret can be moved to the left, otherwise false.
*/
bool Textbuf::CanMoveCaretLeft()
{
return this->caretpos != 0;
}
/**
* Moves the caret to the left.
* @pre Ensure that Textbuf::CanMoveCaretLeft returns true
* @return The character under the caret.
*/
WChar Textbuf::MoveCaretLeft()
{
assert(this->CanMoveCaretLeft());
size_t pos = this->char_iter->Prev();
if (pos == StringIterator::END) pos = 0;
this->caretpos = (uint16)pos;
this->UpdateCaretPosition();
WChar c;
Utf8Decode(&c, this->buf + this->caretpos);
return c;
}
/**
* Checks if it is possible to move caret to the right
* @return true if the caret can be moved to the right, otherwise false.
*/
bool Textbuf::CanMoveCaretRight()
{
return this->caretpos < this->bytes - 1;
}
/**
* Moves the caret to the right.
* @pre Ensure that Textbuf::CanMoveCaretRight returns true
* @return The character under the caret.
*/
WChar Textbuf::MoveCaretRight()
{
assert(this->CanMoveCaretRight());
size_t pos = this->char_iter->Next();
if (pos == StringIterator::END) pos = this->bytes - 1;
this->caretpos = (uint16)pos;
this->UpdateCaretPosition();
WChar c;
Utf8Decode(&c, this->buf + this->caretpos);
return c;
}
/** Update the character iter after the text has changed. */ /** Update the character iter after the text has changed. */
void Textbuf::UpdateStringIter() void Textbuf::UpdateStringIter()
{ {
this->char_iter->SetString(this->buf); this->char_iter->SetString(this->buf);
this->caretpos = (uint16)this->char_iter->SetCurPosition(this->caretpos); size_t pos = this->char_iter->SetCurPosition(this->caretpos);
this->caretpos = pos == StringIterator::END ? 0 : (uint16)pos;
} }
/** Update pixel width of the text. */ /** Update pixel width of the text. */
@ -307,64 +249,38 @@ bool Textbuf::MovePos(uint16 keycode)
{ {
switch (keycode) { switch (keycode) {
case WKC_LEFT: case WKC_LEFT:
if (this->CanMoveCaretLeft()) {
this->MoveCaretLeft();
return true;
}
break;
case WKC_CTRL | WKC_LEFT: { case WKC_CTRL | WKC_LEFT: {
if (!this->CanMoveCaretLeft()) break; if (this->caretpos == 0) break;
/* Unconditionally move one char to the left. */ size_t pos = this->char_iter->Prev(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
WChar c = this->MoveCaretLeft(); if (pos == StringIterator::END) return true;
/* Consume left whitespaces. */
while (IsWhitespace(c)) { this->caretpos = (uint16)pos;
if (!this->CanMoveCaretLeft()) return true; this->UpdateCaretPosition();
c = this->MoveCaretLeft();
}
/* Consume left word. */
while (!IsWhitespace(c)) {
if (!this->CanMoveCaretLeft()) return true;
c = this->MoveCaretLeft();
}
/* Place caret at the beginning of the left word. */
this->MoveCaretRight();
return true; return true;
} }
case WKC_RIGHT: case WKC_RIGHT:
if (this->CanMoveCaretRight()) {
this->MoveCaretRight();
return true;
}
break;
case WKC_CTRL | WKC_RIGHT: { case WKC_CTRL | WKC_RIGHT: {
if (!this->CanMoveCaretRight()) break; if (this->caretpos >= this->bytes - 1) break;
/* Unconditionally move one char to the right. */ size_t pos = this->char_iter->Next(keycode & WKC_CTRL ? StringIterator::ITER_WORD : StringIterator::ITER_CHARACTER);
WChar c = this->MoveCaretRight(); if (pos == StringIterator::END) return true;
/* Continue to consume current word. */
while (!IsWhitespace(c)) { this->caretpos = (uint16)pos;
if (!this->CanMoveCaretRight()) return true; this->UpdateCaretPosition();
c = this->MoveCaretRight();
}
/* Consume right whitespaces. */
while (IsWhitespace(c)) {
if (!this->CanMoveCaretRight()) return true;
c = this->MoveCaretRight();
}
return true; return true;
} }
case WKC_HOME: case WKC_HOME:
this->caretpos = 0; this->caretpos = 0;
this->char_iter->SetCurPosition(this->caretpos);
this->UpdateCaretPosition(); this->UpdateCaretPosition();
return true; return true;
case WKC_END: case WKC_END:
this->caretpos = this->bytes - 1; this->caretpos = this->bytes - 1;
this->char_iter->SetCurPosition(this->caretpos);
this->UpdateCaretPosition(); this->UpdateCaretPosition();
return true; return true;

View File

@ -67,10 +67,6 @@ private:
bool CanDelChar(bool backspace); bool CanDelChar(bool backspace);
WChar GetNextDelChar(bool backspace); WChar GetNextDelChar(bool backspace);
void DelChar(bool backspace); void DelChar(bool backspace);
bool CanMoveCaretLeft();
WChar MoveCaretLeft();
bool CanMoveCaretRight();
WChar MoveCaretRight();
void UpdateStringIter(); void UpdateStringIter();
void UpdateWidth(); void UpdateWidth();