1
0
Fork 0

Fix bb8a0c7641: Skip control codes when sorting strings. (#13035)

Now that SkipGarbage no longer skips all multi-byte UTF-8 characters, string control codes are not skipped either. This caused unintended sort order when NewGRF names start with colour codes.

Make SkipGarbage UTF-8 aware so that it can skip some Unicode ranges as well.
pull/13037/head
Peter Nelson 2024-10-27 15:49:09 +00:00 committed by GitHub
parent 233ee16c44
commit 9cf47e69d6
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
1 changed files with 25 additions and 2 deletions

View File

@ -541,6 +541,22 @@ char *strcasestr(const char *haystack, const char *needle)
}
#endif /* DEFINE_STRCASESTR */
/**
 * Decide whether a decoded Unicode code point counts as 'garbage', i.e. should be
 * ignored at the start of a string.
 *
 * ASCII digits and letters are always kept. Code points inside the range delimited by
 * SCC_CONTROL_START and SCC_CONTROL_END are skipped. Any other code point from 0xC0 up
 * to 0x10FFFF is kept, and everything that remains is skipped.
 * @param c Code point to classify.
 * @returns true iff the code point should be skipped.
 */
static bool IsGarbageCharacter(char32_t c)
{
	/* Plain ASCII alphanumerics are never garbage. */
	const bool is_digit = c >= '0' && c <= '9';
	const bool is_upper = c >= 'A' && c <= 'Z';
	const bool is_lower = c >= 'a' && c <= 'z';
	if (is_digit || is_upper || is_lower) return false;

	/* This range must be tested before the generic high range below, which would otherwise keep it. */
	const bool is_control_code = c >= SCC_CONTROL_START && c <= SCC_CONTROL_END;
	if (is_control_code) return true;

	/* Remaining code points are only kept when they fall inside 0xC0..0x10FFFF. */
	const bool is_high_codepoint = c >= 0xC0 && c <= 0x10FFFF;
	return !is_high_codepoint;
}
/**
* Skip some of the 'garbage' in the string that we don't want to use
* to sort on. This way the alphabetical sorting will work better as
@ -551,8 +567,15 @@ char *strcasestr(const char *haystack, const char *needle)
*/
static std::string_view SkipGarbage(std::string_view str)
{
	/* Decode one UTF-8 sequence at a time, so that multi-byte characters are classified
	 * as a whole by IsGarbageCharacter instead of being inspected byte by byte.
	 * NOTE(review): Utf8Decode only receives a pointer, so it presumably relies on the
	 * data behind the view being terminated -- confirm against its implementation. */
	while (!str.empty()) {
		char32_t c;
		size_t len = Utf8Decode(&c, str.data());

		/* Stop at the first character that is worth sorting on. */
		if (!IsGarbageCharacter(c)) break;

		/* Never advance beyond the end of the view, even if the decoder reports a longer sequence. */
		str.remove_prefix(len < str.size() ? len : str.size());
	}
	return str;
}
/**