From 9cf47e69d6c377bdef61ccc8bc76d9e4ebd50f36 Mon Sep 17 00:00:00 2001 From: Peter Nelson Date: Sun, 27 Oct 2024 15:49:09 +0000 Subject: [PATCH] Fix bb8a0c7641: Skip control codes when sorting strings. (#13035) Now that SkipGarbage doesn't skip all multi-byte utf-8 characters, string control codes are not skipped either. This gave unintended sorting when NewGRF names start with colour codes. Make SkipGarbage UTF-8 aware so that it is able to skip some unicode ranges as well. --- src/string.cpp | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/src/string.cpp b/src/string.cpp index 6d386c04a9..eb8bb0632b 100644 --- a/src/string.cpp +++ b/src/string.cpp @@ -541,6 +541,22 @@ char *strcasestr(const char *haystack, const char *needle) } #endif /* DEFINE_STRCASESTR */ +/** + * Test if a unicode character is considered garbage to be skipped. + * @param c Character to test. + * @returns true iff the character should be skipped. + */ +static bool IsGarbageCharacter(char32_t c) +{ + if (c >= '0' && c <= '9') return false; + if (c >= 'A' && c <= 'Z') return false; + if (c >= 'a' && c <= 'z') return false; + if (c >= SCC_CONTROL_START && c <= SCC_CONTROL_END) return true; + if (c >= 0xC0 && c <= 0x10FFFF) return false; + + return true; +} + /** * Skip some of the 'garbage' in the string that we don't want to use * to sort on. 
This way the alphabetical sorting will work better as @@ -551,8 +567,15 @@ char *strcasestr(const char *haystack, const char *needle) */ static std::string_view SkipGarbage(std::string_view str) { - while (!str.empty() && (static_cast<uint8_t>(str[0]) < '0' || IsInsideMM(str[0], ';', '@' + 1) || IsInsideMM(str[0], '[', '`' + 1) || IsInsideMM(str[0], '{', '~' + 1))) str.remove_prefix(1); - return str; + auto first = std::begin(str); + auto last = std::end(str); + while (first < last) { + char32_t c; + size_t len = Utf8Decode(&c, &*first); + if (!IsGarbageCharacter(c)) break; + first += len; + } + return {first, last}; } /**