mirror of https://github.com/OpenTTD/OpenTTD
Codechange: Use StringConsumer and Builder in StrMakeValid and StrValid.
parent
dc21fae18e
commit
b81a35ea89
137
src/string.cpp
137
src/string.cpp
|
@ -14,6 +14,7 @@
|
||||||
#include "string_func.h"
|
#include "string_func.h"
|
||||||
#include "string_base.h"
|
#include "string_base.h"
|
||||||
#include "core/utf8.hpp"
|
#include "core/utf8.hpp"
|
||||||
|
#include "core/string_inplace.hpp"
|
||||||
|
|
||||||
#include "table/control_codes.h"
|
#include "table/control_codes.h"
|
||||||
|
|
||||||
|
@ -108,76 +109,40 @@ static bool IsSccEncodedCode(char32_t c)
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Copies the valid (UTF-8) characters from \c str up to \c last to the \c dst.
|
* Copies the valid (UTF-8) characters from \c consumer to the \c builder.
|
||||||
* Depending on the \c settings invalid characters can be replaced with a
|
* Depending on the \c settings invalid characters can be replaced with a
|
||||||
* question mark, as well as determining what characters are deemed invalid.
|
* question mark, as well as determining what characters are deemed invalid.
|
||||||
*
|
*
|
||||||
* It is allowed for \c dst to be the same as \c src, in which case the string
|
* @param builder The destination to write to.
|
||||||
* is made valid in place.
|
* @param consumer The string to validate.
|
||||||
* @param dst The destination to write to.
|
|
||||||
* @param str The string to validate.
|
|
||||||
* @param last The last valid character of str.
|
|
||||||
* @param settings The settings for the string validation.
|
* @param settings The settings for the string validation.
|
||||||
*/
|
*/
|
||||||
template <class T>
|
template<class Builder>
|
||||||
static void StrMakeValid(T &dst, const char *str, const char *last, StringValidationSettings settings)
|
static void StrMakeValid(Builder &builder, StringConsumer &consumer, StringValidationSettings settings)
|
||||||
{
|
{
|
||||||
/* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
|
/* Assume the ABSOLUTE WORST to be in the consumed input as it comes from the outside. */
|
||||||
|
while (consumer.AnyBytesLeft()) {
|
||||||
while (str <= last && *str != '\0') {
|
auto c = consumer.TryReadUtf8();
|
||||||
size_t len = Utf8EncodedCharLen(*str);
|
if (!c.has_value()) {
|
||||||
char32_t c;
|
|
||||||
/* If the first byte does not look like the first byte of an encoded
|
|
||||||
* character, i.e. encoded length is 0, then this byte is definitely bad
|
|
||||||
* and it should be skipped.
|
|
||||||
* When the first byte looks like the first byte of an encoded character,
|
|
||||||
* then the remaining bytes in the string are checked whether the whole
|
|
||||||
* encoded character can be there. If that is not the case, this byte is
|
|
||||||
* skipped.
|
|
||||||
* Finally we attempt to decode the encoded character, which does certain
|
|
||||||
* extra validations to see whether the correct number of bytes were used
|
|
||||||
* to encode the character. If that is not the case, the byte is probably
|
|
||||||
* invalid and it is skipped. We could emit a question mark, but then the
|
|
||||||
* logic below cannot just copy bytes, it would need to re-encode the
|
|
||||||
* decoded characters as the length in bytes may have changed.
|
|
||||||
*
|
|
||||||
* The goal here is to get as many valid Utf8 encoded characters from the
|
|
||||||
* source string to the destination string.
|
|
||||||
*
|
|
||||||
* Note: a multi-byte encoded termination ('\0') will trigger the encoded
|
|
||||||
* char length and the decoded length to differ, so it will be ignored as
|
|
||||||
* invalid character data. If it were to reach the termination, then we
|
|
||||||
* would also reach the "last" byte of the string and a normal '\0'
|
|
||||||
* termination will be placed after it.
|
|
||||||
*/
|
|
||||||
if (len == 0 || str + len > last + 1 || len != Utf8Decode(&c, str)) {
|
|
||||||
/* Maybe the next byte is still a valid character? */
|
/* Maybe the next byte is still a valid character? */
|
||||||
str++;
|
consumer.Skip(1);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
if (*c == 0) break;
|
||||||
|
|
||||||
if ((IsPrintable(c) && (c < SCC_SPRITE_START || c > SCC_SPRITE_END)) || (settings.Test(StringValidationSetting::AllowControlCode) && IsSccEncodedCode(c))) {
|
if ((IsPrintable(*c) && (*c < SCC_SPRITE_START || *c > SCC_SPRITE_END)) ||
|
||||||
/* Copy the character back. Even if dst is currently the same as str
|
(settings.Test(StringValidationSetting::AllowControlCode) && IsSccEncodedCode(*c)) ||
|
||||||
* (i.e. no characters have been changed) this is quicker than
|
(settings.Test(StringValidationSetting::AllowNewline) && *c == '\n')) {
|
||||||
* moving the pointers ahead by len */
|
builder.PutUtf8(*c);
|
||||||
do {
|
} else if (settings.Test(StringValidationSetting::AllowNewline) && *c == '\r' && consumer.PeekCharIf('\n')) {
|
||||||
*dst++ = *str++;
|
/* Skip \r, if followed by \n */
|
||||||
} while (--len != 0);
|
/* continue */
|
||||||
} else if (settings.Test(StringValidationSetting::AllowNewline) && c == '\n') {
|
} else if (settings.Test(StringValidationSetting::ReplaceTabCrNlWithSpace) && (*c == '\r' || *c == '\n' || *c == '\t')) {
|
||||||
*dst++ = *str++;
|
/* Replace the tab, carriage return or newline with a space. */
|
||||||
} else {
|
builder.PutChar(' ');
|
||||||
if (settings.Test(StringValidationSetting::AllowNewline) && c == '\r' && str[1] == '\n') {
|
} else if (settings.Test(StringValidationSetting::ReplaceWithQuestionMark)) {
|
||||||
str += len;
|
/* Replace the undesirable character with a question mark */
|
||||||
continue;
|
builder.PutChar('?');
|
||||||
}
|
|
||||||
str += len;
|
|
||||||
if (settings.Test(StringValidationSetting::ReplaceTabCrNlWithSpace) && (c == '\r' || c == '\n' || c == '\t')) {
|
|
||||||
/* Replace the tab, carriage return or newline with a space. */
|
|
||||||
*dst++ = ' ';
|
|
||||||
} else if (settings.Test(StringValidationSetting::ReplaceWithQuestionMark)) {
|
|
||||||
/* Replace the undesirable character with a question mark */
|
|
||||||
*dst++ = '?';
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -193,9 +158,10 @@ static void StrMakeValid(T &dst, const char *str, const char *last, StringValida
|
||||||
*/
|
*/
|
||||||
void StrMakeValidInPlace(char *str, StringValidationSettings settings)
|
void StrMakeValidInPlace(char *str, StringValidationSettings settings)
|
||||||
{
|
{
|
||||||
char *dst = str;
|
InPlaceReplacement inplace(std::span(str, strlen(str)));
|
||||||
StrMakeValid(dst, str, str + strlen(str), settings);
|
StrMakeValid(inplace.builder, inplace.consumer, settings);
|
||||||
*dst = '\0';
|
/* Add NUL terminator, if we ended up with fewer bytes than before */
|
||||||
|
if (inplace.builder.AnyBytesUnused()) inplace.builder.PutChar('\0');
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -209,11 +175,9 @@ void StrMakeValidInPlace(std::string &str, StringValidationSettings settings)
|
||||||
{
|
{
|
||||||
if (str.empty()) return;
|
if (str.empty()) return;
|
||||||
|
|
||||||
char *buf = str.data();
|
InPlaceReplacement inplace(std::span(str.data(), str.size()));
|
||||||
char *last = buf + str.size() - 1;
|
StrMakeValid(inplace.builder, inplace.consumer, settings);
|
||||||
char *dst = buf;
|
str.erase(inplace.builder.GetBytesWritten(), std::string::npos);
|
||||||
StrMakeValid(dst, buf, last, settings);
|
|
||||||
str.erase(dst - buf, std::string::npos);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -225,16 +189,11 @@ void StrMakeValidInPlace(std::string &str, StringValidationSettings settings)
|
||||||
*/
|
*/
|
||||||
std::string StrMakeValid(std::string_view str, StringValidationSettings settings)
|
std::string StrMakeValid(std::string_view str, StringValidationSettings settings)
|
||||||
{
|
{
|
||||||
if (str.empty()) return {};
|
std::string result;
|
||||||
|
StringBuilder builder(result);
|
||||||
auto buf = str.data();
|
StringConsumer consumer(str);
|
||||||
auto last = buf + str.size() - 1;
|
StrMakeValid(builder, consumer, settings);
|
||||||
|
return result;
|
||||||
std::ostringstream dst;
|
|
||||||
std::ostreambuf_iterator<char> dst_iter(dst);
|
|
||||||
StrMakeValid(dst_iter, buf, last, settings);
|
|
||||||
|
|
||||||
return dst.str();
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -248,27 +207,17 @@ std::string StrMakeValid(std::string_view str, StringValidationSettings settings
|
||||||
bool StrValid(std::span<const char> str)
|
bool StrValid(std::span<const char> str)
|
||||||
{
|
{
|
||||||
/* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
|
/* Assume the ABSOLUTE WORST to be in str as it comes from the outside. */
|
||||||
auto it = std::begin(str);
|
StringConsumer consumer(str);
|
||||||
auto last = std::prev(std::end(str));
|
while (consumer.AnyBytesLeft()) {
|
||||||
|
auto c = consumer.TryReadUtf8();
|
||||||
while (it <= last && *it != '\0') {
|
if (!c.has_value()) return false; // invalid codepoint
|
||||||
size_t len = Utf8EncodedCharLen(*it);
|
if (*c == 0) return true; // NUL termination
|
||||||
/* Encoded length is 0 if the character isn't known.
|
if (!IsPrintable(*c) || (*c >= SCC_SPRITE_START && *c <= SCC_SPRITE_END)) {
|
||||||
* The length check is needed to prevent Utf8Decode from reading
|
|
||||||
* over the terminating '\0' if that happens to be placed
|
|
||||||
* within the encoding of a UTF8 character. */
|
|
||||||
if (len == 0 || it + len > last) return false;
|
|
||||||
|
|
||||||
char32_t c;
|
|
||||||
len = Utf8Decode(&c, &*it);
|
|
||||||
if (!IsPrintable(c) || (c >= SCC_SPRITE_START && c <= SCC_SPRITE_END)) {
|
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
it += len;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return *it == '\0';
|
return false; // missing NUL termination
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|
Loading…
Reference in New Issue