Merge pull request #75846 from dalexeev/string-cases-unicode

Add Unicode support to `String.to_*_case()` methods
This commit is contained in:
Rémi Verschelde 2024-02-22 14:38:52 +01:00
commit 49d7ad9f5d
No known key found for this signature in database
GPG Key ID: C3336907360768E1
4 changed files with 1387 additions and 45 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -35,24 +35,43 @@
#include "char_range.inc"
// Expands to the complete body of a `bool` function that has a code point `c`
// in scope: a binary search of `m_array` for an entry whose [start, end] bounds
// contain `c`. Yields `true` on a hit and `false` once the window is empty.
// The element count comes from sizeof, so `m_array` must be a real array rather
// than a pointer; the ranges are assumed (not checked) to be in ascending order.
// The final `return false` has no semicolon on purpose: the call site adds it.
#define BSEARCH_CHAR_RANGE(m_array) \
	int lo = 0; \
	int hi = sizeof(m_array) / sizeof(m_array[0]) - 1; \
	\
	while (lo <= hi) { \
		const int mid = (lo + hi) / 2; \
		\
		if (c < m_array[mid].start) { \
			hi = mid - 1; \
			continue; \
		} \
		if (c > m_array[mid].end) { \
			lo = mid + 1; \
			continue; \
		} \
		return true; \
	} \
	\
	return false
// Reports whether `c` falls inside one of the [start, end] ranges of the
// `xid_start` table.
// NOTE(review): this is a diff view without +/- markers, so the body below
// appears to contain both the old and the new implementation merged together.
static _FORCE_INLINE_ bool is_unicode_identifier_start(char32_t c) {
// NOTE(review): presumably the removed code — a linear scan of xid_start that
// stops at the first entry whose `start` is 0.
for (int i = 0; xid_start[i].start != 0; i++) {
if (c >= xid_start[i].start && c <= xid_start[i].end) {
return true;
}
}
return false;
// NOTE(review): presumably the added code — BSEARCH_CHAR_RANGE expands to a
// binary search over xid_start with its own return statements. Read literally,
// this line is unreachable because of the `return false;` just above.
BSEARCH_CHAR_RANGE(xid_start);
}
// Reports whether `c` falls inside one of the [start, end] ranges of the
// `xid_continue` table.
// NOTE(review): this is a diff view without +/- markers, so the body below
// appears to contain both the old and the new implementation merged together.
static _FORCE_INLINE_ bool is_unicode_identifier_continue(char32_t c) {
// NOTE(review): presumably the removed code — a linear scan of xid_continue
// that stops at the first entry whose `start` is 0.
for (int i = 0; xid_continue[i].start != 0; i++) {
if (c >= xid_continue[i].start && c <= xid_continue[i].end) {
return true;
}
}
return false;
// NOTE(review): presumably the added code — BSEARCH_CHAR_RANGE expands to a
// binary search over xid_continue with its own return statements. Read
// literally, this line is unreachable because of the `return false;` above.
BSEARCH_CHAR_RANGE(xid_continue);
}
// Returns true when `c` lies inside one of the ranges of the `uppercase_letter`
// table. The whole body comes from BSEARCH_CHAR_RANGE, which binary-searches
// the table and supplies both the `true` and `false` return paths.
// NOTE(review): the table is presumably defined in char_range.inc — confirm.
static _FORCE_INLINE_ bool is_unicode_upper_case(char32_t c) {
BSEARCH_CHAR_RANGE(uppercase_letter);
}
// Returns true when `c` lies inside one of the ranges of the `lowercase_letter`
// table. The whole body comes from BSEARCH_CHAR_RANGE, which binary-searches
// the table and supplies both the `true` and `false` return paths.
// NOTE(review): the table is presumably defined in char_range.inc — confirm.
static _FORCE_INLINE_ bool is_unicode_lower_case(char32_t c) {
BSEARCH_CHAR_RANGE(lowercase_letter);
}
#undef BSEARCH_CHAR_RANGE
// ASCII-only check: true exactly when `c` is one of the letters 'A' through 'Z'.
static _FORCE_INLINE_ bool is_ascii_upper_case(char32_t c) {
	const bool below_range = c < 'A';
	const bool above_range = c > 'Z';
	return !(below_range || above_range);
}

View File

@@ -1044,17 +1044,17 @@ String String::_camelcase_to_underscore() const {
int start_index = 0;
for (int i = 1; i < size(); i++) {
bool is_prev_upper = is_ascii_upper_case(cstr[i - 1]);
bool is_prev_lower = is_ascii_lower_case(cstr[i - 1]);
bool is_prev_upper = is_unicode_upper_case(cstr[i - 1]);
bool is_prev_lower = is_unicode_lower_case(cstr[i - 1]);
bool is_prev_digit = is_digit(cstr[i - 1]);
bool is_curr_upper = is_ascii_upper_case(cstr[i]);
bool is_curr_lower = is_ascii_lower_case(cstr[i]);
bool is_curr_upper = is_unicode_upper_case(cstr[i]);
bool is_curr_lower = is_unicode_lower_case(cstr[i]);
bool is_curr_digit = is_digit(cstr[i]);
bool is_next_lower = false;
if (i + 1 < size()) {
is_next_lower = is_ascii_lower_case(cstr[i + 1]);
is_next_lower = is_unicode_lower_case(cstr[i + 1]);
}
const bool cond_a = is_prev_lower && is_curr_upper; // aA

View File

@@ -1300,39 +1300,54 @@ TEST_CASE("[String] Capitalize against many strings") {
input = "snake_case_function( snake_case_arg )";
output = "Snake Case Function( Snake Case Arg )";
CHECK(input.capitalize() == output);
input = U"словоСлово_слово слово";
output = U"Слово Слово Слово Слово";
CHECK(input.capitalize() == output);
input = U"λέξηΛέξη_λέξη λέξη";
output = U"Λέξη Λέξη Λέξη Λέξη";
CHECK(input.capitalize() == output);
input = U"բառԲառ_բառ բառ";
output = U"Բառ Բառ Բառ Բառ";
CHECK(input.capitalize() == output);
}
// One row of the case-conversion test table: a source string followed by three
// strings named after its camelCase, PascalCase and snake_case forms.
// NOTE(review): every field is listed twice — presumably the diff's removed
// `const char *` lines followed by the added `const char32_t *` lines (the
// +/- markers are missing). Read literally, the duplicate member names would
// not compile.
struct StringCasesTestCase {
const char *input;
const char *camel_case;
const char *pascal_case;
const char *snake_case;
const char32_t *input;
const char32_t *camel_case;
const char32_t *pascal_case;
const char32_t *snake_case;
};
TEST_CASE("[String] Checking case conversion methods") {
StringCasesTestCase test_cases[] = {
/* clang-format off */
{ "2D", "2d", "2d", "2d" },
{ "2d", "2d", "2d", "2d" },
{ "2db", "2Db", "2Db", "2_db" },
{ "Vector3", "vector3", "Vector3", "vector_3" },
{ "sha256", "sha256", "Sha256", "sha_256" },
{ "Node2D", "node2d", "Node2d", "node_2d" },
{ "RichTextLabel", "richTextLabel", "RichTextLabel", "rich_text_label" },
{ "HTML5", "html5", "Html5", "html_5" },
{ "Node2DPosition", "node2dPosition", "Node2dPosition", "node_2d_position" },
{ "Number2Digits", "number2Digits", "Number2Digits", "number_2_digits" },
{ "get_property_list", "getPropertyList", "GetPropertyList", "get_property_list" },
{ "get_camera_2d", "getCamera2d", "GetCamera2d", "get_camera_2d" },
{ "_physics_process", "physicsProcess", "PhysicsProcess", "_physics_process" },
{ "bytes2var", "bytes2Var", "Bytes2Var", "bytes_2_var" },
{ "linear2db", "linear2Db", "Linear2Db", "linear_2_db" },
{ "sha256sum", "sha256Sum", "Sha256Sum", "sha_256_sum" },
{ "camelCase", "camelCase", "CamelCase", "camel_case" },
{ "PascalCase", "pascalCase", "PascalCase", "pascal_case" },
{ "snake_case", "snakeCase", "SnakeCase", "snake_case" },
{ "Test TEST test", "testTestTest", "TestTestTest", "test_test_test" },
{ nullptr, nullptr, nullptr, nullptr },
{ U"2D", U"2d", U"2d", U"2d" },
{ U"2d", U"2d", U"2d", U"2d" },
{ U"2db", U"2Db", U"2Db", U"2_db" },
{ U"Vector3", U"vector3", U"Vector3", U"vector_3" },
{ U"sha256", U"sha256", U"Sha256", U"sha_256" },
{ U"Node2D", U"node2d", U"Node2d", U"node_2d" },
{ U"RichTextLabel", U"richTextLabel", U"RichTextLabel", U"rich_text_label" },
{ U"HTML5", U"html5", U"Html5", U"html_5" },
{ U"Node2DPosition", U"node2dPosition", U"Node2dPosition", U"node_2d_position" },
{ U"Number2Digits", U"number2Digits", U"Number2Digits", U"number_2_digits" },
{ U"get_property_list", U"getPropertyList", U"GetPropertyList", U"get_property_list" },
{ U"get_camera_2d", U"getCamera2d", U"GetCamera2d", U"get_camera_2d" },
{ U"_physics_process", U"physicsProcess", U"PhysicsProcess", U"_physics_process" },
{ U"bytes2var", U"bytes2Var", U"Bytes2Var", U"bytes_2_var" },
{ U"linear2db", U"linear2Db", U"Linear2Db", U"linear_2_db" },
{ U"sha256sum", U"sha256Sum", U"Sha256Sum", U"sha_256_sum" },
{ U"camelCase", U"camelCase", U"CamelCase", U"camel_case" },
{ U"PascalCase", U"pascalCase", U"PascalCase", U"pascal_case" },
{ U"snake_case", U"snakeCase", U"SnakeCase", U"snake_case" },
{ U"Test TEST test", U"testTestTest", U"TestTestTest", U"test_test_test" },
{ U"словоСлово_слово слово", U"словоСловоСловоСлово", U"СловоСловоСловоСлово", U"слово_слово_слово_слово" },
{ U"λέξηΛέξη_λέξη λέξη", U"λέξηΛέξηΛέξηΛέξη", U"ΛέξηΛέξηΛέξηΛέξη", U"λέξη_λέξη_λέξη_λέξη" },
{ U"բառԲառ_բառ բառ", U"բառԲառԲառԲառ", U"ԲառԲառԲառԲառ", U"բառ_բառ_բառ_բառ" },
{ nullptr, nullptr, nullptr, nullptr },
/* clang-format on */
};