Merge pull request #73973 from dalexeev/fix-regex-sub

RegEx: Fix handling of unset/unknown capture groups
This commit is contained in:
Rémi Verschelde 2024-12-02 15:49:45 +01:00
commit 867806954f
No known key found for this signature in database
GPG Key ID: C3336907360768E1
3 changed files with 49 additions and 19 deletions

View File

@ -289,25 +289,17 @@ TypedArray<RegExMatch> RegEx::search_all(const String &p_subject, int p_offset,
return result;
}
String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const {
ERR_FAIL_COND_V(!is_valid(), String());
ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0");
// safety_zone is the number of chars we allocate in addition to the number of chars expected in order to
// guard against the PCRE API writing one additional \0 at the end. PCRE's API docs are unclear on whether
// PCRE understands outlength in pcre2_substitute() as counting an implicit additional terminating char or
// not. always allocating one char more than telling PCRE has us on the safe side.
int RegEx::_sub(const String &p_subject, const String &p_replacement, int p_offset, int p_end, uint32_t p_flags, String &r_output) const {
// `safety_zone` is the number of chars we allocate in addition to the number of chars expected in order to
// guard against the PCRE API writing one additional `\0` at the end. PCRE's API docs are unclear on whether
// PCRE understands outlength in `pcre2_substitute(`) as counting an implicit additional terminating char or
// not. Always allocating one char more than telling PCRE has us on the safe side.
const int safety_zone = 1;
PCRE2_SIZE olength = p_subject.length() + 1; // space for output string and one terminating \0 character
PCRE2_SIZE olength = p_subject.length() + 1; // Space for output string and one terminating `\0` character.
Vector<char32_t> output;
output.resize(olength + safety_zone);
uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH;
if (p_all) {
flags |= PCRE2_SUBSTITUTE_GLOBAL;
}
PCRE2_SIZE length = p_subject.length();
if (p_end >= 0 && (uint32_t)p_end < length) {
length = p_end;
@ -322,22 +314,49 @@ String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_a
pcre2_match_data_32 *match = pcre2_match_data_create_from_pattern_32(c, gctx);
int res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength);
int res = pcre2_substitute_32(c, s, length, p_offset, p_flags, match, mctx, r, p_replacement.length(), o, &olength);
if (res == PCRE2_ERROR_NOMEMORY) {
output.resize(olength + safety_zone);
o = (PCRE2_UCHAR32 *)output.ptrw();
res = pcre2_substitute_32(c, s, length, p_offset, flags, match, mctx, r, p_replacement.length(), o, &olength);
res = pcre2_substitute_32(c, s, length, p_offset, p_flags, match, mctx, r, p_replacement.length(), o, &olength);
}
pcre2_match_data_free_32(match);
pcre2_match_context_free_32(mctx);
if (res < 0) {
return String();
if (res >= 0) {
r_output = String(output.ptr(), olength) + p_subject.substr(length);
}
return String(output.ptr(), olength) + p_subject.substr(length);
return res;
}
String RegEx::sub(const String &p_subject, const String &p_replacement, bool p_all, int p_offset, int p_end) const {
ERR_FAIL_COND_V(!is_valid(), String());
ERR_FAIL_COND_V_MSG(p_offset < 0, String(), "RegEx sub offset must be >= 0");
uint32_t flags = PCRE2_SUBSTITUTE_OVERFLOW_LENGTH | PCRE2_SUBSTITUTE_UNSET_EMPTY;
if (p_all) {
flags |= PCRE2_SUBSTITUTE_GLOBAL;
}
String output;
const int res = _sub(p_subject, p_replacement, p_offset, p_end, flags, output);
if (res < 0) {
PCRE2_UCHAR32 buf[256];
pcre2_get_error_message_32(res, buf, 256);
String message = "PCRE2 Error: " + String((const char32_t *)buf);
ERR_PRINT(message.utf8());
if (res == PCRE2_ERROR_NOSUBSTRING) {
flags |= PCRE2_SUBSTITUTE_UNKNOWN_UNSET;
_sub(p_subject, p_replacement, p_offset, p_end, flags, output);
}
}
return output;
}
bool RegEx::is_valid() const {

View File

@ -78,6 +78,8 @@ class RegEx : public RefCounted {
void _pattern_info(uint32_t what, void *where) const;
int _sub(const String &p_subject, const String &p_replacement, int p_offset, int p_end, uint32_t p_flags, String &r_output) const;
protected:
static void _bind_methods();

View File

@ -145,6 +145,15 @@ TEST_CASE("[RegEx] Substitution") {
CHECK(re5.sub(s5, "cc", true, 0, 2) == "ccccaa");
CHECK(re5.sub(s5, "cc", true, 1, 3) == "acccca");
CHECK(re5.sub(s5, "", true, 0, 2) == "aa");
const String s6 = "property get_property set_property";
RegEx re6("(get_|set_)?property");
REQUIRE(re6.is_valid());
CHECK(re6.sub(s6, "$1new_property", true) == "new_property get_new_property set_new_property");
ERR_PRINT_OFF;
CHECK(re6.sub(s6, "$5new_property", true) == "new_property new_property new_property");
ERR_PRINT_ON;
}
TEST_CASE("[RegEx] Substitution with empty input and/or replacement") {