2
0
mirror of https://github.com/godotengine/godot.git synced 2025-04-25 01:48:08 +08:00

Automate generation of the char_range.inc file

Co-authored-by: Danil Alexeev <dalexeev12@yandex.ru>
This commit is contained in:
Jakub Marcowski 2025-01-21 16:10:17 +01:00
parent 1b7b009674
commit 10485764a7
No known key found for this signature in database
GPG Key ID: 10D9E07CFFBC0E6F
2 changed files with 141 additions and 1 deletions
core/string
misc/scripts

@ -28,6 +28,8 @@
/* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */
/**************************************************************************/
// This file was generated using the `misc/scripts/char_range_fetch.py` script.
#ifndef CHAR_RANGE_INC
#define CHAR_RANGE_INC
@ -43,7 +45,7 @@ struct CharRange {
constexpr inline CharRange xid_start[] = {
{ 0x41, 0x5a },
{ 0x5f, 0x5f }, // Underscore technically isn't in XID_Start, but for our purposes it's included.
{ 0x5f, 0x5f },
{ 0x61, 0x7a },
{ 0xaa, 0xaa },
{ 0xb5, 0xb5 },

138
misc/scripts/char_range_fetch.py Executable file

@ -0,0 +1,138 @@
#!/usr/bin/env python3
# Script used to dump char ranges for specific properties from
# the Unicode Character Database to the `char_range.inc` file.
# NOTE: This script is deliberately not integrated into the build system;
# you should run it manually whenever you want to update the data.
import os
import sys
from typing import Final, List, Tuple
from urllib.request import urlopen
if __name__ == "__main__":
    # When executed directly, make the directory two levels above this script
    # importable so the `methods` import that follows can be resolved
    # (presumably the repository root, where `methods.py` lives - confirm).
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), "../../"))
from methods import generate_copyright_header
# Unicode Character Database file that is downloaded and parsed; the Unicode
# version (16.0.0) is pinned in the path.
URL: Final[str] = "https://www.unicode.org/Public/16.0.0/ucd/DerivedCoreProperties.txt"

# Inclusive (start, end) code point ranges, one list per emitted array.
# They are filled in by `parse_unicode_data()`.
xid_start: List[Tuple[int, int]] = []  # `XID_Start` property, plus the underscore.
xid_continue: List[Tuple[int, int]] = []  # `XID_Continue` property.
uppercase_letter: List[Tuple[int, int]] = []  # `Uppercase` property.
lowercase_letter: List[Tuple[int, int]] = []  # `Lowercase` property.
unicode_letter: List[Tuple[int, int]] = []  # `Alphabetic` property.
def merge_ranges(ranges: List[Tuple[int, int]]) -> None:
    """Coalesce `ranges` in place into sorted, disjoint, non-adjacent ranges.

    Each item is an inclusive `(start, end)` pair. Ranges that touch
    (`end + 1 == next start`), overlap, or are contained in one another are
    folded into a single range. The input does not need to be sorted.

    Args:
        ranges: List of inclusive ranges; it is modified in place.

    Returns:
        None. The result is written back into `ranges`.
    """
    if len(ranges) < 2:
        return

    # Sorting first guarantees that any ranges that can be merged are neighbours.
    ordered: List[Tuple[int, int]] = sorted(ranges)
    ranges.clear()

    last_start, last_end = ordered[0]
    for curr_start, curr_end in ordered[1:]:
        if curr_start <= last_end + 1:
            # Touching or overlapping: extend the pending range. `max()` keeps
            # the end correct when the current range is fully contained.
            last_end = max(last_end, curr_end)
        else:
            ranges.append((last_start, last_end))
            last_start, last_end = curr_start, curr_end

    # Flush the range that was still being accumulated.
    ranges.append((last_start, last_end))
def parse_unicode_data() -> None:
    """Download `URL` and fill the module-level range lists from it.

    Every data line has the form `<code point or range> ; <property> # comment`.
    Ranges of the `XID_Start`, `XID_Continue`, `Uppercase`, `Lowercase` and
    `Alphabetic` properties are collected; every other property is ignored.
    The underscore is additionally added to `xid_start`, and all lists are
    passed through `merge_ranges()` at the end.

    The lists are emptied first, so calling this function more than once does
    not accumulate duplicate entries.

    Raises:
        ValueError: If a data line has no `;`-separated property field.
    """
    # Maps each property name of interest to the list collecting its ranges.
    targets = {
        "XID_Start": xid_start,
        "XID_Continue": xid_continue,
        "Uppercase": uppercase_letter,
        "Lowercase": lowercase_letter,
        "Alphabetic": unicode_letter,
    }
    for collected in targets.values():
        collected.clear()

    # Use the response as a context manager so the connection is always closed.
    with urlopen(URL) as response:
        lines: List[str] = [line.decode("utf-8") for line in response]

    for line in lines:
        # Everything after `#` is a comment; this also handles comment-only lines.
        data: str = line.split("#", 1)[0].strip()
        if not data:
            continue

        fields: List[str] = data.split(";")
        if len(fields) < 2:
            raise ValueError(f"Unexpected line in {URL}: {line!r}")

        target = targets.get(fields[1].strip())
        if target is None:
            continue

        # A field is either a single code point ("00AA") or a range ("0041..005A").
        range_start, _, range_end = fields[0].strip().partition("..")
        first: int = int(range_start, 16)
        last: int = int(range_end, 16) if range_end else first
        target.append((first, last))

    # Underscore technically isn't in XID_Start, but for our purposes it's included.
    xid_start.append((0x005F, 0x005F))
    xid_start.sort(key=lambda x: x[0])

    for collected in targets.values():
        merge_ranges(collected)
def make_array(array_name: str, range_list: List[Tuple[int, int]]) -> str:
    """Render `range_list` as a C++ `constexpr inline CharRange` array definition.

    Each `(start, end)` pair becomes one tab-indented `{ 0x.., 0x.. },` row
    using lowercase hexadecimal. The returned text ends with a blank line so
    that consecutive arrays can simply be concatenated.
    """
    opening = f"constexpr inline CharRange {array_name}[] = {{\n"
    rows = [f"\t{{ 0x{start:x}, 0x{end:x} }},\n" for start, end in range_list]
    closing = "};\n\n"
    return opening + "".join(rows) + closing
def generate_char_range_inc() -> None:
    """Fetch the Unicode data and write the `char_range.inc` file.

    The output consists of the copyright header, an include guard with the
    `CharRange` struct definition, one array per collected property, and the
    closing `#endif`. It is written with `\\n` line endings to
    `../../core/string/char_range.inc`, relative to this script's directory.
    """
    parse_unicode_data()

    # (C++ array name, collected ranges), in the order the arrays are emitted.
    tables = (
        ("xid_start", xid_start),
        ("xid_continue", xid_continue),
        ("uppercase_letter", uppercase_letter),
        ("lowercase_letter", lowercase_letter),
        ("unicode_letter", unicode_letter),
    )

    preamble: str = f"""
// This file was generated using the `misc/scripts/char_range_fetch.py` script.
#ifndef CHAR_RANGE_INC
#define CHAR_RANGE_INC
#include "core/typedefs.h"
// Unicode Derived Core Properties
// Source: {URL}
struct CharRange {{
\tchar32_t start;
\tchar32_t end;
}};\n\n"""

    chunks: List[str] = [generate_copyright_header("char_range.inc"), preamble]
    for array_name, collected in tables:
        chunks.append(make_array(array_name, collected))
    chunks.append("#endif // CHAR_RANGE_INC\n")

    script_dir: str = os.path.dirname(__file__)
    char_range_path: str = os.path.join(script_dir, "../../core/string/char_range.inc")
    with open(char_range_path, "w", newline="\n") as f:
        f.write("".join(chunks))

    print("`char_range.inc` generated successfully.")
# Regenerate `char_range.inc` only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    generate_char_range_inc()