mirror of
git://sourceware.org/git/glibc.git
synced 2025-01-18 12:16:13 +08:00
Improve utf8_gen.py to set the width for characters with Prepended_Concatenation_Mark property to 1
[BZ #22070]
* localedata/unicode-gen/utf8_gen.py: Set the width for characters with Prepended_Concatenation_Mark property to 1.
* localedata/charmaps/UTF-8: Updated using the improved script.
This commit is contained in:
parent
af83ed5c46
commit
2ae5be041d
@ -1,3 +1,10 @@
|
||||
2017-09-06 Mike FABIAN <mfabian@redhat.com>
|
||||
|
||||
[BZ #22070]
|
||||
* localedata/unicode-gen/utf8_gen.py: Set the width for
|
||||
characters with Prepended_Concatenation_Mark property to 1
|
||||
* localedata/charmaps/UTF-8: Updated using the improved script.
|
||||
|
||||
2017-09-06 Mike FABIAN <mfabian@redhat.com>
|
||||
|
||||
[BZ #21750]
|
||||
|
@ -46395,7 +46395,7 @@ CHARMAP
|
||||
<U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
|
||||
END CHARMAP
|
||||
|
||||
% Character width according to Unicode 7.0.0.
|
||||
% Character width according to Unicode 10.0.0.
|
||||
% - Default width is 1.
|
||||
% - Double-width characters have width 2; generated from
|
||||
% "grep '^[^;]*;[WF]' EastAsianWidth.txt"
|
||||
@ -46411,16 +46411,14 @@ WIDTH
|
||||
<U05C1>...<U05C2> 0
|
||||
<U05C4>...<U05C5> 0
|
||||
<U05C7> 0
|
||||
<U0600>...<U0605> 0
|
||||
<U0610>...<U061A> 0
|
||||
<U061C> 0
|
||||
<U064B>...<U065F> 0
|
||||
<U0670> 0
|
||||
<U06D6>...<U06DD> 0
|
||||
<U06D6>...<U06DC> 0
|
||||
<U06DF>...<U06E4> 0
|
||||
<U06E7>...<U06E8> 0
|
||||
<U06EA>...<U06ED> 0
|
||||
<U070F> 0
|
||||
<U0711> 0
|
||||
<U0730>...<U074A> 0
|
||||
<U07A6>...<U07B0> 0
|
||||
@ -46430,7 +46428,8 @@ WIDTH
|
||||
<U0825>...<U0827> 0
|
||||
<U0829>...<U082D> 0
|
||||
<U0859>...<U085B> 0
|
||||
<U08D4>...<U0902> 0
|
||||
<U08D4>...<U08E1> 0
|
||||
<U08E3>...<U0902> 0
|
||||
<U093A> 0
|
||||
<U093C> 0
|
||||
<U0941>...<U0948> 0
|
||||
@ -46692,7 +46691,6 @@ WIDTH
|
||||
<U0001107F>...<U00011081> 0
|
||||
<U000110B3>...<U000110B6> 0
|
||||
<U000110B9>...<U000110BA> 0
|
||||
<U000110BD> 0
|
||||
<U00011100>...<U00011102> 0
|
||||
<U00011127>...<U0001112B> 0
|
||||
<U0001112D>...<U00011134> 0
|
||||
|
@ -40,7 +40,7 @@ UNICODE_VERSION = 10.0.0
|
||||
PYTHON3 = python3
|
||||
WGET = wget
|
||||
|
||||
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt
|
||||
DOWNLOADS = UnicodeData.txt DerivedCoreProperties.txt EastAsianWidth.txt PropList.txt
|
||||
GENERATED = i18n tr_TR UTF-8 translit_combining translit_compat translit_circle translit_cjk_compat translit_font translit_fraction
|
||||
REPORTS = i18n-report UTF-8-report
|
||||
|
||||
@ -92,7 +92,7 @@ tr_TR: gen_unicode_ctype.py
|
||||
|
||||
UTF-8: UnicodeData.txt EastAsianWidth.txt
|
||||
UTF-8: utf8_gen.py
|
||||
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt
|
||||
$(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt
|
||||
|
||||
UTF-8-report: UTF-8 ../charmaps/UTF-8
|
||||
UTF-8-report: utf8_compatibility.py
|
||||
|
1618
localedata/unicode-gen/PropList.txt
Normal file
1618
localedata/unicode-gen/PropList.txt
Normal file
File diff suppressed because it is too large
Load Diff
@ -215,9 +215,11 @@ def write_header_width(outfile):
|
||||
# outfile.write("% \"grep '^[^;]*;ZERO WIDTH ' UnicodeData.txt\"\n")
|
||||
outfile.write("WIDTH\n")
|
||||
|
||||
def process_width(outfile, ulines, elines):
|
||||
def process_width(outfile, ulines, elines, plines):
|
||||
'''ulines are lines from UnicodeData.txt, elines are lines from
|
||||
EastAsianWidth.txt
|
||||
EastAsianWidth.txt containing characters with width “W” or “F”,
|
||||
plines are lines from PropList.txt which contain characters
|
||||
with the property “Prepended_Concatenation_Mark”.
|
||||
|
||||
'''
|
||||
width_dict = {}
|
||||
@ -230,16 +232,29 @@ def process_width(outfile, ulines, elines):
|
||||
for key in range(int(code_points[0], 16),
|
||||
int(code_points[1], 16)+1):
|
||||
width_dict[key] = 2
|
||||
|
||||
for line in ulines:
|
||||
fields = line.split(";")
|
||||
if fields[4] == "NSM" or fields[2] in ("Cf", "Me", "Mn"):
|
||||
width_dict[int(fields[0], 16)] = 0
|
||||
|
||||
for line in plines:
|
||||
# Characters with the property “Prepended_Concatenation_Mark”
|
||||
# should have the width 1:
|
||||
fields = line.split(";")
|
||||
if not '..' in fields[0]:
|
||||
code_points = (fields[0], fields[0])
|
||||
else:
|
||||
code_points = fields[0].split("..")
|
||||
for key in range(int(code_points[0], 16),
|
||||
int(code_points[1], 16)+1):
|
||||
del width_dict[key] # default width is 1
|
||||
|
||||
# handle special cases for compatibility
|
||||
for key in list((0x00AD,)):
|
||||
# https://www.cs.tut.fi/~jkorpela/shy.html
|
||||
if key in width_dict:
|
||||
del width_dict[key]
|
||||
del width_dict[key] # default width is 1
|
||||
for key in list(range(0x1160, 0x1200)):
|
||||
width_dict[key] = 0
|
||||
for key in list(range(0x3248, 0x3250)):
|
||||
@ -278,7 +293,7 @@ def process_width(outfile, ulines, elines):
|
||||
|
||||
if __name__ == "__main__":
|
||||
if len(sys.argv) < 3:
|
||||
print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt")
|
||||
print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
|
||||
else:
|
||||
with open(sys.argv[1], mode='r') as UNIDATA_FILE:
|
||||
UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
|
||||
@ -298,6 +313,11 @@ if __name__ == "__main__":
|
||||
continue
|
||||
if re.match(r'^[^;]*;[WF]', LINE):
|
||||
EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
|
||||
with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
|
||||
PROP_LIST_LINES = []
|
||||
for LINE in PROP_LIST_FILE:
|
||||
if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
|
||||
PROP_LIST_LINES.append(LINE.strip())
|
||||
with open('UTF-8', mode='w') as OUTFILE:
|
||||
# Processing UnicodeData.txt and write CHARMAP to UTF-8 file
|
||||
write_header_charmap(OUTFILE)
|
||||
@ -305,5 +325,8 @@ if __name__ == "__main__":
|
||||
OUTFILE.write("END CHARMAP\n\n")
|
||||
# Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
|
||||
write_header_width(OUTFILE)
|
||||
process_width(OUTFILE, UNICODE_DATA_LINES, EAST_ASIAN_WIDTH_LINES)
|
||||
process_width(OUTFILE,
|
||||
UNICODE_DATA_LINES,
|
||||
EAST_ASIAN_WIDTH_LINES,
|
||||
PROP_LIST_LINES)
|
||||
OUTFILE.write("END WIDTH\n")
|
||||
|
Loading…
Reference in New Issue
Block a user