glibc/localedata/unicode-gen/utf8_compatibility.py

#!/usr/bin/python3
# -*- coding: utf-8 -*-
# Copyright (C) 2014-2015 Free Software Foundation, Inc.
# This file is part of the GNU C Library.
#
# The GNU C Library is free software; you can redistribute it and/or
# modify it under the terms of the GNU Lesser General Public
# License as published by the Free Software Foundation; either
# version 2.1 of the License, or (at your option) any later version.
#
# The GNU C Library is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# Lesser General Public License for more details.
#
# You should have received a copy of the GNU Lesser General Public
# License along with the GNU C Library; if not, see
# <http://www.gnu.org/licenses/>.

'''
This script is useful for checking the backward compatibility of a newly
generated UTF-8 file produced by the utf8_gen.py script.

To see how this script is used, call it with the “-h” option:

    $ ./utf8_compatibility.py -h
    … prints usage message …
'''

import sys
import re
import argparse

# Dictionary holding the entire contents of the UnicodeData.txt file
#
# Contents of this dictionary look like this:
#
# {0: {'category': 'Cc',
#      'title': None,
#      'digit': '',
#      'name': '<control>',
#      'bidi': 'BN',
#      'combining': '0',
#      'comment': '',
#      'oldname': 'NULL',
#      'decomposition': '',
#      'upper': None,
#      'mirrored': 'N',
#      'lower': None,
#      'decdigit': '',
#      'numeric': ''},
#      …
# }
UNICODE_ATTRIBUTES = {}

# Dictionary holding the entire contents of the EastAsianWidth.txt file
#
# Contents of this dictionary look like this:
#
# {0: 'N', … , 45430: 'W', …}
EAST_ASIAN_WIDTHS = {}

def fill_attribute(code_point, fields):
    '''Record one UnicodeData.txt entry in UNICODE_ATTRIBUTES.

    code_point is the integer key the entry is stored under.  fields
    is the list of columns of one UnicodeData.txt line; the columns
    with index 1 to 14 are read.  Columns 1 to 11 are stored unchanged,
    the three case mapping columns (12 to 14) are parsed as hexadecimal
    integers, or stored as None when they are empty.

    '''
    def hex_or_none(column):
        # An empty case mapping column means "no mapping".
        return int(column, 16) if column else None

    # Dictionary keys for columns 1..11, in column order.
    text_keys = (
        'name',           # Character name
        'category',       # General category
        'combining',      # Canonical combining classes
        'bidi',           # Bidirectional category
        'decomposition',  # Character decomposition mapping
        'decdigit',       # Decimal digit value
        'digit',          # Digit value
        'numeric',        # Numeric value
        'mirrored',       # mirrored
        'oldname',        # Old Unicode 1.0 name
        'comment',        # comment
    )
    entry = {}
    for index, key in enumerate(text_keys, start=1):
        entry[key] = fields[index]
    entry['upper'] = hex_or_none(fields[12])  # Uppercase mapping
    entry['lower'] = hex_or_none(fields[13])  # Lowercase mapping
    entry['title'] = hex_or_none(fields[14])  # Titlecase mapping
    # Store only once the whole entry has been built successfully.
    UNICODE_ATTRIBUTES[code_point] = entry

def fill_attributes(filename):
    '''Stores the entire contents of the UnicodeData.txt file
    in the UNICODE_ATTRIBUTES dictionary.

    A typical line for a single code point in UnicodeData.txt looks
    like this:

    0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;

    Code point ranges are indicated by pairs of lines like this:

    4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;
    9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;

    Every code point of such a range gets the same attributes, with
    the name reduced to the part before the comma ("CJK Ideograph").
    Lines with general category Cs (surrogates) are skipped.

    The program is terminated with exit status 1 (SystemExit) if a line
    does not have exactly 15 fields or if a ", Last>" line does not
    agree with the preceding ", First>" line.
    '''
    with open(filename, mode='r') as unicode_data_file:
        # Fields of a pending "<…, First>" line, empty when no range
        # is open.
        fields_start = []
        for line in unicode_data_file:
            fields = line.strip().split(';')
            if len(fields) != 15:
                # Fires for any field count other than 15, not only
                # for lines which are too short.
                sys.stderr.write(
                    'short line in file "%(f)s": %(l)s\n' %{
                    'f': filename, 'l': line})
                # sys.exit instead of the site-provided exit(), which
                # is only meant for the interactive interpreter and is
                # missing when the site module is not loaded.
                sys.exit(1)
            if fields[2] == 'Cs':
                # Surrogates are UTF-16 artefacts,
                # not real characters. Ignore them.
                fields_start = []
                continue
            if fields[1].endswith(', First>'):
                fields_start = fields
                # '<CJK Ideograph, First>' -> 'CJK Ideograph'
                fields_start[1] = fields_start[1].split(',')[0][1:]
                continue
            if fields[1].endswith(', Last>'):
                fields[1] = fields[1].split(',')[0][1:]
                # Apart from the code point itself, both lines of a
                # range must be identical.  This also catches a
                # ", Last>" line without a preceding ", First>" line.
                if fields[1:] != fields_start[1:]:
                    sys.stderr.write(
                        'broken code point range in file "%(f)s": %(l)s\n' %{
                            'f': filename, 'l': line})
                    sys.exit(1)
                for code_point in range(
                        int(fields_start[0], 16),
                        int(fields[0], 16)+1):
                    fill_attribute(code_point, fields)
                fields_start = []
                continue
            fill_attribute(int(fields[0], 16), fields)
            fields_start = []

def fill_east_asian_widths(filename):
    '''Read EastAsianWidth.txt into the EAST_ASIAN_WIDTHS dictionary.

    A line of that file describes either a range of code points:

    9FCD..9FFF;W     # Cn    [51] <reserved-9FCD>..<reserved-9FFF>

    or one single code point:

    A015;W           # Lm         YI SYLLABLE WU

    Each code point covered by a line is mapped to the property string
    of that line ('W' in the examples).  Lines which do not have this
    shape (comments, empty lines) are ignored.
    '''
    line_pattern = re.compile(
        r'^(?P<codepoint1>[0-9A-F]{4,6})'
        r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'
        r'\s*;\s*(?P<property>[a-zA-Z]+)')
    with open(filename, mode='r') as east_asian_widths_file:
        for line in east_asian_widths_file:
            found = line_pattern.match(line)
            if found is None:
                continue
            first = int(found.group('codepoint1'), 16)
            last_text = found.group('codepoint2')
            # A line without "..XXXX" covers just one code point.
            last = int(last_text, 16) if last_text else first
            width_property = found.group('property')
            for code_point in range(first, last + 1):
                EAST_ASIAN_WIDTHS[code_point] = width_property

def ucs_symbol(code_point):
    '''Format a code point as a UCS symbol, e.g. <U0041> or <U0001F600>.

    Code points below 0x10000 are written with 4 upper case hexadecimal
    digits, all others with 8.
    '''
    template = '<U{:04X}>' if code_point < 0x10000 else '<U{:08X}>'
    return template.format(code_point)

def create_charmap_dictionary(file_name):
    '''Create a dictionary for all code points found in the CHARMAP
    section of a file

    The returned dictionary maps integer code points to the byte
    sequence string of the line they were found on, e.g.
    {0x41: '/x41', …}.  For a range line like

    <U3400>..<U343F>     /xe3/x90/x80 <CJK Ideograph Extension A>

    every code point of the range is mapped to the byte sequence given
    on that line.  Lines starting with “%” and lines which do not look
    like a charmap entry are skipped.

    The program is terminated with exit status 1 (SystemExit) if the
    file has no “CHARMAP” line or no “END CHARMAP” line after it.
    '''
    # The optional range part is a proper non-capturing group “(?:”
    # (it used to be “(:?”, which accepted a stray colon) and the hex
    # digit class no longer admits “-”.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<hexutf8>(?:/x[0-9a-f]{2}){1,4})')
    with open(file_name, mode='r') as utf8_file:
        charmap_dictionary = {}
        # Skip everything up to and including the “CHARMAP” line.
        for line in utf8_file:
            if line.startswith('CHARMAP'):
                break
        for line in utf8_file:
            if line.startswith('END CHARMAP'):
                return charmap_dictionary
            if line.startswith('%'):
                continue
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                charmap_dictionary[i] = match.group('hexutf8')
        # Reaching the end of the file means a section marker is missing.
        sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'
                         %file_name)
        # sys.exit rather than the interactive-only exit() helper.
        sys.exit(1)

def check_charmap(original_file_name, new_file_name):
    '''Report differences in the CHARMAP section between the old and the
    new file

    Prints to standard output how many code points were removed from,
    changed in, and added to the CHARMAP section of new_file_name
    compared with original_file_name.  The individual code points are
    listed as well when the corresponding ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters option
    is set.
    '''
    def character_name(code_point):
        # Always return a string: formatting None with “{:s}” raises
        # TypeError, which used to abort the report for every code
        # point missing from UNICODE_ATTRIBUTES (for example when no
        # UnicodeData.txt file was given).
        if code_point in UNICODE_ATTRIBUTES:
            return str(UNICODE_ATTRIBUTES[code_point]['name'])
        return 'None'

    separator = '------------------------------------------------------------'
    print('************************************************************')
    print('Report on CHARMAP:')
    ocharmap = create_charmap_dictionary(original_file_name)
    ncharmap = create_charmap_dictionary(new_file_name)
    # Compute each difference once; it is needed for the total and
    # for the detailed listing.
    removed = sorted(set(ocharmap) - set(ncharmap))
    added = sorted(set(ncharmap) - set(ocharmap))
    changed_charmap = {}
    for key in set(ocharmap).intersection(set(ncharmap)):
        if ocharmap[key] != ncharmap[key]:
            changed_charmap[key] = (ocharmap[key], ncharmap[key])
    print(separator)
    print('Total removed characters in newly generated CHARMAP: %d'
          %len(removed))
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s}     {:s} {:s}'.format(
                ucs_symbol(key),
                ocharmap[key],
                character_name(key)))
    print(separator)
    print('Total changed characters in newly generated CHARMAP: %d'
          %len(changed_charmap))
    if ARGS.show_changed_characters:
        for key in sorted(changed_charmap):
            print('changed: {:s}     {:s}->{:s} {:s}'.format(
                ucs_symbol(key),
                changed_charmap[key][0],
                changed_charmap[key][1],
                character_name(key)))
    print(separator)
    print('Total added characters in newly generated CHARMAP: %d'
          %len(added))
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s}     {:s} {:s}'.format(
                ucs_symbol(key),
                ncharmap[key],
                character_name(key)))

def create_width_dictionary(file_name):
    '''Create a dictionary for all code points found in the WIDTH
    section of a file

    The returned dictionary maps integer code points to their integer
    width.  Only entries with width 0 or 2 are recognized, either for
    a single code point (“<U1100> 2”) or for a range written with
    three dots (“<U0300>...<U0302> 0”).  All other lines are skipped.

    The program is terminated with exit status 1 (SystemExit) if the
    file has no “WIDTH” line or no “END WIDTH” line after it.
    '''
    # The optional range part is a proper non-capturing group “(?:”
    # (it used to be “(:?”, which accepted a stray colon) and the hex
    # digit class no longer admits “-”.
    entry_pattern = re.compile(
        r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'
        r'(?:\.\.\.<U(?P<codepoint2>[0-9A-F]{4,8})>)?'
        r'\s+(?P<width>[02])')
    with open(file_name, mode='r') as utf8_file:
        width_dictionary = {}
        # Skip everything up to and including the “WIDTH” line.
        for line in utf8_file:
            if line.startswith('WIDTH'):
                break
        for line in utf8_file:
            if line.startswith('END WIDTH'):
                return width_dictionary
            match = entry_pattern.match(line)
            if not match:
                continue
            codepoint1 = match.group('codepoint1')
            codepoint2 = match.group('codepoint2')
            if not codepoint2:
                codepoint2 = codepoint1
            for i in range(int(codepoint1, 16),
                           int(codepoint2, 16) + 1):
                width_dictionary[i] = int(match.group('width'))
        # Reaching the end of the file means a section marker is
        # missing.  The message used the undefined name “file” here,
        # so this path raised NameError instead of reporting the problem.
        sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n'
                         %file_name)
        # Exit like create_charmap_dictionary() does; falling through
        # would return None, which no caller can use.
        sys.exit(1)

def _width_report_details(code_point):
    '''Return the “eaw=… category=… bidi=… name=…” part of a WIDTH
    report line for code_point.

    Values which are not known, because the code point is missing from
    EAST_ASIAN_WIDTHS or UNICODE_ATTRIBUTES, are shown as “None”.
    '''
    # Everything is converted with str() first: formatting None with
    # “{:s}” raises TypeError, which used to abort the report for
    # every code point missing from one of the dictionaries.
    east_asian_width = str(EAST_ASIAN_WIDTHS.get(code_point))
    if code_point in UNICODE_ATTRIBUTES:
        attributes = UNICODE_ATTRIBUTES[code_point]
        category = str(attributes['category'])
        bidi = str(attributes['bidi'])
        name = str(attributes['name'])
    else:
        category = bidi = name = 'None'
    return ('eaw={:s} '.format(east_asian_width)
            + 'category={:2s} '.format(category)
            + 'bidi={:3s} '.format(bidi)
            + 'name={:s}'.format(name))

def check_width(original_file_name, new_file_name):
    '''Report differences in the WIDTH section between the old and the new
    file

    Prints to standard output how many code points were removed from,
    changed in, and added to the WIDTH section of new_file_name
    compared with original_file_name.  The individual code points are
    listed as well when the corresponding ARGS.show_missing_characters,
    ARGS.show_changed_characters or ARGS.show_added_characters option
    is set.
    '''
    separator = '------------------------------------------------------------'
    print('************************************************************')
    print('Report on WIDTH:')
    owidth = create_width_dictionary(original_file_name)
    nwidth = create_width_dictionary(new_file_name)
    # Compute each difference once; it is needed for the total and
    # for the detailed listing.
    removed = sorted(set(owidth) - set(nwidth))
    added = sorted(set(nwidth) - set(owidth))
    changed_width = {}
    for key in set(owidth).intersection(set(nwidth)):
        if owidth[key] != nwidth[key]:
            changed_width[key] = (owidth[key], nwidth[key])
    print(separator)
    print('Total removed characters in newly generated WIDTH: %d'
          %len(removed))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these have width 1 now.)')
    if ARGS.show_missing_characters:
        for key in removed:
            print('removed: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(owidth[key])
                  + _width_report_details(key))
    print(separator)
    print('Total changed characters in newly generated WIDTH: %d'
          %len(changed_width))
    if ARGS.show_changed_characters:
        for key in sorted(changed_width):
            print('changed width: {:s} '.format(ucs_symbol(key))
                  + '{:d}->{:d} : '.format(changed_width[key][0],
                                          changed_width[key][1])
                  + _width_report_details(key))
    print(separator)
    print('Total added characters in newly generated WIDTH: %d'
          %len(added))
    print('(Characters not in WIDTH get width 1 by default, '
          + 'i.e. these had width 1 before.)')
    if ARGS.show_added_characters:
        for key in added:
            print('added: {:s} '.format(ucs_symbol(key))
                  + '{:d} : '.format(nwidth[key])
                  + _width_report_details(key))

if __name__ == "__main__":
    # Command line entry point: parse the options into the module
    # level ARGS (read by check_charmap() and check_width()), load the
    # optional Unicode data files and print both reports.
    PARSER = argparse.ArgumentParser(
        description='''
        Compare the CHARMAP and WIDTH sections of two UTF-8 charmap
        files and report the differences.
        ''')
    # No nargs='?' on the two required options: with it, a bare “-o”
    # or “-n” was accepted with the value None and the script crashed
    # later when trying to open None.  Now argparse reports the
    # missing argument itself.
    PARSER.add_argument(
        '-o', '--old_utf8_file',
        required=True,
        type=str,
        help='The old UTF-8 file.')
    PARSER.add_argument(
        '-n', '--new_utf8_file',
        required=True,
        type=str,
        help='The new UTF-8 file.')
    # nargs='?' is kept on the optional files for backward
    # compatibility: the option given without a value is treated like
    # an absent option.
    PARSER.add_argument(
        '-u', '--unicode_data_file',
        nargs='?',
        type=str,
        help='The UnicodeData.txt file to read.')
    PARSER.add_argument(
        '-e', '--east_asian_width_file',
        nargs='?',
        type=str,
        help='The EastAsianWidth.txt file to read.')
    PARSER.add_argument(
        '-a', '--show_added_characters',
        action='store_true',
        help='Show characters which were added in detail.')
    PARSER.add_argument(
        '-m', '--show_missing_characters',
        action='store_true',
        help='Show characters which were removed in detail.')
    PARSER.add_argument(
        '-c', '--show_changed_characters',
        action='store_true',
        help='Show characters which were changed in detail.')
    ARGS = PARSER.parse_args()

    # The Unicode data files only add details (names, categories, …)
    # to the reports; both are optional.
    if ARGS.unicode_data_file:
        fill_attributes(ARGS.unicode_data_file)
    if ARGS.east_asian_width_file:
        fill_east_asian_widths(ARGS.east_asian_width_file)
    check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)
    check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)
Unicode 7.0.0 update; added generator scripts. for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators. 2015-02-21 06:14:59 +08:00			`#!/usr/bin/python3`
			`# -- coding: utf-8 --`
Amendments to Unicode 7 update. for ChangeLog * include/stdc-predef.h (__STDC_ISO_10646__): Update to 201304L, for Unicode 7. for localedata/ChangeLog * unicode-gen/ctype_compatibility.py: Use date ranges in copyright notice. * unicode-gen/ctype_compatibility_test_cases.py: Likewise. * unicode-gen/gen_unicode_ctype.py: Likewise. * unicode-gen/utf8_compatibility.py: Likewise. * unicode-gen/utf8_gen.py: Likewise. Use upper case for global variables, use tuples for global constant arrays. From Mike FABIAN. Suggested by Mike Frysinger <vapier@gentoo.org>. 2015-02-23 22:22:37 +08:00			`# Copyright (C) 2014-2015 Free Software Foundation, Inc.`
Unicode 7.0.0 update; added generator scripts. for localedata/ChangeLog [BZ #17588] [BZ #13064] [BZ #14094] [BZ #17998] * unicode-gen/Makefile: New. * unicode-gen/unicode-license.txt: New, from Unicode. * unicode-gen/UnicodeData.txt: New, from Unicode. * unicode-gen/DerivedCoreProperties.txt: New, from Unicode. * unicode-gen/EastAsianWidth.txt: New, from Unicode. * unicode-gen/gen_unicode_ctype.py: New generator, from Mike FABIAN <mfabian@redhat.com>. * unicode-gen/ctype_compatibility.py: New verifier, from Pravin Satpute <psatpute@redhat.com> and Mike FABIAN. * unicode-gen/ctype_compatibility_test_cases.py: New verifier module, from Mike FABIAN. * unicode-gen/utf8_gen.py: New generator, from Pravin Satpute and Mike FABIAN. * unicode-gen/utf8_compatibility.py: New verifier, from Pravin Satpute and Mike FABIAN. * charmaps/UTF-8: Update. * locales/i18n: Update. * gen-unicode-ctype.c: Remove. * tst-ctype-de_DE.ISO-8859-1.in: Adjust, islower now returns true for ordinal indicators. 2015-02-21 06:14:59 +08:00			`# This file is part of the GNU C Library.`
			`#`
			`# The GNU C Library is free software; you can redistribute it and/or`
			`# modify it under the terms of the GNU Lesser General Public`
			`# License as published by the Free Software Foundation; either`
			`# version 2.1 of the License, or (at your option) any later version.`
			`#`
			`# The GNU C Library is distributed in the hope that it will be useful,`
			`# but WITHOUT ANY WARRANTY; without even the implied warranty of`
			`# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU`
			`# Lesser General Public License for more details.`
			`#`
			`# You should have received a copy of the GNU Lesser General Public`
			`# License along with the GNU C Library; if not, see`
			`# <http://www.gnu.org/licenses/>.`

			`'''`
			`This script is useful for checking backward compatibility of newly`
			`generated UTF-8 file from utf8_gen.py script`

			`To see how this script is used, call it with the “-h” option:`

			`$ ./utf8_compatibility.py -h`
			`… prints usage message …`
			`'''`

			`import sys`
			`import re`
			`import argparse`

			`# Dictionary holding the entire contents of the UnicodeData.txt file`
			`#`
			`# Contents of this dictionary look like this:`
			`#`
			`# {0: {'category': 'Cc',`
			`# 'title': None,`
			`# 'digit': '',`
			`# 'name': '<control>',`
			`# 'bidi': 'BN',`
			`# 'combining': '0',`
			`# 'comment': '',`
			`# 'oldname': 'NULL',`
			`# 'decomposition': '',`
			`# 'upper': None,`
			`# 'mirrored': 'N',`
			`# 'lower': None,`
			`# 'decdigit': '',`
			`# 'numeric': ''},`
			`# …`
			`# }`
			`UNICODE_ATTRIBUTES = {}`

			`# Dictionary holding the entire contents of the EastAsianWidths.txt file`
			`#`
			`# Contents of this dictionary look like this:`
			`#`
			`# {0: 'N', … , 45430: 'W', …}`
			`EAST_ASIAN_WIDTHS = {}`

			`def fill_attribute(code_point, fields):`
			`'''Stores in UNICODE_ATTRIBUTES[code_point] the values from the fields.`

			`One entry in the UNICODE_ATTRIBUTES dictionary represents one line`
			`in the UnicodeData.txt file.`

			`'''`
			`UNICODE_ATTRIBUTES[code_point] = {`
			`'name': fields[1], # Character name`
			`'category': fields[2], # General category`
			`'combining': fields[3], # Canonical combining classes`
			`'bidi': fields[4], # Bidirectional category`
			`'decomposition': fields[5], # Character decomposition mapping`
			`'decdigit': fields[6], # Decimal digit value`
			`'digit': fields[7], # Digit value`
			`'numeric': fields[8], # Numeric value`
			`'mirrored': fields[9], # mirrored`
			`'oldname': fields[10], # Old Unicode 1.0 name`
			`'comment': fields[11], # comment`
			`# Uppercase mapping`
			`'upper': int(fields[12], 16) if fields[12] else None,`
			`# Lowercase mapping`
			`'lower': int(fields[13], 16) if fields[13] else None,`
			`# Titlecase mapping`
			`'title': int(fields[14], 16) if fields[14] else None,`
			`}`

			`def fill_attributes(filename):`
			`'''Stores the entire contents of the UnicodeData.txt file`
			`in the UNICODE_ATTRIBUTES dictionary.`

			`A typical line for a single code point in UnicodeData.txt looks`
			`like this:`

			`0041;LATIN CAPITAL LETTER A;Lu;0;L;;;;;N;;;;0061;`

			`Code point ranges are indicated by pairs of lines like this:`

			`4E00;<CJK Ideograph, First>;Lo;0;L;;;;;N;;;;;`
			`9FCC;<CJK Ideograph, Last>;Lo;0;L;;;;;N;;;;;`
			`'''`
			`with open(filename, mode='r') as unicode_data_file:`
			`fields_start = []`
			`for line in unicode_data_file:`
			`fields = line.strip().split(';')`
			`if len(fields) != 15:`
			`sys.stderr.write(`
			`'short line in file "%(f)s": %(l)s\n' %{`
			`'f': filename, 'l': line})`
			`exit(1)`
			`if fields[2] == 'Cs':`
			`# Surrogates are UTF-16 artefacts,`
			`# not real characters. Ignore them.`
			`fields_start = []`
			`continue`
			`if fields[1].endswith(', First>'):`
			`fields_start = fields`
			`fields_start[1] = fields_start[1].split(',')[0][1:]`
			`continue`
			`if fields[1].endswith(', Last>'):`
			`fields[1] = fields[1].split(',')[0][1:]`
			`if fields[1:] != fields_start[1:]:`
			`sys.stderr.write(`
			`'broken code point range in file "%(f)s": %(l)s\n' %{`
			`'f': filename, 'l': line})`
			`exit(1)`
			`for code_point in range(`
			`int(fields_start[0], 16),`
			`int(fields[0], 16)+1):`
			`fill_attribute(code_point, fields)`
			`fields_start = []`
			`continue`
			`fill_attribute(int(fields[0], 16), fields)`
			`fields_start = []`

			`def fill_east_asian_widths(filename):`
			`'''Stores the entire contents of the EastAsianWidths.txt file`
			`in the EAST_ASIAN_WIDTHS dictionary.`

			`Lines in EastAsianWidths.txt are either a code point range like`
			`this:`

			`9FCD..9FFF;W # Cn [51] <reserved-9FCD>..<reserved-9FFF>`

			`or a single code point like this:`

			`A015;W # Lm YI SYLLABLE WU`
			`'''`
			`with open(filename, mode='r') as east_asian_widths_file:`
			`for line in east_asian_widths_file:`
			`match = re.match(`
			`r'^(?P<codepoint1>[0-9A-F]{4,6})'`
			`+r'(?:\.\.(?P<codepoint2>[0-9A-F]{4,6}))?'`
			`+r'\s;\s(?P<property>[a-zA-Z]+)',`
			`line)`
			`if not match:`
			`continue`
			`start = match.group('codepoint1')`
			`end = match.group('codepoint2')`
			`if not end:`
			`end = start`
			`for code_point in range(int(start, 16), int(end, 16)+1):`
			`EAST_ASIAN_WIDTHS[code_point] = match.group('property')`

			`def ucs_symbol(code_point):`
			`'''Return the UCS symbol string for a Unicode character.'''`
			`if code_point < 0x10000:`
			`return '<U{:04X}>'.format(code_point)`
			`else:`
			`return '<U{:08X}>'.format(code_point)`

			`def create_charmap_dictionary(file_name):`
			`'''Create a dictionary for all code points found in the CHARMAP`
			`section of a file`
			`'''`
			`with open(file_name, mode='r') as utf8_file:`
			`charmap_dictionary = {}`
			`for line in utf8_file:`
			`if line.startswith('CHARMAP'):`
			`break`
			`for line in utf8_file:`
			`if line.startswith('END CHARMAP'):`
			`return charmap_dictionary`
			`if line.startswith('%'):`
			`continue`
			`match = re.match(`
			`r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'`
			`+r'(:?\.\.<U(?P<codepoint2>[0-9-A-F]{4,8})>)?'`
			`+r'\s+(?P<hexutf8>(/x[0-9a-f]{2}){1,4})',`
			`line)`
			`if not match:`
			`continue`
			`codepoint1 = match.group('codepoint1')`
			`codepoint2 = match.group('codepoint2')`
			`if not codepoint2:`
			`codepoint2 = codepoint1`
			`for i in range(int(codepoint1, 16),`
			`int(codepoint2, 16) + 1):`
			`charmap_dictionary[i] = match.group('hexutf8')`
			`sys.stderr.write('No “CHARMAP” or no “END CHARMAP” found in %s\n'`
			`%file_name)`
			`exit(1)`

			`def check_charmap(original_file_name, new_file_name):`
			`'''Report differences in the CHARMAP section between the old and the`
			`new file`
			`'''`
			`print('************************************************************')`
			`print('Report on CHARMAP:')`
			`ocharmap = create_charmap_dictionary(original_file_name)`
			`ncharmap = create_charmap_dictionary(new_file_name)`
			`print('------------------------------------------------------------')`
			`print('Total removed characters in newly generated CHARMAP: %d'`
			`%len(set(ocharmap)-set(ncharmap)))`
			`if ARGS.show_missing_characters:`
			`for key in sorted(set(ocharmap)-set(ncharmap)):`
			`print('removed: {:s} {:s} {:s}'.format(`
			`ucs_symbol(key),`
			`ocharmap[key],`
			`UNICODE_ATTRIBUTES[key]['name'] \`
			`if key in UNICODE_ATTRIBUTES else None))`
			`print('------------------------------------------------------------')`
			`changed_charmap = {}`
			`for key in set(ocharmap).intersection(set(ncharmap)):`
			`if ocharmap[key] != ncharmap[key]:`
			`changed_charmap[key] = (ocharmap[key], ncharmap[key])`
			`print('Total changed characters in newly generated CHARMAP: %d'`
			`%len(changed_charmap))`
			`if ARGS.show_changed_characters:`
			`for key in sorted(changed_charmap):`
			`print('changed: {:s} {:s}->{:s} {:s}'.format(`
			`ucs_symbol(key),`
			`changed_charmap[key][0],`
			`changed_charmap[key][1],`
			`UNICODE_ATTRIBUTES[key]['name'] \`
			`if key in UNICODE_ATTRIBUTES else None))`
			`print('------------------------------------------------------------')`
			`print('Total added characters in newly generated CHARMAP: %d'`
			`%len(set(ncharmap)-set(ocharmap)))`
			`if ARGS.show_added_characters:`
			`for key in sorted(set(ncharmap)-set(ocharmap)):`
			`print('added: {:s} {:s} {:s}'.format(`
			`ucs_symbol(key),`
			`ncharmap[key],`
			`UNICODE_ATTRIBUTES[key]['name'] \`
			`if key in UNICODE_ATTRIBUTES else None))`

			`def create_width_dictionary(file_name):`
			`'''Create a dictionary for all code points found in the WIDTH`
			`section of a file`
			`'''`
			`with open(file_name, mode='r') as utf8_file:`
			`width_dictionary = {}`
			`for line in utf8_file:`
			`if line.startswith('WIDTH'):`
			`break`
			`for line in utf8_file:`
			`if line.startswith('END WIDTH'):`
			`return width_dictionary`
			`match = re.match(`
			`r'^<U(?P<codepoint1>[0-9A-F]{4,8})>'`
			`+r'(:?\.\.\.<U(?P<codepoint2>[0-9-A-F]{4,8})>)?'`
			`+r'\s+(?P<width>[02])',`
			`line)`
			`if not match:`
			`continue`
			`codepoint1 = match.group('codepoint1')`
			`codepoint2 = match.group('codepoint2')`
			`if not codepoint2:`
			`codepoint2 = codepoint1`
			`for i in range(int(codepoint1, 16),`
			`int(codepoint2, 16) + 1):`
			`width_dictionary[i] = int(match.group('width'))`
			`sys.stderr.write('No “WIDTH” or no “END WIDTH” found in %s\n' %file)`

			`def check_width(original_file_name, new_file_name):`
			`'''Report differences in the WIDTH section between the old and the new`
			`file`
			`'''`
			`print('************************************************************')`
			`print('Report on WIDTH:')`
			`owidth = create_width_dictionary(original_file_name)`
			`nwidth = create_width_dictionary(new_file_name)`
			`print('------------------------------------------------------------')`
			`print('Total removed characters in newly generated WIDTH: %d'`
			`%len(set(owidth)-set(nwidth)))`
			`print('(Characters not in WIDTH get width 1 by default, '`
			`+ 'i.e. these have width 1 now.)')`
			`if ARGS.show_missing_characters:`
			`for key in sorted(set(owidth)-set(nwidth)):`
			`print('removed: {:s} '.format(ucs_symbol(key))`
			`+ '{:d} : '.format(owidth[key])`
			`+ 'eaw={:s} '.format(`
			`EAST_ASIAN_WIDTHS[key]`
			`if key in EAST_ASIAN_WIDTHS else None)`
			`+ 'category={:2s} '.format(`
			`UNICODE_ATTRIBUTES[key]['category']`
			`if key in UNICODE_ATTRIBUTES else None)`
			`+ 'bidi={:3s} '.format(`
			`UNICODE_ATTRIBUTES[key]['bidi']`
			`if key in UNICODE_ATTRIBUTES else None)`
			`+ 'name={:s}'.format(`
			`UNICODE_ATTRIBUTES[key]['name']`
			`if key in UNICODE_ATTRIBUTES else None))`
			`print('------------------------------------------------------------')`
			`changed_width = {}`
			`for key in set(owidth).intersection(set(nwidth)):`
			`if owidth[key] != nwidth[key]:`
			`changed_width[key] = (owidth[key], nwidth[key])`
			`print('Total changed characters in newly generated WIDTH: %d'`
			`%len(changed_width))`
			`if ARGS.show_changed_characters:`
			`for key in sorted(changed_width):`
			`print('changed width: {:s} '.format(ucs_symbol(key))`
			`+ '{:d}->{:d} : '.format(changed_width[key][0],`
			`changed_width[key][1])`
			`+ 'eaw={:s} '.format(`
			`EAST_ASIAN_WIDTHS[key]`
			`if key in EAST_ASIAN_WIDTHS else None)`
			`+ 'category={:2s} '.format(`
			`UNICODE_ATTRIBUTES[key]['category']`
			`if key in UNICODE_ATTRIBUTES else None)`
			`+ 'bidi={:3s} '.format(`
			`UNICODE_ATTRIBUTES[key]['bidi']`
			`if key in UNICODE_ATTRIBUTES else None)`
			`+ 'name={:s}'.format(`
			`UNICODE_ATTRIBUTES[key]['name']`
			`if key in UNICODE_ATTRIBUTES else None))`
			`print('------------------------------------------------------------')`
			`print('Total added characters in newly generated WIDTH: %d'`
			`%len(set(nwidth)-set(owidth)))`
			`print('(Characters not in WIDTH get width 1 by default, '`
			`+ 'i.e. these had width 1 before.)')`
			`if ARGS.show_added_characters:`
			`for key in sorted(set(nwidth)-set(owidth)):`
			`print('added: {:s} '.format(ucs_symbol(key))`
			`+ '{:d} : '.format(nwidth[key])`
			`+ 'eaw={:s} '.format(`
			`EAST_ASIAN_WIDTHS[key]`
			`if key in EAST_ASIAN_WIDTHS else None)`
			`+ 'category={:2s} '.format(`
			`UNICODE_ATTRIBUTES[key]['category']`
			`if key in UNICODE_ATTRIBUTES else None)`
			`+ 'bidi={:3s} '.format(`
			`UNICODE_ATTRIBUTES[key]['bidi']`
			`if key in UNICODE_ATTRIBUTES else None)`
			`+ 'name={:s}'.format(`
			`UNICODE_ATTRIBUTES[key]['name']`
			`if key in UNICODE_ATTRIBUTES else None))`

			`if __name__ == "__main__":`
			`PARSER = argparse.ArgumentParser(`
			`description='''`
			`Compare the contents of LC_CTYPE in two files and check for errors.`
			`''')`
			`PARSER.add_argument(`
			`'-o', '--old_utf8_file',`
			`nargs='?',`
			`required=True,`
			`type=str,`
			`help='The old UTF-8 file.')`
			`PARSER.add_argument(`
			`'-n', '--new_utf8_file',`
			`nargs='?',`
			`required=True,`
			`type=str,`
			`help='The new UTF-8 file.')`
			`PARSER.add_argument(`
			`'-u', '--unicode_data_file',`
			`nargs='?',`
			`type=str,`
			`help='The UnicodeData.txt file to read.')`
			`PARSER.add_argument(`
			`'-e', '--east_asian_width_file',`
			`nargs='?',`
			`type=str,`
			`help='The EastAsianWidth.txt file to read.')`
			`PARSER.add_argument(`
			`'-a', '--show_added_characters',`
			`action='store_true',`
			`help='Show characters which were added in detail.')`
			`PARSER.add_argument(`
			`'-m', '--show_missing_characters',`
			`action='store_true',`
			`help='Show characters which were removed in detail.')`
			`PARSER.add_argument(`
			`'-c', '--show_changed_characters',`
			`action='store_true',`
			`help='Show characters whose width was changed in detail.')`
			`ARGS = PARSER.parse_args()`

			`if ARGS.unicode_data_file:`
			`fill_attributes(ARGS.unicode_data_file)`
			`if ARGS.east_asian_width_file:`
			`fill_east_asian_widths(ARGS.east_asian_width_file)`
			`check_charmap(ARGS.old_utf8_file, ARGS.new_utf8_file)`
			`check_width(ARGS.old_utf8_file, ARGS.new_utf8_file)`