summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorMike FABIAN <mfabian@redhat.com>2018-07-10 11:25:48 +0200
committerMike FABIAN <mfabian@redhat.com>2018-07-10 17:30:31 +0200
commit4beefeeb8ea80fece3d55c82b2b152ed9c78cdbe (patch)
treec4fef129f6932b676780d25d3a0335a536731e30
parentfd70af45528d59a00eb3190ef6706cb299488fcd (diff)
Put the correct Unicode version number 11.0.0 into the generated files
In some places there was still the old Unicode version 10.0.0 in the files. * localedata/charmaps/UTF-8: Use correct Unicode version 11.0.0 in comment. * localedata/locales/i18n_ctype: Use correct Unicode version in comments and headers. * localedata/unicode-gen/utf8_gen.py: Add option to specify Unicode version * localedata/unicode-gen/Makefile: Use option to specify Unicode version for utf8_gen.py
-rw-r--r--ChangeLog9
-rw-r--r--localedata/charmaps/UTF-82
-rw-r--r--localedata/locales/i18n_ctype6
-rw-r--r--localedata/unicode-gen/Makefile4
-rwxr-xr-xlocaledata/unicode-gen/utf8_gen.py112
5 files changed, 88 insertions, 45 deletions
diff --git a/ChangeLog b/ChangeLog
index 8a4f85efec..1981cf5412 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,12 @@
+2018-07-10 Mike FABIAN <mfabian@redhat.com>
+
+ * localedata/charmaps/UTF-8: Use correct Unicode version 11.0.0 in comment.
+ * localedata/locales/i18n_ctype: Use correct Unicode version in comments
+ and headers.
+ * localedata/unicode-gen/utf8_gen.py: Add option to specify Unicode version
+ * localedata/unicode-gen/Makefile: Use option to specify Unicode version
+ for utf8_gen.py
+
2018-07-10 Florian Weimer <fweimer@redhat.com>
* io/Makefile (routines): Add statx.
diff --git a/localedata/charmaps/UTF-8 b/localedata/charmaps/UTF-8
index 885c6ae7fc..1367aa46cf 100644
--- a/localedata/charmaps/UTF-8
+++ b/localedata/charmaps/UTF-8
@@ -47069,7 +47069,7 @@ CHARMAP
<U0010FFC0>..<U0010FFFD> /xf4/x8f/xbf/x80 <Plane 16 Private Use>
END CHARMAP
-% Character width according to Unicode 10.0.0.
+% Character width according to Unicode 11.0.0.
% - Default width is 1.
% - Double-width characters have width 2; generated from
% "grep '^[^;]*;[WF]' EastAsianWidth.txt"
diff --git a/localedata/locales/i18n_ctype b/localedata/locales/i18n_ctype
index ed59aef947..26400cbff1 100644
--- a/localedata/locales/i18n_ctype
+++ b/localedata/locales/i18n_ctype
@@ -13,10 +13,10 @@ comment_char %
% information, but with different transliterations, can include it
% directly.
-% Generated automatically by gen_unicode_ctype.py for Unicode 10.0.0.
+% Generated automatically by gen_unicode_ctype.py for Unicode 11.0.0.
LC_IDENTIFICATION
-title "Unicode 10.0.0 FDCC-set"
+title "Unicode 11.0.0 FDCC-set"
source "UnicodeData.txt, DerivedCoreProperties.txt"
address ""
contact ""
@@ -25,7 +25,7 @@ tel ""
fax ""
language ""
territory "Earth"
-revision "10.0.0"
+revision "11.0.0"
date "2018-06-20"
category "i18n:2012";LC_CTYPE
END LC_IDENTIFICATION
diff --git a/localedata/unicode-gen/Makefile b/localedata/unicode-gen/Makefile
index c0faae5e58..c2b5fa75e1 100644
--- a/localedata/unicode-gen/Makefile
+++ b/localedata/unicode-gen/Makefile
@@ -92,7 +92,9 @@ tr_TR: gen_unicode_ctype.py
UTF-8: UnicodeData.txt EastAsianWidth.txt
UTF-8: utf8_gen.py
- $(PYTHON3) utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt
+ $(PYTHON3) utf8_gen.py -u UnicodeData.txt \
+ -e EastAsianWidth.txt -p PropList.txt \
+ --unicode_version $(UNICODE_VERSION)
UTF-8-report: UTF-8 ../charmaps/UTF-8
UTF-8-report: utf8_compatibility.py
diff --git a/localedata/unicode-gen/utf8_gen.py b/localedata/unicode-gen/utf8_gen.py
index 715b753ec1..2d8d631a96 100755
--- a/localedata/unicode-gen/utf8_gen.py
+++ b/localedata/unicode-gen/utf8_gen.py
@@ -27,6 +27,7 @@ Usage: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt
It will output UTF-8 file
'''
+import argparse
import sys
import re
import unicode_utils
@@ -197,9 +198,10 @@ def write_header_charmap(outfile):
outfile.write("% alias ISO-10646/UTF-8\n")
outfile.write("CHARMAP\n")
-def write_header_width(outfile):
+def write_header_width(outfile, unicode_version):
'''Writes the header on top of the WIDTH section to the output file'''
- outfile.write('% Character width according to Unicode 10.0.0.\n')
+ outfile.write('% Character width according to Unicode '
+ + '{:s}.\n'.format(unicode_version))
outfile.write('% - Default width is 1.\n')
outfile.write('% - Double-width characters have width 2; generated from\n')
outfile.write('% "grep \'^[^;]*;[WF]\' EastAsianWidth.txt"\n')
@@ -292,41 +294,71 @@ def process_width(outfile, ulines, elines, plines):
width_dict[same_width_list[0]]))
if __name__ == "__main__":
- if len(sys.argv) < 3:
- print("USAGE: python3 utf8_gen.py UnicodeData.txt EastAsianWidth.txt PropList.txt")
- else:
- with open(sys.argv[1], mode='r') as UNIDATA_FILE:
- UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
- with open(sys.argv[2], mode='r') as EAST_ASIAN_WIDTH_FILE:
- EAST_ASIAN_WIDTH_LINES = []
- for LINE in EAST_ASIAN_WIDTH_FILE:
- # If characters from EastAsianWidth.txt which are from
- # reserved ranges (i.e. not yet assigned code points)
- # are added to the WIDTH section of the UTF-8 file, then
- # “make check” produces “Unknown Character” errors for
- # these code points because such unassigned code points
- # are not in the CHARMAP section of the UTF-8 file.
- #
- # Therefore, we skip all reserved code points when reading
- # the EastAsianWidth.txt file.
- if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
- continue
- if re.match(r'^[^;]*;[WF]', LINE):
- EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
- with open(sys.argv[3], mode='r') as PROP_LIST_FILE:
- PROP_LIST_LINES = []
- for LINE in PROP_LIST_FILE:
- if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
- PROP_LIST_LINES.append(LINE.strip())
- with open('UTF-8', mode='w') as OUTFILE:
- # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
- write_header_charmap(OUTFILE)
- process_charmap(UNICODE_DATA_LINES, OUTFILE)
- OUTFILE.write("END CHARMAP\n\n")
- # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
- write_header_width(OUTFILE)
- process_width(OUTFILE,
- UNICODE_DATA_LINES,
- EAST_ASIAN_WIDTH_LINES,
- PROP_LIST_LINES)
- OUTFILE.write("END WIDTH\n")
+ PARSER = argparse.ArgumentParser(
+ description='''
+ Generate a UTF-8 file from UnicodeData.txt, EastAsianWidth.txt, and PropList.txt.
+ ''')
+ PARSER.add_argument(
+ '-u', '--unicode_data_file',
+ nargs='?',
+ type=str,
+ default='UnicodeData.txt',
+ help=('The UnicodeData.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '-e', '--east_asian_with_file',
+ nargs='?',
+ type=str,
+ default='EastAsianWidth.txt',
+ help=('The EastAsianWidth.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '-p', '--prop_list_file',
+ nargs='?',
+ type=str,
+ default='PropList.txt',
+ help=('The PropList.txt file to read, '
+ + 'default: %(default)s'))
+ PARSER.add_argument(
+ '--unicode_version',
+ nargs='?',
+ required=True,
+ type=str,
+ help='The Unicode version of the input files used.')
+ ARGS = PARSER.parse_args()
+
+ with open(ARGS.unicode_data_file, mode='r') as UNIDATA_FILE:
+ UNICODE_DATA_LINES = UNIDATA_FILE.readlines()
+ with open(ARGS.east_asian_with_file, mode='r') as EAST_ASIAN_WIDTH_FILE:
+ EAST_ASIAN_WIDTH_LINES = []
+ for LINE in EAST_ASIAN_WIDTH_FILE:
+ # If characters from EastAsianWidth.txt which are from
+ # reserved ranges (i.e. not yet assigned code points)
+ # are added to the WIDTH section of the UTF-8 file, then
+ # “make check” produces “Unknown Character” errors for
+ # these code points because such unassigned code points
+ # are not in the CHARMAP section of the UTF-8 file.
+ #
+ # Therefore, we skip all reserved code points when reading
+ # the EastAsianWidth.txt file.
+ if re.match(r'.*<reserved-.+>\.\.<reserved-.+>.*', LINE):
+ continue
+ if re.match(r'^[^;]*;[WF]', LINE):
+ EAST_ASIAN_WIDTH_LINES.append(LINE.strip())
+ with open(ARGS.prop_list_file, mode='r') as PROP_LIST_FILE:
+ PROP_LIST_LINES = []
+ for LINE in PROP_LIST_FILE:
+ if re.match(r'^[^;]*;[\s]*Prepended_Concatenation_Mark', LINE):
+ PROP_LIST_LINES.append(LINE.strip())
+ with open('UTF-8', mode='w') as OUTFILE:
+ # Processing UnicodeData.txt and write CHARMAP to UTF-8 file
+ write_header_charmap(OUTFILE)
+ process_charmap(UNICODE_DATA_LINES, OUTFILE)
+ OUTFILE.write("END CHARMAP\n\n")
+ # Processing EastAsianWidth.txt and write WIDTH to UTF-8 file
+ write_header_width(OUTFILE, ARGS.unicode_version)
+ process_width(OUTFILE,
+ UNICODE_DATA_LINES,
+ EAST_ASIAN_WIDTH_LINES,
+ PROP_LIST_LINES)
+ OUTFILE.write("END WIDTH\n")