utils/unicode-case-fold.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137

#!/usr/bin/env python
"""
Unicode case folding database conversion utility

Parses the database and generates a C++ function which implements the case
folding algorithm. The database entries are of the form:

  <code>; <status>; <mapping>; # <name>

<status> can be one of four characters:
  C - Common mappings
  S - mappings for Simple case folding
  F - mappings for Full case folding
  T - special case for Turkish I characters

Right now this generates a function which implements simple case folding (C+S
entries).
"""

import sys
import re
import urllib2

# This variable will body of the mappings function
body = ""

# Reads file line-by-line, extracts Common and Simple case fold mappings and
# returns a (from_char, to_char, from_name) tuple.
def mappings(f):
    previous_from = -1
    expr = re.compile(r'^(.*); [CS]; (.*); # (.*)')
    for line in f:
        m = expr.match(line)
        if not m: continue
        from_char = int(m.group(1), 16)
        to_char = int(m.group(2), 16)
        from_name = m.group(3)

        if from_char <= previous_from:
            raise Exception("Duplicate or unsorted characters in input")
        yield from_char, to_char, from_name
        previous_from = from_char

# Computes the shift (to_char - from_char) in a mapping.
def shift(mapping):
    return mapping[1] - mapping[0]

# Computes the stride (from_char2 - from_char1) of two mappings.
def stride2(mapping1, mapping2):
    return mapping2[0] - mapping1[0]

# Computes the stride of a list of mappings. The list should have at least two
# mappings. All mappings in the list are assumed to have the same stride.
def stride(block):
    return stride2(block[0], block[1])


# b is a list of mappings. All the mappings are assumed to have the same
# shift and the stride between adjecant mappings (if any) is constant.
def dump_block(b):
    global body

    if len(b) == 1:
        # Special case for handling blocks of length 1. We don't even need to
        # emit the "if (C < X) return C" check below as all characters in this
        # range will be caught by the "C < X" check emitted by the first
        # non-trivial block.
        body  += "  // {2}\n  if (C == {0:#06x})\n    return {1:#06x};\n".format(*b[0])
        return

    first = b[0][0]
    last = first + stride(b) * (len(b)-1)
    modulo = first % stride(b)

    # All characters before this block map to themselves.
    body += "  if (C < {0:#06x})\n    return C;\n".format(first)
    body += "  // {0} characters\n".format(len(b))

    # Generic pattern: check upper bound (lower bound is checked by the "if"
    # above) and modulo of C, return C+shift.
    pattern = "  if (C <= {0:#06x} && C % {1} == {2})\n    return C + {3};\n"

    if stride(b) == 2 and shift(b[0]) == 1 and modulo == 0:
        # Special case:
        # We can elide the modulo-check because the expression "C|1" will map
        # the intervening characters to themselves.
        pattern = "  if (C <= {0:#06x})\n    return C | 1;\n"
    elif stride(b) == 1:
        # Another special case: X % 1 is always zero, so don't emit the
        # modulo-check.
        pattern = "  if (C <= {0:#06x})\n    return C + {3};\n"

    body += pattern.format(last, stride(b), modulo, shift(b[0]))

current_block = []
f = urllib2.urlopen(sys.argv[1])
for m in mappings(f):
    if len(current_block) == 0:
        current_block.append(m)
        continue

    if shift(current_block[0]) != shift(m):
        # Incompatible shift, start a new block.
        dump_block(current_block)
        current_block = [m]
        continue

    if len(current_block) == 1 or stride(current_block) == stride2(current_block[-1], m):
        current_block.append(m)
        continue

    # Incompatible stride, start a new block.
    dump_block(current_block)
    current_block = [m]
f.close()

dump_block(current_block)

print '//===---------- Support/UnicodeCaseFold.cpp -------------------------------===//'
print '//'
print '// This file was generated by utils/unicode-case-fold.py from the Unicode'
print '// case folding database at'
print '//   ', sys.argv[1]
print '//'
print '// To regenerate this file, run:'
print '//   utils/unicode-case-fold.py \\'
print '//     "{}" \\'.format(sys.argv[1])
print '//     > lib/Support/UnicodeCaseFold.cpp'
print '//'
print '//===----------------------------------------------------------------------===//'
print ''
print '#include "llvm/Support/Unicode.h"'
print ''
print "int llvm::sys::unicode::foldCharSimple(int C) {"
print body
print "  return C;"
print "}"