1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109import { Generator, Context } from "./unicode-generator";
// Membership sets keyed by code point; the `check` callbacks in main() query them with `.has(cp)`.

// ES5 identifier start: union of the Unicode 3.0.0 Uppercase_Letter, Lowercase_Letter,
// Titlecase_Letter, Modifier_Letter and Other_Letter general categories.
const idStartES5Set = new Set<number>();
for (const codePoints of [
  require("@unicode/unicode-3.0.0/General_Category/Uppercase_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Lowercase_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Titlecase_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Modifier_Letter/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Other_Letter/code-points"),
]) {
  for (const cp of codePoints) {
    idStartES5Set.add(cp);
  }
}

// ES5 identifier continue: everything in the start set, followed by the Unicode 3.0.0
// Nonspacing_Mark, Spacing_Mark, Decimal_Number and Connector_Punctuation categories.
const idContinueES5Set = new Set<number>(idStartES5Set);
for (const codePoints of [
  require("@unicode/unicode-3.0.0/General_Category/Nonspacing_Mark/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Spacing_Mark/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Decimal_Number/code-points"),
  require("@unicode/unicode-3.0.0/General_Category/Connector_Punctuation/code-points"),
]) {
  for (const cp of codePoints) {
    idContinueES5Set.add(cp);
  }
}

// ESNext identifiers come straight from the Unicode 15.1.0 ID_Start / ID_Continue binary properties.
const idStartESNextSet = new Set<number>(
  require("@unicode/unicode-15.1.0/Binary_Property/ID_Start/code-points"),
);
const idContinueESNextSet = new Set<number>(
  require("@unicode/unicode-15.1.0/Binary_Property/ID_Continue/code-points"),
);

// Code points that main() removes from the ESNext ID_Continue table even though the
// Unicode data lists them.
// NOTE(review): presumably these are the entries Unicode 15.1.0 added to ID_Continue in error — confirm.
const ID_Continue_mistake = new Set<number>([0x30fb, 0xff65]);
/**
 * Packs a sequence of bit flags into 64-bit words.
 *
 * Flag `i` lands in word `floor(i / 64)` at bit position `i % 64`, so the first flag of
 * each group of 64 is the least significant bit. Any truthy entry counts as a set bit.
 * When the input length is not a multiple of 64, the unused high bits of the last word
 * stay zero.
 *
 * @param bits Flags to pack, in order.
 * @returns One bigint per started group of 64 flags; empty when `bits` is empty.
 */
function bitsToU64Array(bits: number[]): bigint[] {
  const wordCount = Math.ceil(bits.length / 64);
  return Array.from({ length: wordCount }, (_unused, wordIndex) => {
    const firstBit = wordIndex * 64;
    // slice() clamps at the end of the array, which handles the short final group.
    return bits
      .slice(firstBit, firstBit + 64)
      .reduce<bigint>((word, flag, offset) => (flag ? word | (1n << BigInt(offset)) : word), 0n);
  });
}
/**
 * Produces the Zig source for one code point membership test.
 *
 * A `Generator` is driven with `checkFn` as the per-code-point value, and the resulting
 * `stage1` / `stage2` arrays are embedded in a Zig struct called `table`. `stage2` is
 * packed into 64-bit words before being written out. The emitted function `name` rejects
 * values above 0x10FFFF, reads `stage1[cp >> 8]`, adds the low 8 bits of `cp` to get a bit
 * position, and tests that bit in `stage2`.
 *
 * @param table Identifier for the emitted Zig struct that holds the two stages.
 * @param name Identifier for the emitted Zig lookup function.
 * @param checkFn Returns true for code points that belong to the set.
 * @returns Zig source text for the lookup function followed by its table struct.
 */
async function generateTable(table: string, name: string, checkFn: (cp: number) => boolean): Promise<string> {
  const isMember = (cp: number): boolean => checkFn(cp);
  const sameFlag = (left: boolean, right: boolean): boolean => left === right;
  const context: Context<boolean> = { get: isMember, eql: sameFlag };

  const { stage1, stage2 } = await new Generator(context).generate();

  // Render both array bodies up front so the template below only interpolates strings.
  const stage1Items = stage1.join(",");
  const stage2Items = bitsToU64Array(stage2)
    .map(word => word.toString())
    .join(",");

  return `
pub fn ${name}(cp: u21) bool {
if (cp > 0x10FFFF) return false;
const high = cp >> 8;
const low = cp & 0xFF;
const stage2_idx = ${table}.stage1[high];
const bit_pos = stage2_idx + low;
const u64_idx = bit_pos >> 6;
const bit_idx = @as(u6, @intCast(bit_pos & 63));
return (${table}.stage2[u64_idx] & (@as(u64, 1) << bit_idx)) != 0;
}
const ${table} = struct {
pub const stage1 = [_]u16{${stage1Items}};
pub const stage2 = [_]u64{${stage2Items}};
};
`;
}
/**
 * Generates the four identifier tables (ES5 and ESNext, start and continue) and prints
 * the combined Zig source to stdout, preceded by an auto-generated notice.
 *
 * All tables are requested together through `Promise.all`; the output keeps the order of
 * the spec list below.
 */
async function main(): Promise<void> {
  const tableSpecs = [
    { name: "isIDStartES5", table: "idStartES5", check: (cp: number) => idStartES5Set.has(cp) },
    { name: "isIDContinueES5", table: "idContinueES5", check: (cp: number) => idContinueES5Set.has(cp) },
    { name: "isIDStartESNext", table: "idStartESNext", check: (cp: number) => idStartESNextSet.has(cp) },
    {
      name: "isIDContinueESNext",
      table: "idContinueESNext",
      // ID_Continue membership, minus the explicitly excluded code points.
      check: (cp: number) => idContinueESNextSet.has(cp) && !ID_Continue_mistake.has(cp),
    },
  ];

  const pending = tableSpecs.map(async spec => {
    const code = await generateTable(spec.table, spec.name, spec.check);
    return `
/// ${spec.name} checks if a codepoint is valid in the ${spec.name} category
${code}`;
  });
  const sections = await Promise.all(pending);

  console.log(`/// This file is auto-generated. Do not edit.
${sections.join("\n\n")}`);
}

// Script entry point; the returned promise is intentionally not awaited.
void main();