1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import crypto from "crypto";
// Types to mirror Zig's structures
/**
 * Callbacks that Generator uses to obtain and compare table elements.
 */
interface Context<Elem> {
/** Returns the element for `codepoint`, either directly or through a promise. */
get(codepoint: number): Promise<Elem> | Elem;
/**
 * Reports whether `a` and `b` count as the same element.
 * Generator.generate uses this to avoid storing an element in stage3 twice.
 */
eql(a: Elem, b: Elem): boolean;
}
/**
 * The three lookup arrays produced by Generator.generate.
 */
interface Tables<Elem> {
/** One entry per block of 256 codepoints: the position in `stage2` where that block's entries begin. */
stage1: number[];
/** Concatenated unique blocks; each entry is an index into `stage3`. */
stage2: number[];
/** The distinct elements, in order of first appearance. */
stage3: Elem[];
}
/**
 * Builds a three-stage lookup table that maps every Unicode codepoint
 * (0 through 0x10FFFF) to an element supplied by a `Context`.
 *
 * Codepoints are grouped into blocks of 256. Each block is a list of indices
 * into `stage3`; identical blocks are stored once in `stage2` and shared
 * through `stage1`, which records where each block starts inside `stage2`.
 */
class Generator<Elem> {
  /** Number of consecutive codepoints grouped into one block. */
  private static readonly BLOCK_SIZE = 256;
  /** Highest codepoint visited by `generate`. */
  private static readonly MAX_CODEPOINT = 0x10ffff;
  /** Largest stage3 index / stage2 offset accepted (16-bit limit). */
  private static readonly MAX_INDEX = 0xffff;

  private readonly ctx: Context<Elem>;

  /**
   * @param ctx Supplies the element for each codepoint and the equality
   *   test used to deduplicate elements.
   */
  constructor(ctx: Context<Elem>) {
    this.ctx = ctx;
  }

  /**
   * Returns a SHA-256 hex digest of `block` viewed as 16-bit values.
   * The digest is only used as a Map key to recognise identical blocks.
   */
  private hashBlock(block: number[]): string {
    const hash = crypto.createHash("sha256");
    hash.update(Buffer.from(new Uint16Array(block).buffer));
    return hash.digest("hex");
  }

  /**
   * Walks every codepoint, asks the context for its element, and produces
   * the three lookup stages.
   *
   * Each call starts from a clean slate, so the same generator may be used
   * for several calls and every call returns complete, self-consistent tables.
   *
   * @returns The generated `stage1`, `stage2` and `stage3` arrays.
   * @throws Error "Block index too large" when more than 0x10000 distinct
   *   elements are encountered.
   * @throws Error "Stage2 index too large" when a block would start past
   *   offset 0xFFFF in `stage2`.
   */
  async generate(): Promise<Tables<Elem>> {
    const stage1: number[] = [];
    const stage2: number[] = [];
    const stage3: Elem[] = [];
    // Block digest -> offset of that block in *this call's* stage2. It must
    // not outlive the call: offsets cached from an earlier run would point
    // into a stage2 array that no longer exists.
    const blockOffsets = new Map<string, number>();

    let block: number[] = new Array<number>(Generator.BLOCK_SIZE).fill(0);
    let blockLen = 0;

    for (let cp = 0; cp <= Generator.MAX_CODEPOINT; cp++) {
      const elem = await this.ctx.get(cp);

      // Find the element in stage3, appending it on first sight.
      let elemIdx = stage3.findIndex((item) => this.ctx.eql(item, elem));
      if (elemIdx === -1) {
        elemIdx = stage3.length;
        stage3.push(elem);
      }
      if (elemIdx > Generator.MAX_INDEX) {
        throw new Error("Block index too large");
      }

      block[blockLen] = elemIdx;
      blockLen++;

      // Keep filling until the block is complete or the codepoints run out.
      if (blockLen < Generator.BLOCK_SIZE && cp !== Generator.MAX_CODEPOINT) {
        continue;
      }

      // Zero the unused tail so a short final block hashes deterministically.
      if (blockLen < Generator.BLOCK_SIZE) {
        block.fill(0, blockLen);
      }

      // Reuse an identical block if one was already emitted.
      const blockHash = this.hashBlock(block);
      let stage2Idx = blockOffsets.get(blockHash);
      if (stage2Idx === undefined) {
        stage2Idx = stage2.length;
        blockOffsets.set(blockHash, stage2Idx);
        stage2.push(...block.slice(0, blockLen));
      }
      if (stage2Idx > Generator.MAX_INDEX) {
        throw new Error("Stage2 index too large");
      }

      stage1.push(stage2Idx);

      block = new Array<number>(Generator.BLOCK_SIZE).fill(0);
      blockLen = 0;
    }

    return { stage1, stage2, stage3 };
  }

  /**
   * Renders `tables` as Zig source: a generic function named `tableName`
   * returning a struct with `stage1`, `stage2` and `stage3` constants.
   *
   * `stage2` is declared as `u8` when every entry fits in a byte and as
   * `u16` otherwise, because `generate` permits stage3 indices up to 0xFFFF
   * and a `u8` array literal holding larger values would not compile.
   *
   * @param tableName Name of the emitted Zig function.
   * @param tables Tables to render.
   * @param elemToString Converts one stage3 element into a Zig expression.
   * @returns The generated Zig source text.
   */
  static writeZig<Elem>(
    tableName: string,
    tables: Tables<Elem>,
    elemToString: (elem: Elem) => string,
  ): string {
    const stage2Type = tables.stage2.some((entry) => entry > 0xff) ? "u16" : "u8";

    let output = `/// Auto-generated. Do not edit.\n`;
    output += `fn ${tableName}(comptime Elem: type) type {\n`;
    output += " return struct {\n";
    // Stage 1
    output += `pub const stage1: [${tables.stage1.length}]u16 = .{`;
    output += tables.stage1.join(",");
    output += "};\n\n";
    // Stage 2
    output += `pub const stage2: [${tables.stage2.length}]${stage2Type} = .{`;
    output += tables.stage2.join(",");
    output += "};\n\n";
    // Stage 3
    output += `pub const stage3: [${tables.stage3.length}]Elem = .{`;
    output += tables.stage3.map(elemToString).join(",");
    output += "};\n";
    output += " };\n}\n";
    return output;
  }
}
// Example usage:
/**
 * Demonstrates the full pipeline: build a context, generate the lookup
 * tables, render them as Zig source and print the result.
 */
async function example(): Promise<void> {
  // Example context that maps codepoints to their category
  const ctx: Context<string> = {
    get: async (cp: number) => {
      // This would normally look up the actual Unicode category
      return "Lu";
    },
    eql: (a: string, b: string) => a === b,
  };
  const generator = new Generator(ctx);
  const tables = await generator.generate();
  // Generate Zig code. writeZig takes the name of the emitted Zig function
  // as its first argument, followed by the tables and the element formatter.
  const zigCode = Generator.writeZig("Tables", tables, (elem: string) => `"${elem}"`);
  console.log(zigCode);
}
export { Generator, type Context, type Tables };