author | wenzelm |
Wed, 22 Jan 2025 22:22:19 +0100 | |
changeset 81954 | 6f2bcdfa9a19 |
parent 75261 | ed83c58c612a |
permissions | -rw-r--r-- |
75240 | 1 |
/* Author: Makarius |
2 |
||
75246
f32e5d4cf1a3
patch for vscode encoding "UTF-8-Isabelle": clone of "utf8", no symbols yet;
wenzelm
parents:
75245
diff
changeset
|
3 |
UTF-8-Isabelle symbol encoding: for use inside VSCode. |
75240 | 4 |
*/ |
5 |
||
6 |
'use strict'; |
|
7 |
||
8 |
||
75261 | 9 |
/* VSCode interfaces */ |
75240 | 10 |
|
11 |
export interface Options { |
|
12 |
stripBOM?: boolean; |
|
13 |
addBOM?: boolean; |
|
14 |
defaultEncoding?: string; |
|
15 |
} |
|
16 |
||
75245 | 17 |
export interface IDecoderStream { |
18 |
write(input: Uint8Array): string; |
|
19 |
end(): string | undefined; |
|
75240 | 20 |
} |
21 |
||
75261 | 22 |
export interface IEncoderStream { |
23 |
write(input: string): Uint8Array; |
|
24 |
end(): Uint8Array | undefined; |
|
25 |
} |
|
26 |
||
27 |
||
28 |
/* ASCII characters */ |
|
29 |
||
30 |
function is_ascii(c: number): boolean { |
|
31 |
return 0 <= c && c <= 127; |
|
32 |
} |
|
33 |
||
34 |
function is_ascii_letter(c: number): boolean { |
|
35 |
return 65 <= c && c <= 90 || 97 <= c && c <= 122; |
|
36 |
} |
|
37 |
||
38 |
function is_ascii_digit(c: number): boolean { |
|
39 |
return 48 <= c && c <= 57; |
|
40 |
} |
|
41 |
||
42 |
function is_ascii_quasi(c: number): boolean { |
|
43 |
return c === 95 || c === 39; |
|
44 |
} |
|
45 |
||
46 |
function is_ascii_letdig(c: number): boolean { |
|
47 |
return is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c); |
|
48 |
} |
|
49 |
||
50 |
||
51 |
/* string buffer */ |
|
52 |
||
53 |
class String_Buffer { |
|
54 |
state: string[] |
|
55 |
||
56 |
constructor() { |
|
57 |
this.state = []; |
|
58 |
} |
|
59 |
||
60 |
add(s: string) { |
|
61 |
this.state.push(s); |
|
62 |
} |
|
63 |
||
64 |
end(): string { |
|
65 |
const s = this.state.join(''); |
|
66 |
this.state = []; |
|
67 |
return s; |
|
68 |
} |
|
69 |
} |
|
70 |
||
71 |
||
72 |
/* Isabelle symbol codes */ |
|
73 |
||
74 |
function codepoint_string(c: number): string { |
|
75 |
return String.fromCodePoint(c); |
|
76 |
} |
|
77 |
||
78 |
const backslash: number = 92; |
|
79 |
const caret: number = 94; |
|
80 |
const bg_symbol: number = 60; |
|
81 |
const en_symbol: number = 62; |
|
82 |
||
83 |
interface Symbol_Code { |
|
84 |
symbol: string; |
|
85 |
code: number; |
|
86 |
} |
|
87 |
||
88 |
export class Symbol_Codes { |
|
89 |
decode_table: Map<string, string>; |
|
90 |
encode_table: Map<string, string>; |
|
91 |
||
92 |
constructor(symbols: Symbol_Code[]) { |
|
93 |
this.decode_table = new Map(symbols.map(s => [s.symbol, codepoint_string(s.code)])); |
|
94 |
this.encode_table = new Map(symbols.map(s => [codepoint_string(s.code), s.symbol])); |
|
95 |
} |
|
96 |
||
97 |
recode(str: string, do_encode: boolean): string { |
|
98 |
function ok(i: number): boolean { |
|
99 |
return 0 <= i && i < str.length; |
|
100 |
} |
|
101 |
function char(i: number): number { |
|
102 |
return ok(i) ? str.charCodeAt(i) : 0; |
|
103 |
} |
|
104 |
function maybe_char(c: number, i: number): number { |
|
105 |
return char(i) === c ? i + 1 : i; |
|
106 |
} |
|
107 |
||
108 |
function many_ascii_letdig(i: number): number { |
|
109 |
let k = i; |
|
110 |
while (is_ascii_letdig(char(k))) { k += 1 }; |
|
111 |
return k; |
|
112 |
} |
|
113 |
function maybe_ascii_id(i: number): number { |
|
114 |
return is_ascii_letter(char(i)) ? many_ascii_letdig(i + 1) : i |
|
115 |
} |
|
116 |
||
117 |
function scan_ascii(start: number): string { |
|
118 |
let i = start; |
|
119 |
while (ok(i)) { |
|
120 |
const a = char(i); |
|
121 |
const b = char(i + 1); |
|
122 |
if (!is_ascii(a) || a === backslash && b === bg_symbol) { break; } |
|
123 |
else { i += 1; } |
|
124 |
} |
|
125 |
return str.substring(start, i); |
|
126 |
} |
|
127 |
||
128 |
function scan_symbol(i: number): string { |
|
129 |
const a = char(i); |
|
130 |
const b = char(i + 1); |
|
131 |
if (a === backslash && b === bg_symbol) { |
|
132 |
const j = maybe_char(en_symbol, maybe_ascii_id(maybe_char(caret, i + 2))); |
|
133 |
return str.substring(i, j); |
|
134 |
} |
|
135 |
else { return ''; } |
|
136 |
} |
|
137 |
||
138 |
function scan_codepoint(i: number): string { |
|
139 |
const c = str.codePointAt(i); |
|
140 |
return c === undefined ? '' : codepoint_string(c); |
|
141 |
} |
|
142 |
||
143 |
||
144 |
const table = do_encode ? this.encode_table : this.decode_table; |
|
145 |
const result = new String_Buffer(); |
|
146 |
let i = 0; |
|
147 |
while (ok(i)) { |
|
148 |
const ascii = scan_ascii(i) |
|
149 |
if (ascii) { |
|
150 |
result.add(ascii) |
|
151 |
i += ascii.length |
|
152 |
} |
|
153 |
else { |
|
154 |
const s = scan_symbol(i) || scan_codepoint(i); |
|
155 |
if (s) { |
|
156 |
const r = table.get(s); |
|
157 |
result.add(r === undefined ? s : r); |
|
158 |
i += s.length; |
|
159 |
} |
|
160 |
} |
|
161 |
} |
|
162 |
return result.end(); |
|
163 |
} |
|
164 |
} |
|
165 |
||
166 |
const symbol_codes: Symbol_Codes = new Symbol_Codes([/*symbols*/]); |
|
167 |
||
168 |
||
169 |
/* main API */ |
|
170 |
||
171 |
export const ENCODING = 'utf8isabelle'; |
|
172 |
export const LABEL = 'UTF-8-Isabelle'; |
|
173 |
||
174 |
export function getDecoder(): IDecoderStream { |
|
175 |
const utf8_decoder = new TextDecoder(); |
|
176 |
const buffer = new String_Buffer(); |
|
75245 | 177 |
return { |
75261 | 178 |
write(input: Uint8Array): string { |
179 |
buffer.add(utf8_decoder.decode(input, { stream: true })); |
|
180 |
return ''; |
|
75245 | 181 |
}, |
75261 | 182 |
end(): string | undefined { |
183 |
buffer.add(utf8_decoder.decode()); |
|
184 |
const s = buffer.end(); |
|
185 |
return symbol_codes.recode(s, false); |
|
75245 | 186 |
} |
187 |
}; |
|
75240 | 188 |
} |
189 |
||
75261 | 190 |
export function getEncoder(): IEncoderStream { |
191 |
const utf8_encoder = new TextEncoder(); |
|
192 |
const empty = new Uint8Array(0); |
|
193 |
const buffer = new String_Buffer(); |
|
75245 | 194 |
return { |
75261 | 195 |
write(input: string): Uint8Array { |
196 |
buffer.add(input); |
|
197 |
return empty; |
|
75245 | 198 |
}, |
75261 | 199 |
end(): Uint8Array | undefined { |
200 |
const s = buffer.end(); |
|
201 |
return utf8_encoder.encode(symbol_codes.recode(s, true)); |
|
75245 | 202 |
} |
75261 | 203 |
} |
75240 | 204 |
} |