actually decode/encode symbols;
authorwenzelm
Thu, 10 Mar 2022 20:16:19 +0100
changeset 75261 ed83c58c612a
parent 75260 5a15a2ceafdf
child 75262 ec62c5401522
actually decode/encode symbols;
src/Tools/VSCode/patches/isabelle_encoding.ts
--- a/src/Tools/VSCode/patches/isabelle_encoding.ts	Thu Mar 10 12:34:02 2022 +0100
+++ b/src/Tools/VSCode/patches/isabelle_encoding.ts	Thu Mar 10 20:16:19 2022 +0100
@@ -6,26 +6,7 @@
 'use strict';
 
 
-/* defined symbols */
-
-export interface Symbol_Code {
-  symbol: string;
-  code: number;
-}
-
-export const static_symbols: Symbol_Code[] = [/*symbols*/];
-
-export const symbols_decode: Map<string, string> =
-  new Map(static_symbols.map((s: Symbol_Code) => [s.symbol, String.fromCharCode(s.code)]));
-
-export const symbols_encode: Map<string, string> =
-  new Map(static_symbols.map((s: Symbol_Code) => [String.fromCharCode(s.code), s.symbol]));
-
-
-/* encoding */
-
-export const ENCODING = 'utf8isabelle';
-export const LABEL = 'UTF-8-Isabelle';
+/* VSCode interfaces */
 
 export interface Options {
   stripBOM?: boolean;
@@ -33,36 +14,191 @@
   defaultEncoding?: string;
 }
 
-export interface IEncoderStream {
-  write(input: string): Uint8Array;
-  end(): Uint8Array | undefined;
-}
-
 export interface IDecoderStream {
   write(input: Uint8Array): string;
   end(): string | undefined;
 }
 
-export function getEncoder(): IEncoderStream {
-  const utf8_encoder = new TextEncoder();
+export interface IEncoderStream {
+  write(input: string): Uint8Array;
+  end(): Uint8Array | undefined;
+}
+
+
+/* ASCII characters */
+
+function is_ascii(c: number): boolean {
+  return 0 <= c && c <= 127;
+}
+
+function is_ascii_letter(c: number): boolean {
+  return 65 <= c && c <= 90 || 97 <= c && c <= 122;
+}
+
+function is_ascii_digit(c: number): boolean {
+  return 48 <= c && c <= 57;
+}
+
+function is_ascii_quasi(c: number): boolean {
+  return c === 95 || c === 39;
+}
+
+function is_ascii_letdig(c: number): boolean {
+  return is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c);
+}
+
+
+/* string buffer */
+
+class String_Buffer {
+  state: string[]
+
+  constructor() {
+    this.state = [];
+  }
+
+  add(s: string) {
+    this.state.push(s);
+  }
+
+  end(): string {
+    const s = this.state.join('');
+    this.state = [];
+    return s;
+  }
+}
+
+
+/* Isabelle symbol codes */
+
+function codepoint_string(c: number): string {
+  return String.fromCodePoint(c);
+}
+
+const backslash: number = 92;
+const caret: number = 94;
+const bg_symbol: number = 60;
+const en_symbol: number = 62;
+
+interface Symbol_Code {
+  symbol: string;
+  code: number;
+}
+
+export class Symbol_Codes {
+  decode_table: Map<string, string>;
+  encode_table: Map<string, string>;
+
+  constructor(symbols: Symbol_Code[]) {
+    this.decode_table = new Map(symbols.map(s => [s.symbol, codepoint_string(s.code)]));
+    this.encode_table = new Map(symbols.map(s => [codepoint_string(s.code), s.symbol]));
+  }
+
+  recode(str: string, do_encode: boolean): string {
+    function ok(i: number): boolean {
+      return 0 <= i && i < str.length;
+    }
+    function char(i: number): number {
+      return ok(i) ? str.charCodeAt(i) : 0;
+    }
+    function maybe_char(c: number, i: number): number {
+      return char(i) === c ? i + 1 : i;
+    }
+
+    function many_ascii_letdig(i: number): number {
+      let k = i;
+      while (is_ascii_letdig(char(k))) { k += 1 };
+      return k;
+    }
+    function maybe_ascii_id(i: number): number {
+      return is_ascii_letter(char(i)) ? many_ascii_letdig(i + 1) : i
+    }
+
+    function scan_ascii(start: number): string {
+      let i = start;
+      while (ok(i)) {
+        const a = char(i);
+        const b = char(i + 1);
+        if (!is_ascii(a) || a === backslash && b === bg_symbol) { break; }
+        else { i += 1; }
+      }
+      return str.substring(start, i);
+    }
+
+    function scan_symbol(i: number): string {
+      const a = char(i);
+      const b = char(i + 1);
+      if (a === backslash && b === bg_symbol) {
+        const j = maybe_char(en_symbol, maybe_ascii_id(maybe_char(caret, i + 2)));
+        return str.substring(i, j);
+      }
+      else { return ''; }
+    }
+
+    function scan_codepoint(i: number): string {
+      const c = str.codePointAt(i);
+      return c === undefined ? '' : codepoint_string(c);
+    }
+
+
+    const table = do_encode ? this.encode_table : this.decode_table;
+    const result = new String_Buffer();
+    let i = 0;
+    while (ok(i)) {
+      const ascii = scan_ascii(i)
+      if (ascii) {
+        result.add(ascii)
+        i += ascii.length
+      }
+      else {
+        const s = scan_symbol(i) || scan_codepoint(i);
+        if (s) {
+          const r = table.get(s);
+          result.add(r === undefined ? s : r);
+          i += s.length;
+        }
+      }
+    }
+    return result.end();
+  }
+}
+
+const symbol_codes: Symbol_Codes = new Symbol_Codes([/*symbols*/]);
+
+
+/* main API */
+
+export const ENCODING = 'utf8isabelle';
+export const LABEL = 'UTF-8-Isabelle';
+
+export function getDecoder(): IDecoderStream {
+  const utf8_decoder = new TextDecoder();
+  const buffer = new String_Buffer();
   return {
-    write(input: string): Uint8Array {
-      return utf8_encoder.encode(input);
+    write(input: Uint8Array): string {
+      buffer.add(utf8_decoder.decode(input, { stream: true }));
+      return '';
     },
-    end(): Uint8Array | undefined {
-      return utf8_encoder.encode();
+    end(): string | undefined {
+      buffer.add(utf8_decoder.decode());
+      const s = buffer.end();
+      return symbol_codes.recode(s, false);
     }
   };
 }
 
-export function getDecoder(): IDecoderStream {
-  const utf8TextDecoder = new TextDecoder();
+export function getEncoder(): IEncoderStream {
+  const utf8_encoder = new TextEncoder();
+  const empty = new Uint8Array(0);
+  const buffer = new String_Buffer();
   return {
-    write(input: Uint8Array): string {
-      return utf8TextDecoder.decode(input, { stream: true });
+    write(input: string): Uint8Array {
+      buffer.add(input);
+      return empty;
     },
-    end(): string | undefined {
-      return utf8TextDecoder.decode();
+    end(): Uint8Array | undefined {
+      const s = buffer.end();
+      return utf8_encoder.encode(symbol_codes.recode(s, true));
     }
-  };
+  }
 }