src/Tools/WWW_Find/unicode_symbols.ML
author haftmann
Wed May 05 18:25:34 2010 +0200 (2010-05-05)
changeset 36692 54b64d4ad524
parent 33823 24090eae50b6
child 36960 01594f816e3a
permissions -rw-r--r--
farewell to old-style mem infixes -- type inference in situations with mem_int and mem_string should provide enough information to resolve the type of (op =)
     1 (*  Title:      Tools/WWW_Find/unicode_symbols.ML
     2     Author:     Timothy Bourke, NICTA
     3 
     4 Parse the ISABELLE_HOME/etc/symbols file.
     5 *)
     6 
     7 signature UNICODE_SYMBOLS =
     8 sig
     9 val symbol_to_unicode : string -> int option
    10 val abbrev_to_unicode : string -> int option
    11 val unicode_to_symbol : int -> string option
    12 val unicode_to_abbrev : int -> string option
    13 val utf8_to_symbols   : string -> string
    14 val utf8              : int list -> string
    15 end;
    16 
    17 structure UnicodeSymbols : UNICODE_SYMBOLS =
    18 struct
    19 
    20 local (* Lexer *)
    21 
    22 open Basic_Symbol_Pos
    23 
    24 val keywords =
    25   Scan.make_lexicon (map Symbol.explode ["code", "font", "abbrev", ":"]);
    26 
    27 datatype token_kind =
    28   Symbol | AsciiSymbol | Keyword | Name | Code | Space | Comment | EOF;
    29 
    30 datatype token = Token of token_kind * string * Position.range;
    31 
    32 fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss);
    33 
    34 in
    35 
    36 fun mk_eof pos = Token (EOF, "", (pos, Position.none));
    37 
    38 fun str_of_token (Token (_, s, _)) = s;
    39 
    40 fun pos_of_token (Token (_, _, (pos, _))) = pos;
    41 
    42 fun int_of_code (Token (Code, s, _)) = #value (Lexicon.read_xnum s)
    43   | int_of_code _ = error "internal error in UnicodeSymbols.int_of_code"
    44 
    45 fun is_proper (Token (Space, _, _)) = false
    46   | is_proper (Token (Comment, _, _)) = false
    47   | is_proper _ = true;
    48 
    49 fun is_keyword kw (Token (Keyword, kw', _)) = (kw = kw')
    50   | is_keyword _ _ = false;
    51 
    52 fun is_ascii_sym (Token (AsciiSymbol, _, _)) = true
    53   | is_ascii_sym _ = false;
    54 
    55 fun is_hex_code (Token (Code, _, _)) = true
    56   | is_hex_code _ = false;
    57 
    58 fun is_symbol (Token (Symbol, _, _)) = true
    59   | is_symbol _ = false;
    60 
    61 fun is_name (Token (Name, _, _)) = true
    62   | is_name _ = false;
    63 
    64 fun is_eof (Token (EOF, _, _)) = true
    65   | is_eof _ = false;
    66 
    67 fun end_position_of (Token (_, _, (_, epos))) = epos;
    68 
    69 val is_space = symbol #> (fn s => Symbol.is_blank s andalso s <> "\n");
    70 val scan_space =
    71   (Scan.many1 is_space @@@ Scan.optional ($$$ "\n") []
    72    ||
    73    Scan.many is_space @@@ ($$$ "\n")) >> token Space;
    74 
    75 val scan_code = Lexicon.scan_hex #>> token Code;
    76 
    77 val scan_ascii_symbol = Scan.one
    78   ((fn x => Symbol.is_ascii x andalso
    79       not (Symbol.is_ascii_letter x
    80            orelse Symbol.is_ascii_digit x
    81            orelse Symbol.is_ascii_blank x)) o symbol)
    82   -- Scan.many (not o Symbol.is_ascii_blank o symbol)
    83   >> (token AsciiSymbol o op ::);
    84 
    85 fun not_contains xs c = not (member (op =) (explode xs) (symbol c));
    86 val scan_comment =
    87   $$$ "#" |-- (Scan.many (not_contains "\n") @@@ ($$$ "\n"))
    88   >> token Comment;
    89 
    90 fun tokenize syms =
    91   let
    92     val scanner =
    93       Scan.one (Symbol.is_symbolic o symbol) >> (token Symbol o single) ||
    94       scan_comment ||
    95       scan_space ||
    96       scan_code ||
    97       Scan.literal keywords >> token Keyword ||
    98       scan_ascii_symbol ||
    99       Lexicon.scan_id >> token Name;
   100     val scan_token = Symbol_Pos.!!! "Lexical error" (Scan.bulk scanner);
   101   in
   102     (case Scan.error (Scan.finite Symbol_Pos.stopper scan_token) syms of
   103       (toks, []) => toks
   104     | (_, ss) => error ("Lexical error at: " ^ Symbol_Pos.content ss
   105                         ^ Position.str_of (#1 (Symbol_Pos.range ss))))
   106   end;
   107 
   108 val stopper =
   109   Scan.stopper
   110     (fn [] => mk_eof Position.none
   111       | toks => mk_eof (end_position_of (List.last toks))) is_eof;
   112 
   113 end;
   114 
   115 local (* Parser *)
   116 
   117 structure P = OuterParse;
   118 
   119 fun $$$ kw = Scan.one (is_keyword kw) >> str_of_token;
   120 val hex_code = Scan.one is_hex_code >> int_of_code;
   121 val ascii_sym = Scan.one is_ascii_sym >> str_of_token;
   122 val symbol = Scan.one is_symbol >> (fn t => (str_of_token t, pos_of_token t));
   123 val name = Scan.one is_name >> str_of_token;
   124 
   125 val unicode = $$$ "code" -- $$$ ":" |-- hex_code;
   126 val font = Scan.option ($$$ "font" -- $$$ ":" |-- name);
   127 val abbr = Scan.option ($$$ "abbrev" -- $$$ ":"
   128                         |-- (ascii_sym || $$$ ":" || name));
   129 
   130 in
   131 
   132 val line = (symbol -- unicode --| font -- abbr) >> P.triple1;
   133 
   134 val symbols_path =
   135   [getenv "ISABELLE_HOME", "etc", "symbols"]
   136   |> map Path.explode
   137   |> Path.appends;
   138 
   139 end;
   140 
   141 local (* build tables *)
   142 
   143 fun add_entries ((fromsym, fromabbr, tosym, toabbr), ((sym, pos), uni, oabbr)) =
   144   (case oabbr of
   145      NONE =>
   146        (Symtab.update_new (sym, uni) fromsym,
   147         fromabbr,
   148         Inttab.update (uni, sym) tosym,
   149         toabbr)
   150    | SOME abbr =>
   151        (Symtab.update_new (sym, uni) fromsym,
   152         Symtab.update_new (abbr, uni) fromabbr,
   153         Inttab.update (uni, sym) tosym,
   154         Inttab.update (uni, abbr) toabbr))
   155   handle Symtab.DUP sym => error ("Duplicate at" ^ Position.str_of pos)
   156        | Inttab.DUP sym => error ("Duplicate code at" ^ Position.str_of pos);
   157 
   158 in
   159 
   160 fun read_symbols path =
   161   let
   162     val parsed_lines =
   163       File.read path
   164       |> (fn x => Symbol_Pos.explode (x, Position.file (Path.implode path)))
   165       |> tokenize
   166       |> filter is_proper
   167       |> Scan.finite stopper (Scan.repeat line)
   168       |> fst;
   169   in
   170     Library.foldl add_entries
   171       ((Symtab.empty, Symtab.empty, Inttab.empty, Inttab.empty),
   172        parsed_lines)
   173   end;
   174 
   175 end;
   176 
   177 local
   178 val (fromsym, fromabbr, tosym, toabbr) = read_symbols symbols_path;
   179 in
   180 val symbol_to_unicode = Symtab.lookup fromsym;
   181 val abbrev_to_unicode = Symtab.lookup fromabbr;
   182 val unicode_to_symbol = Inttab.lookup tosym;
   183 val unicode_to_abbrev = Inttab.lookup toabbr;
   184 end;
   185 
   186 fun utf8_to_symbols utf8str =
   187   let
   188     val get_next =
   189       Substring.getc
   190       #> Option.map (apfst Byte.charToByte);
   191     val wstr = String.str o Byte.byteToChar;
   192     val replacement_char = "\<questiondown>";
   193 
   194     fun word_to_symbol w =
   195       (case (unicode_to_symbol o Word32.toInt) w of
   196          NONE => "?"
   197        | SOME s => s);
   198 
   199     fun andb32 (w1, w2) =
   200       Word8.andb(w1, w2)
   201       |> Word8.toLarge
   202       |> Word32.fromLarge;
   203 
   204     fun read_next (ss, 0, c) = (word_to_symbol c, ss)
   205       | read_next (ss, n, c) =
   206           (case get_next ss of
   207              NONE => (replacement_char, ss)
   208            | SOME (w, ss') =>
   209                if Word8.andb (w, 0wxc0) <> 0wx80
   210                then (replacement_char, ss')
   211                else
   212                  let
   213                    val w' = (Word8.andb (w, 0wx3f));
   214                    val bw = (Word32.fromLarge o Word8.toLarge) w';
   215                    val c' = Word32.<< (c, 0wx6);
   216                  in read_next (ss', n - 1, Word32.orb (c', bw)) end);
   217 
   218     fun do_char (w, ss) =
   219       if Word8.andb (w, 0wx80) = 0wx00
   220       then (wstr w, ss)
   221       else if Word8.andb (w, 0wx60) = 0wx40
   222       then read_next (ss, 1, andb32 (w, 0wx1f))
   223       else if Word8.andb (w, 0wxf0) = 0wxe0
   224       then read_next (ss, 2, andb32 (w, 0wx0f))
   225       else if Word8.andb (w, 0wxf8) = 0wxf0
   226       then read_next (ss, 3, andb32 (w, 0wx07))
   227       else (replacement_char, ss);
   228 
   229     fun read (rs, ss) =
   230       (case Option.map do_char (get_next ss) of
   231          NONE => (implode o rev) rs
   232        | SOME (s, ss') => read (s::rs, ss'));
   233   in read ([], Substring.full utf8str) end;
   234 
   235 local
   236 
   237 fun consec n =
   238   fn w => (
   239     Word32.>> (w, Word.fromInt (n * 6))
   240     |> (curry Word32.andb) 0wx3f
   241     |> (curry Word32.orb) 0wx80
   242     |> Word8.fromLargeWord o Word32.toLargeWord);
   243 
   244 fun stamp n =
   245   fn w => (
   246     Word32.>> (w, Word.fromInt (n * 6))
   247     |> (curry Word32.andb) (Word32.>> (0wx000000ff, Word.fromInt (n + 2)))
   248     |> (curry Word32.orb) (Word32.<< (0wxffffffff, Word.fromInt (7 - n)))
   249     |> Word8.fromLargeWord o Word32.toLargeWord);
   250 
   251 fun to_utf8_bytes i =
   252   if i <= 0x007f
   253   then [Word8.fromInt i]
   254   else let
   255     val w = Word32.fromInt i;
   256   in
   257     if i < 0x07ff
   258     then [stamp 1 w, consec 0 w]
   259     else if i < 0xffff
   260     then [stamp 2 w, consec 1 w, consec 0 w]
   261     else if i < 0x10ffff
   262     then [stamp 3 w, consec 2 w, consec 1 w, consec 0 w]
   263     else []
   264   end;
   265 
   266 in
   267 
   268 fun utf8 is =
   269   map to_utf8_bytes is
   270   |> flat
   271   |> Word8Vector.fromList
   272   |> Byte.bytesToString;
   273 
   274 end
   275 
   276 end;
   277