src/Tools/WWW_Find/unicode_symbols.ML
author haftmann
Wed May 05 18:25:34 2010 +0200 (2010-05-05)
changeset 36692 54b64d4ad524
parent 33823 24090eae50b6
child 36960 01594f816e3a
permissions -rw-r--r--
farewell to old-style mem infixes -- type inference in situations with mem_int and mem_string should provide enough information to resolve the type of (op =)
wenzelm@33823
     1
(*  Title:      Tools/WWW_Find/unicode_symbols.ML
kleing@33817
     2
    Author:     Timothy Bourke, NICTA
kleing@33817
     3
wenzelm@33823
     4
Parse the ISABELLE_HOME/etc/symbols file.
kleing@33817
     5
*)
kleing@33817
     6
kleing@33817
     7
signature UNICODE_SYMBOLS =
kleing@33817
     8
sig
kleing@33817
     9
val symbol_to_unicode : string -> int option
kleing@33817
    10
val abbrev_to_unicode : string -> int option
kleing@33817
    11
val unicode_to_symbol : int -> string option
kleing@33817
    12
val unicode_to_abbrev : int -> string option
kleing@33817
    13
val utf8_to_symbols   : string -> string
kleing@33817
    14
val utf8              : int list -> string
kleing@33817
    15
end;
kleing@33817
    16
wenzelm@33823
    17
structure UnicodeSymbols : UNICODE_SYMBOLS =
kleing@33817
    18
struct
kleing@33817
    19
kleing@33817
    20
local (* Lexer *)
kleing@33817
    21
kleing@33817
    22
open Basic_Symbol_Pos
kleing@33817
    23
kleing@33817
    24
val keywords =
kleing@33817
    25
  Scan.make_lexicon (map Symbol.explode ["code", "font", "abbrev", ":"]);
kleing@33817
    26
kleing@33817
    27
datatype token_kind =
kleing@33817
    28
  Symbol | AsciiSymbol | Keyword | Name | Code | Space | Comment | EOF;
kleing@33817
    29
kleing@33817
    30
datatype token = Token of token_kind * string * Position.range;
kleing@33817
    31
kleing@33817
    32
fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss);
kleing@33817
    33
kleing@33817
    34
in
kleing@33817
    35
kleing@33817
    36
fun mk_eof pos = Token (EOF, "", (pos, Position.none));
kleing@33817
    37
kleing@33817
    38
fun str_of_token (Token (_, s, _)) = s;
kleing@33817
    39
kleing@33817
    40
fun pos_of_token (Token (_, _, (pos, _))) = pos;
kleing@33817
    41
kleing@33817
    42
fun int_of_code (Token (Code, s, _)) = #value (Lexicon.read_xnum s)
kleing@33817
    43
  | int_of_code _ = error "internal error in UnicodeSymbols.int_of_code"
kleing@33817
    44
kleing@33817
    45
fun is_proper (Token (Space, _, _)) = false
kleing@33817
    46
  | is_proper (Token (Comment, _, _)) = false
kleing@33817
    47
  | is_proper _ = true;
kleing@33817
    48
kleing@33817
    49
fun is_keyword kw (Token (Keyword, kw', _)) = (kw = kw')
kleing@33817
    50
  | is_keyword _ _ = false;
kleing@33817
    51
kleing@33817
    52
fun is_ascii_sym (Token (AsciiSymbol, _, _)) = true
kleing@33817
    53
  | is_ascii_sym _ = false;
kleing@33817
    54
kleing@33817
    55
fun is_hex_code (Token (Code, _, _)) = true
kleing@33817
    56
  | is_hex_code _ = false;
kleing@33817
    57
kleing@33817
    58
fun is_symbol (Token (Symbol, _, _)) = true
kleing@33817
    59
  | is_symbol _ = false;
kleing@33817
    60
kleing@33817
    61
fun is_name (Token (Name, _, _)) = true
kleing@33817
    62
  | is_name _ = false;
kleing@33817
    63
kleing@33817
    64
fun is_eof (Token (EOF, _, _)) = true
kleing@33817
    65
  | is_eof _ = false;
kleing@33817
    66
kleing@33817
    67
fun end_position_of (Token (_, _, (_, epos))) = epos;
kleing@33817
    68
kleing@33817
    69
val is_space = symbol #> (fn s => Symbol.is_blank s andalso s <> "\n");
kleing@33817
    70
val scan_space =
kleing@33817
    71
  (Scan.many1 is_space @@@ Scan.optional ($$$ "\n") []
kleing@33817
    72
   ||
kleing@33817
    73
   Scan.many is_space @@@ ($$$ "\n")) >> token Space;
kleing@33817
    74
kleing@33817
    75
val scan_code = Lexicon.scan_hex #>> token Code;
kleing@33817
    76
kleing@33817
    77
val scan_ascii_symbol = Scan.one
kleing@33817
    78
  ((fn x => Symbol.is_ascii x andalso
kleing@33817
    79
      not (Symbol.is_ascii_letter x
kleing@33817
    80
           orelse Symbol.is_ascii_digit x
kleing@33817
    81
           orelse Symbol.is_ascii_blank x)) o symbol)
kleing@33817
    82
  -- Scan.many (not o Symbol.is_ascii_blank o symbol)
kleing@33817
    83
  >> (token AsciiSymbol o op ::);
kleing@33817
    84
haftmann@36692
    85
fun not_contains xs c = not (member (op =) (explode xs) (symbol c));
kleing@33817
    86
val scan_comment =
kleing@33817
    87
  $$$ "#" |-- (Scan.many (not_contains "\n") @@@ ($$$ "\n"))
kleing@33817
    88
  >> token Comment;
kleing@33817
    89
kleing@33817
    90
fun tokenize syms =
kleing@33817
    91
  let
kleing@33817
    92
    val scanner =
kleing@33817
    93
      Scan.one (Symbol.is_symbolic o symbol) >> (token Symbol o single) ||
kleing@33817
    94
      scan_comment ||
kleing@33817
    95
      scan_space ||
kleing@33817
    96
      scan_code ||
kleing@33817
    97
      Scan.literal keywords >> token Keyword ||
kleing@33817
    98
      scan_ascii_symbol ||
kleing@33817
    99
      Lexicon.scan_id >> token Name;
kleing@33817
   100
    val scan_token = Symbol_Pos.!!! "Lexical error" (Scan.bulk scanner);
kleing@33817
   101
  in
kleing@33817
   102
    (case Scan.error (Scan.finite Symbol_Pos.stopper scan_token) syms of
kleing@33817
   103
      (toks, []) => toks
kleing@33817
   104
    | (_, ss) => error ("Lexical error at: " ^ Symbol_Pos.content ss
kleing@33817
   105
                        ^ Position.str_of (#1 (Symbol_Pos.range ss))))
kleing@33817
   106
  end;
kleing@33817
   107
kleing@33817
   108
val stopper =
kleing@33817
   109
  Scan.stopper
kleing@33817
   110
    (fn [] => mk_eof Position.none
kleing@33817
   111
      | toks => mk_eof (end_position_of (List.last toks))) is_eof;
kleing@33817
   112
kleing@33817
   113
end;
kleing@33817
   114
kleing@33817
   115
local (* Parser *)
kleing@33817
   116
kleing@33817
   117
structure P = OuterParse;
kleing@33817
   118
kleing@33817
   119
fun $$$ kw = Scan.one (is_keyword kw) >> str_of_token;
kleing@33817
   120
val hex_code = Scan.one is_hex_code >> int_of_code;
kleing@33817
   121
val ascii_sym = Scan.one is_ascii_sym >> str_of_token;
kleing@33817
   122
val symbol = Scan.one is_symbol >> (fn t => (str_of_token t, pos_of_token t));
kleing@33817
   123
val name = Scan.one is_name >> str_of_token;
kleing@33817
   124
kleing@33817
   125
val unicode = $$$ "code" -- $$$ ":" |-- hex_code;
kleing@33817
   126
val font = Scan.option ($$$ "font" -- $$$ ":" |-- name);
kleing@33817
   127
val abbr = Scan.option ($$$ "abbrev" -- $$$ ":"
kleing@33817
   128
                        |-- (ascii_sym || $$$ ":" || name));
kleing@33817
   129
kleing@33817
   130
in
kleing@33817
   131
kleing@33817
   132
val line = (symbol -- unicode --| font -- abbr) >> P.triple1;
kleing@33817
   133
kleing@33817
   134
val symbols_path =
kleing@33817
   135
  [getenv "ISABELLE_HOME", "etc", "symbols"]
kleing@33817
   136
  |> map Path.explode
kleing@33817
   137
  |> Path.appends;
kleing@33817
   138
kleing@33817
   139
end;
kleing@33817
   140
kleing@33817
   141
local (* build tables *)
kleing@33817
   142
kleing@33817
   143
fun add_entries ((fromsym, fromabbr, tosym, toabbr), ((sym, pos), uni, oabbr)) =
kleing@33817
   144
  (case oabbr of
kleing@33817
   145
     NONE =>
kleing@33817
   146
       (Symtab.update_new (sym, uni) fromsym,
kleing@33817
   147
        fromabbr,
kleing@33817
   148
        Inttab.update (uni, sym) tosym,
kleing@33817
   149
        toabbr)
kleing@33817
   150
   | SOME abbr =>
kleing@33817
   151
       (Symtab.update_new (sym, uni) fromsym,
kleing@33817
   152
        Symtab.update_new (abbr, uni) fromabbr,
kleing@33817
   153
        Inttab.update (uni, sym) tosym,
kleing@33817
   154
        Inttab.update (uni, abbr) toabbr))
kleing@33817
   155
  handle Symtab.DUP sym => error ("Duplicate at" ^ Position.str_of pos)
kleing@33817
   156
       | Inttab.DUP sym => error ("Duplicate code at" ^ Position.str_of pos);
kleing@33817
   157
kleing@33817
   158
in
kleing@33817
   159
kleing@33817
   160
fun read_symbols path =
kleing@33817
   161
  let
kleing@33817
   162
    val parsed_lines =
kleing@33817
   163
      File.read path
kleing@33817
   164
      |> (fn x => Symbol_Pos.explode (x, Position.file (Path.implode path)))
kleing@33817
   165
      |> tokenize
kleing@33817
   166
      |> filter is_proper
kleing@33817
   167
      |> Scan.finite stopper (Scan.repeat line)
kleing@33817
   168
      |> fst;
kleing@33817
   169
  in
kleing@33817
   170
    Library.foldl add_entries
kleing@33817
   171
      ((Symtab.empty, Symtab.empty, Inttab.empty, Inttab.empty),
kleing@33817
   172
       parsed_lines)
kleing@33817
   173
  end;
kleing@33817
   174
kleing@33817
   175
end;
kleing@33817
   176
kleing@33817
   177
local
kleing@33817
   178
val (fromsym, fromabbr, tosym, toabbr) = read_symbols symbols_path;
kleing@33817
   179
in
kleing@33817
   180
val symbol_to_unicode = Symtab.lookup fromsym;
kleing@33817
   181
val abbrev_to_unicode = Symtab.lookup fromabbr;
kleing@33817
   182
val unicode_to_symbol = Inttab.lookup tosym;
kleing@33817
   183
val unicode_to_abbrev = Inttab.lookup toabbr;
kleing@33817
   184
end;
kleing@33817
   185
kleing@33817
   186
fun utf8_to_symbols utf8str =
kleing@33817
   187
  let
kleing@33817
   188
    val get_next =
kleing@33817
   189
      Substring.getc
kleing@33817
   190
      #> Option.map (apfst Byte.charToByte);
kleing@33817
   191
    val wstr = String.str o Byte.byteToChar;
kleing@33817
   192
    val replacement_char = "\<questiondown>";
kleing@33817
   193
kleing@33817
   194
    fun word_to_symbol w =
kleing@33817
   195
      (case (unicode_to_symbol o Word32.toInt) w of
kleing@33817
   196
         NONE => "?"
kleing@33817
   197
       | SOME s => s);
kleing@33817
   198
kleing@33817
   199
    fun andb32 (w1, w2) =
kleing@33817
   200
      Word8.andb(w1, w2)
kleing@33817
   201
      |> Word8.toLarge
kleing@33817
   202
      |> Word32.fromLarge;
kleing@33817
   203
kleing@33817
   204
    fun read_next (ss, 0, c) = (word_to_symbol c, ss)
kleing@33817
   205
      | read_next (ss, n, c) =
kleing@33817
   206
          (case get_next ss of
kleing@33817
   207
             NONE => (replacement_char, ss)
kleing@33817
   208
           | SOME (w, ss') =>
kleing@33817
   209
               if Word8.andb (w, 0wxc0) <> 0wx80
kleing@33817
   210
               then (replacement_char, ss')
kleing@33817
   211
               else
kleing@33817
   212
                 let
kleing@33817
   213
                   val w' = (Word8.andb (w, 0wx3f));
kleing@33817
   214
                   val bw = (Word32.fromLarge o Word8.toLarge) w';
kleing@33817
   215
                   val c' = Word32.<< (c, 0wx6);
kleing@33817
   216
                 in read_next (ss', n - 1, Word32.orb (c', bw)) end);
kleing@33817
   217
kleing@33817
   218
    fun do_char (w, ss) =
kleing@33817
   219
      if Word8.andb (w, 0wx80) = 0wx00
kleing@33817
   220
      then (wstr w, ss)
kleing@33817
   221
      else if Word8.andb (w, 0wx60) = 0wx40
kleing@33817
   222
      then read_next (ss, 1, andb32 (w, 0wx1f))
kleing@33817
   223
      else if Word8.andb (w, 0wxf0) = 0wxe0
kleing@33817
   224
      then read_next (ss, 2, andb32 (w, 0wx0f))
kleing@33817
   225
      else if Word8.andb (w, 0wxf8) = 0wxf0
kleing@33817
   226
      then read_next (ss, 3, andb32 (w, 0wx07))
kleing@33817
   227
      else (replacement_char, ss);
kleing@33817
   228
kleing@33817
   229
    fun read (rs, ss) =
kleing@33817
   230
      (case Option.map do_char (get_next ss) of
kleing@33817
   231
         NONE => (implode o rev) rs
kleing@33817
   232
       | SOME (s, ss') => read (s::rs, ss'));
kleing@33817
   233
  in read ([], Substring.full utf8str) end;
kleing@33817
   234
kleing@33817
   235
local
kleing@33817
   236
kleing@33817
   237
fun consec n =
kleing@33817
   238
  fn w => (
kleing@33817
   239
    Word32.>> (w, Word.fromInt (n * 6))
kleing@33817
   240
    |> (curry Word32.andb) 0wx3f
kleing@33817
   241
    |> (curry Word32.orb) 0wx80
kleing@33817
   242
    |> Word8.fromLargeWord o Word32.toLargeWord);
kleing@33817
   243
kleing@33817
   244
fun stamp n =
kleing@33817
   245
  fn w => (
kleing@33817
   246
    Word32.>> (w, Word.fromInt (n * 6))
kleing@33817
   247
    |> (curry Word32.andb) (Word32.>> (0wx000000ff, Word.fromInt (n + 2)))
kleing@33817
   248
    |> (curry Word32.orb) (Word32.<< (0wxffffffff, Word.fromInt (7 - n)))
kleing@33817
   249
    |> Word8.fromLargeWord o Word32.toLargeWord);
kleing@33817
   250
kleing@33817
   251
fun to_utf8_bytes i =
kleing@33817
   252
  if i <= 0x007f
kleing@33817
   253
  then [Word8.fromInt i]
kleing@33817
   254
  else let
kleing@33817
   255
    val w = Word32.fromInt i;
kleing@33817
   256
  in
kleing@33817
   257
    if i < 0x07ff
kleing@33817
   258
    then [stamp 1 w, consec 0 w]
kleing@33817
   259
    else if i < 0xffff
kleing@33817
   260
    then [stamp 2 w, consec 1 w, consec 0 w]
kleing@33817
   261
    else if i < 0x10ffff
kleing@33817
   262
    then [stamp 3 w, consec 2 w, consec 1 w, consec 0 w]
kleing@33817
   263
    else []
kleing@33817
   264
  end;
kleing@33817
   265
kleing@33817
   266
in
kleing@33817
   267
kleing@33817
   268
fun utf8 is =
kleing@33817
   269
  map to_utf8_bytes is
kleing@33817
   270
  |> flat
kleing@33817
   271
  |> Word8Vector.fromList
kleing@33817
   272
  |> Byte.bytesToString;
kleing@33817
   273
kleing@33817
   274
end
kleing@33817
   275
kleing@33817
   276
end;
kleing@33817
   277