src/Pure/Syntax/lexicon.ML
author wenzelm
Mon Jul 09 11:44:20 2007 +0200 (2007-07-09)
changeset 23660 18765718cf62
parent 22686 68496cc300a4
child 23784 75e6b9dd5336
permissions -rw-r--r--
type output = string indicates raw system output;
     1 (*  Title:      Pure/Syntax/lexicon.ML
     2     ID:         $Id$
     3     Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
     4 
     5 Lexer for the inner Isabelle syntax (terms and types).
     6 *)
     7 
     8 signature LEXICON0 =
     9 sig
    10   val is_identifier: string -> bool
    11   val is_ascii_identifier: string -> bool
    12   val is_tid: string -> bool
    13   val implode_xstr: string list -> string
    14   val explode_xstr: string -> string list
    15   val scan_id: string list -> string * string list
    16   val scan_longid: string list -> string * string list
    17   val scan_var: string list -> string * string list
    18   val scan_tid: string list -> string * string list
    19   val scan_tvar: string list -> string * string list
    20   val scan_nat: string list -> string * string list
    21   val scan_int: string list -> string * string list
    22   val scan_hex: string list -> string * string list
    23   val scan_bin: string list -> string * string list
    24   val read_indexname: string -> indexname
    25   val read_var: string -> term
    26   val read_variable: string -> indexname option
    27   val const: string -> term
    28   val free: string -> term
    29   val var: indexname -> term
    30   val read_nat: string -> int option
    31   val read_int: string -> int option
    32   val read_xnum: string -> {radix: int, leading_zeros: int, value: IntInf.int}
    33   val read_idents: string -> string list
    34   val fixedN: string
    35   val constN: string
    36 end;
    37 
    38 signature LEXICON =
    39 sig
    40   include LEXICON0
    41   val is_xid: string -> bool
    42   datatype token =
    43     Token of string |
    44     IdentSy of string |
    45     LongIdentSy of string |
    46     VarSy of string |
    47     TFreeSy of string |
    48     TVarSy of string |
    49     NumSy of string |
    50     XNumSy of string |
    51     StrSy of string |
    52     EndToken
    53   val idT: typ
    54   val longidT: typ
    55   val varT: typ
    56   val tidT: typ
    57   val tvarT: typ
    58   val terminals: string list
    59   val is_terminal: string -> bool
    60   val str_of_token: token -> string
    61   val display_token: token -> string
    62   val matching_tokens: token * token -> bool
    63   val valued_token: token -> bool
    64   val predef_term: string -> token option
    65   val tokenize: Scan.lexicon -> bool -> string list -> token list
    66 end;
    67 
    68 structure Lexicon: LEXICON =
    69 struct
    70 
    71 
    72 (** is_identifier etc. **)
    73 
    74 val is_identifier = Symbol.is_ident o Symbol.explode;
    75 
    76 fun is_ascii_identifier s =
    77   let val cs = Symbol.explode s
    78   in forall Symbol.is_ascii cs andalso Symbol.is_ident cs end;
    79 
    80 fun is_xid s =
    81   (case Symbol.explode s of
    82     "_" :: cs => Symbol.is_ident cs
    83   | cs => Symbol.is_ident cs);
    84 
    85 fun is_tid s =
    86   (case Symbol.explode s of
    87     "'" :: cs => Symbol.is_ident cs
    88   | _ => false);
    89 
    90 
    91 
    92 (** basic scanners **)
    93 
    94 val scan_letter_letdigs = Scan.one Symbol.is_letter -- Scan.many Symbol.is_letdig >> op ::;
    95 val scan_digits1 = Scan.many1 Symbol.is_digit;
    96 val scan_hex1 = Scan.many1 (Symbol.is_digit orf Symbol.is_hex_letter);
    97 val scan_bin1 = Scan.many1 (fn s => s = "0" orelse s = "1");
    98 
    99 val scan_id = scan_letter_letdigs >> implode;
   100 val scan_longid = scan_id ^^ (Scan.repeat1 ($$ "." ^^ scan_id) >> implode);
   101 val scan_tid = $$ "'" ^^ scan_id;
   102 
   103 val scan_nat = scan_digits1 >> implode;
   104 val scan_int = $$ "-" ^^ scan_nat || scan_nat;
   105 
   106 val scan_hex = $$ "0" ^^ $$ "x" ^^ (scan_hex1 >> implode);
   107 val scan_bin = $$ "0" ^^ $$ "b" ^^ (scan_bin1 >> implode);
   108 
   109 val scan_id_nat = scan_id ^^ Scan.optional ($$ "." ^^ scan_nat) "";
   110 val scan_var = $$ "?" ^^ scan_id_nat;
   111 val scan_tvar = $$ "?" ^^ $$ "'" ^^ scan_id_nat;
   112 
   113 
   114 
   115 (** datatype token **)
   116 
   117 datatype token =
   118   Token of string |
   119   IdentSy of string |
   120   LongIdentSy of string |
   121   VarSy of string |
   122   TFreeSy of string |
   123   TVarSy of string |
   124   NumSy of string |
   125   XNumSy of string |
   126   StrSy of string |
   127   EndToken;
   128 
   129 
   130 (* terminal arguments *)
   131 
   132 val idT = Type ("id", []);
   133 val longidT = Type ("longid", []);
   134 val varT = Type ("var", []);
   135 val tidT = Type ("tid", []);
   136 val tvarT = Type ("tvar", []);
   137 
   138 val terminals = ["id", "longid", "var", "tid", "tvar", "num", "xnum", "xstr"];
   139 val is_terminal = member (op =) terminals;
   140 
   141 
   142 (* str_of_token *)
   143 
   144 fun str_of_token (Token s) = s
   145   | str_of_token (IdentSy s) = s
   146   | str_of_token (LongIdentSy s) = s
   147   | str_of_token (VarSy s) = s
   148   | str_of_token (TFreeSy s) = s
   149   | str_of_token (TVarSy s) = s
   150   | str_of_token (NumSy s) = s
   151   | str_of_token (XNumSy s) = s
   152   | str_of_token (StrSy s) = s
   153   | str_of_token EndToken = "EOF";
   154 
   155 
   156 (* display_token *)
   157 
   158 fun display_token (Token s) = quote s
   159   | display_token (IdentSy s) = "id(" ^ s ^ ")"
   160   | display_token (LongIdentSy s) = "longid(" ^ s ^ ")"
   161   | display_token (VarSy s) = "var(" ^ s ^ ")"
   162   | display_token (TFreeSy s) = "tid(" ^ s ^ ")"
   163   | display_token (TVarSy s) = "tvar(" ^ s ^ ")"
   164   | display_token (NumSy s) = "num(" ^ s ^ ")"
   165   | display_token (XNumSy s) = "xnum(" ^ s ^ ")"
   166   | display_token (StrSy s) = "xstr(" ^ s ^ ")"
   167   | display_token EndToken = "";
   168 
   169 
   170 (* matching_tokens *)
   171 
   172 fun matching_tokens (Token x, Token y) = (x = y)
   173   | matching_tokens (IdentSy _, IdentSy _) = true
   174   | matching_tokens (LongIdentSy _, LongIdentSy _) = true
   175   | matching_tokens (VarSy _, VarSy _) = true
   176   | matching_tokens (TFreeSy _, TFreeSy _) = true
   177   | matching_tokens (TVarSy _, TVarSy _) = true
   178   | matching_tokens (NumSy _, NumSy _) = true
   179   | matching_tokens (XNumSy _, XNumSy _) = true
   180   | matching_tokens (StrSy _, StrSy _) = true
   181   | matching_tokens (EndToken, EndToken) = true
   182   | matching_tokens _ = false;
   183 
   184 
   185 (* valued_token *)
   186 
   187 fun valued_token (Token _) = false
   188   | valued_token (IdentSy _) = true
   189   | valued_token (LongIdentSy _) = true
   190   | valued_token (VarSy _) = true
   191   | valued_token (TFreeSy _) = true
   192   | valued_token (TVarSy _) = true
   193   | valued_token (NumSy _) = true
   194   | valued_token (XNumSy _) = true
   195   | valued_token (StrSy _) = true
   196   | valued_token EndToken = false;
   197 
   198 
   199 (* predef_term *)
   200 
   201 fun predef_term "id" = SOME (IdentSy "id")
   202   | predef_term "longid" = SOME (LongIdentSy "longid")
   203   | predef_term "var" = SOME (VarSy "var")
   204   | predef_term "tid" = SOME (TFreeSy "tid")
   205   | predef_term "tvar" = SOME (TVarSy "tvar")
   206   | predef_term "num" = SOME (NumSy "num")
   207   | predef_term "xnum" = SOME (XNumSy "xnum")
   208   | predef_term "xstr" = SOME (StrSy "xstr")
   209   | predef_term _ = NONE;
   210 
   211 
   212 (* xstr tokens *)
   213 
   214 fun lex_err msg prfx (cs, _) =
   215   "Inner lexical error: " ^ msg ^ " at " ^ quote (prfx ^ Symbol.beginning 10 cs);
   216 
   217 val scan_chr =
   218   $$ "\\" |-- $$ "'" ||
   219   Scan.one (fn s => s <> "\\" andalso s <> "'" andalso Symbol.not_eof s) ||
   220   $$ "'" --| Scan.ahead (~$$ "'");
   221 
   222 val scan_str =
   223   $$ "'" |-- $$ "'" |-- !! (lex_err "missing end of string" "''")
   224     (Scan.repeat scan_chr --| $$ "'" --| $$ "'");
   225 
   226 
   227 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   228 
   229 fun explode_xstr str =
   230   (case Scan.read Symbol.stopper scan_str (Symbol.explode str) of
   231     SOME cs => cs
   232   | _ => error ("Inner lexical error: literal string expected at " ^ quote str));
   233 
   234 
   235 (* nested comments *)
   236 
   237 val scan_cmt =
   238   Scan.depend (fn d => $$ "(" ^^ $$ "*" >> pair (d + 1)) ||
   239   Scan.depend (fn 0 => Scan.fail | d => $$ "*" ^^ $$ ")" >> pair (d - 1)) ||
   240   Scan.lift ($$ "*" --| Scan.ahead (~$$ ")")) ||
   241   Scan.lift (Scan.one (fn s => s <> "*" andalso Symbol.not_eof s));
   242 
   243 val scan_comment =
   244   $$ "(" -- $$ "*" -- !! (lex_err "missing end of comment" "(*")
   245     (Scan.pass 0 (Scan.repeat scan_cmt) -- $$ "*" -- $$ ")")
   246   >> K ();
   247 
   248 
   249 
   250 (** tokenize **)
   251 
   252 fun tokenize lex xids chs =
   253   let
   254     val scan_xid =
   255       if xids then $$ "_" ^^ scan_id || scan_id
   256       else scan_id;
   257 
   258     val scan_num = scan_hex || scan_bin || scan_int;
   259 
   260     val scan_val =
   261       scan_tvar >> pair TVarSy ||
   262       scan_var >> pair VarSy ||
   263       scan_tid >> pair TFreeSy ||
   264       scan_num >> pair NumSy ||
   265       $$ "#" ^^ scan_num >> pair XNumSy ||
   266       scan_longid >> pair LongIdentSy ||
   267       scan_xid >> pair IdentSy;
   268 
   269     val scan_lit = Scan.literal lex >> (pair Token o implode);
   270 
   271     val scan_token =
   272       scan_comment >> K NONE ||
   273       Scan.max (op <= o pairself snd) scan_lit scan_val >> (fn (tk, s) => SOME (tk s)) ||
   274       scan_str >> (SOME o StrSy o implode_xstr) ||
   275       Scan.one Symbol.is_blank >> K NONE;
   276   in
   277     (case Scan.error (Scan.finite Symbol.stopper (Scan.repeat scan_token)) chs of
   278       (toks, []) => map_filter I toks @ [EndToken]
   279     | (_, cs) => error ("Inner lexical error at: " ^ quote (implode cs)))
   280   end;
   281 
   282 
   283 
   284 (** scan variables **)
   285 
   286 (* scan_indexname *)
   287 
   288 local
   289 
   290 fun scan_vname chrs =
   291   let
   292     fun nat n [] = n
   293       | nat n (c :: cs) = nat (n * 10 + (ord c - ord "0")) cs;
   294 
   295     fun idxname cs ds = (implode (rev cs), nat 0 ds);
   296     fun chop_idx [] ds = idxname [] ds
   297       | chop_idx (cs as (_ :: "\\<^isub>" :: _)) ds = idxname cs ds
   298       | chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds
   299       | chop_idx (c :: cs) ds =
   300           if Symbol.is_digit c then chop_idx cs (c :: ds)
   301           else idxname (c :: cs) ds;
   302 
   303     val scan =
   304       scan_letter_letdigs -- Scan.optional ($$ "." |-- scan_digits1 >> nat 0) ~1;
   305   in
   306     (case scan chrs of
   307       ((cs, ~1), cs') => (chop_idx (rev cs) [], cs')
   308     | ((cs, i), cs') => ((implode cs, i), cs'))
   309   end;
   310 
   311 in
   312 
   313 val scan_indexname =
   314      $$ "'" |-- scan_vname >> (fn (x, i) => ("'" ^ x, i))
   315   || scan_vname;
   316 
   317 end;
   318 
   319 
   320 (* indexname *)
   321 
   322 fun read_indexname s =
   323   (case Scan.read Symbol.stopper scan_indexname (Symbol.explode s) of
   324     SOME xi => xi
   325   | _ => error ("Lexical error in variable name: " ^ quote s));
   326 
   327 
   328 (* read_var *)
   329 
   330 fun const c = Const (c, dummyT);
   331 fun free x = Free (x, dummyT);
   332 fun var xi = Var (xi, dummyT);
   333 
   334 fun read_var str =
   335   let
   336     val scan =
   337       $$ "?" |-- scan_indexname --| Scan.ahead (Scan.one Symbol.is_eof) >> var ||
   338       Scan.many Symbol.not_eof >> (free o implode);
   339   in the (Scan.read Symbol.stopper scan (Symbol.explode str)) end;
   340 
   341 
   342 (* read_variable *)
   343 
   344 fun read_variable str =
   345   let val scan = $$ "?" |-- scan_indexname || scan_indexname
   346   in Scan.read Symbol.stopper scan (Symbol.explode str) end;
   347 
   348 
   349 (* specific identifiers *)
   350 
   351 val constN = "\\<^const>";
   352 val fixedN = "\\<^fixed>";
   353 
   354 
   355 (* read numbers *)
   356 
   357 local
   358 
   359 fun nat cs =
   360   Option.map (#1 o Library.read_int)
   361     (Scan.read Symbol.stopper scan_digits1 cs);
   362 
   363 in
   364 
   365 val read_nat = nat o Symbol.explode;
   366 
   367 fun read_int s =
   368   (case Symbol.explode s of
   369     "-" :: cs => Option.map ~ (nat cs)
   370   | cs => nat cs);
   371 
   372 end;
   373 
   374 
   375 (* read_xnum: hex/bin/decimal *)
   376 
   377 local
   378 
   379 val ten = ord "0" + 10;
   380 val a = ord "a";
   381 val A = ord "A";
   382 val _ = a > A orelse error "Bad ASCII";
   383 
   384 fun remap_hex c =
   385   let val x = ord c in
   386     if x >= a then chr (x - a + ten)
   387     else if x >= A then chr (x - A + ten)
   388     else c
   389   end;
   390 
   391 fun leading_zeros ["0"] = 0
   392   | leading_zeros ("0" :: cs) = 1 + leading_zeros cs
   393   | leading_zeros _ = 0;
   394 
   395 in
   396 
   397 fun read_xnum str =
   398   let
   399     val (sign, radix, digs) =
   400       (case Symbol.explode (perhaps (try (unprefix "#")) str) of
   401         "0" :: "x" :: cs => (1, 16, map remap_hex cs)
   402       | "0" :: "b" :: cs => (1, 2, cs)
   403       | "-" :: cs => (~1, 10, cs)
   404       | cs => (1, 10, cs));
   405     val value = IntInf.fromInt sign * #1 (Library.read_intinf radix digs);
   406   in {radix = radix, leading_zeros = leading_zeros digs, value = value} end;
   407 
   408 end;
   409 
   410 
   411 (* read_ident(s) *)
   412 
   413 fun read_idents str =
   414   let
   415     val blanks = Scan.many Symbol.is_blank;
   416     val junk = Scan.many Symbol.not_eof;
   417     val idents = Scan.repeat1 (blanks |-- scan_id --| blanks) -- junk;
   418   in
   419     (case Scan.read Symbol.stopper idents (Symbol.explode str) of
   420       SOME (ids, []) => ids
   421     | SOME (_, bad) => error ("Bad identifier: " ^ quote (implode bad))
   422     | NONE => error ("No identifier found in: " ^ quote str))
   423   end;
   424 
   425 end;