src/Pure/Syntax/lexicon.ML
author wenzelm
Fri Jun 13 21:04:42 2008 +0200 (2008-06-13)
changeset 27195 bbf4cbc69243
parent 27121 38367dbccae5
child 27773 a52166b228b9
permissions -rw-r--r--
map_const: soft version, no failure here;
     1 (*  Title:      Pure/Syntax/lexicon.ML
     2     ID:         $Id$
     3     Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
     4 
     5 Lexer for the inner Isabelle syntax (terms and types).
     6 *)
     7 
     8 signature LEXICON0 =
     9 sig
    10   val is_identifier: string -> bool
    11   val is_ascii_identifier: string -> bool
    12   val is_tid: string -> bool
    13   val implode_xstr: string list -> string
    14   val explode_xstr: string -> string list
    15   val scan_id: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    16   val scan_longid: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    17   val scan_tid: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    18   val scan_nat: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    19   val scan_int: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    20   val scan_hex: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    21   val scan_bin: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    22   val scan_var: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    23   val scan_tvar: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
    24   val read_indexname: string -> indexname
    25   val read_var: string -> term
    26   val read_variable: string -> indexname option
    27   val const: string -> term
    28   val free: string -> term
    29   val var: indexname -> term
    30   val read_nat: string -> int option
    31   val read_int: string -> int option
    32   val read_xnum: string -> {radix: int, leading_zeros: int, value: int}
    33   val fixedN: string
    34   val constN: string
    35 end;
    36 
    37 signature LEXICON =
    38 sig
    39   include LEXICON0
    40   val is_xid: string -> bool
    41   datatype token =
    42     Token of string |
    43     IdentSy of string |
    44     LongIdentSy of string |
    45     VarSy of string |
    46     TFreeSy of string |
    47     TVarSy of string |
    48     NumSy of string |
    49     XNumSy of string |
    50     StrSy of string |
    51     EndToken
    52   val idT: typ
    53   val longidT: typ
    54   val varT: typ
    55   val tidT: typ
    56   val tvarT: typ
    57   val terminals: string list
    58   val is_terminal: string -> bool
    59   val str_of_token: token -> string
    60   val display_token: token -> string
    61   val matching_tokens: token * token -> bool
    62   val valued_token: token -> bool
    63   val predef_term: string -> token option
    64   val tokenize: Scan.lexicon -> bool -> string list -> token list
    65 end;
    66 
    67 structure Lexicon: LEXICON =
    68 struct
    69 
    70 (** is_identifier etc. **)
    71 
    72 val is_identifier = Symbol.is_ident o Symbol.explode;
    73 
    74 fun is_ascii_identifier s =
    75   let val cs = Symbol.explode s
    76   in forall Symbol.is_ascii cs andalso Symbol.is_ident cs end;
    77 
    78 fun is_xid s =
    79   (case Symbol.explode s of
    80     "_" :: cs => Symbol.is_ident cs
    81   | cs => Symbol.is_ident cs);
    82 
    83 fun is_tid s =
    84   (case Symbol.explode s of
    85     "'" :: cs => Symbol.is_ident cs
    86   | _ => false);
    87 
    88 
    89 
    90 (** basic scanners **)
    91 
    92 val scan_id = Scan.one Symbol.is_letter ::: Scan.many Symbol.is_letdig;
    93 val scan_longid = scan_id @@@ (Scan.repeat1 ($$ "." ::: scan_id) >> flat);
    94 val scan_tid = $$ "'" ::: scan_id;
    95 
    96 val scan_nat = Scan.many1 Symbol.is_digit;
    97 val scan_int = $$ "-" ::: scan_nat || scan_nat;
    98 val scan_hex = $$ "0" ::: $$ "x" ::: Scan.many1 Symbol.is_ascii_hex;
    99 val scan_bin = $$ "0" ::: $$ "b" ::: Scan.many1 (fn s => s = "0" orelse s = "1");
   100 
   101 val scan_id_nat = scan_id @@@ Scan.optional ($$ "." ::: scan_nat) [];
   102 val scan_var = $$ "?" ::: scan_id_nat;
   103 val scan_tvar = $$ "?" ::: $$ "'" ::: scan_id_nat;
   104 
   105 
   106 
   107 (** datatype token **)
   108 
   109 datatype token =
   110   Token of string |
   111   IdentSy of string |
   112   LongIdentSy of string |
   113   VarSy of string |
   114   TFreeSy of string |
   115   TVarSy of string |
   116   NumSy of string |
   117   XNumSy of string |
   118   StrSy of string |
   119   EndToken;
   120 
   121 
   122 (* terminal arguments *)
   123 
   124 val idT = Type ("id", []);
   125 val longidT = Type ("longid", []);
   126 val varT = Type ("var", []);
   127 val tidT = Type ("tid", []);
   128 val tvarT = Type ("tvar", []);
   129 
   130 val terminals = ["id", "longid", "var", "tid", "tvar", "num", "xnum", "xstr"];
   131 val is_terminal = member (op =) terminals;
   132 
   133 
   134 (* str_of_token *)
   135 
   136 fun str_of_token (Token s) = s
   137   | str_of_token (IdentSy s) = s
   138   | str_of_token (LongIdentSy s) = s
   139   | str_of_token (VarSy s) = s
   140   | str_of_token (TFreeSy s) = s
   141   | str_of_token (TVarSy s) = s
   142   | str_of_token (NumSy s) = s
   143   | str_of_token (XNumSy s) = s
   144   | str_of_token (StrSy s) = s
   145   | str_of_token EndToken = "EOF";
   146 
   147 
   148 (* display_token *)
   149 
   150 fun display_token (Token s) = quote s
   151   | display_token (IdentSy s) = "id(" ^ s ^ ")"
   152   | display_token (LongIdentSy s) = "longid(" ^ s ^ ")"
   153   | display_token (VarSy s) = "var(" ^ s ^ ")"
   154   | display_token (TFreeSy s) = "tid(" ^ s ^ ")"
   155   | display_token (TVarSy s) = "tvar(" ^ s ^ ")"
   156   | display_token (NumSy s) = "num(" ^ s ^ ")"
   157   | display_token (XNumSy s) = "xnum(" ^ s ^ ")"
   158   | display_token (StrSy s) = "xstr(" ^ s ^ ")"
   159   | display_token EndToken = "";
   160 
   161 
   162 (* matching_tokens *)
   163 
   164 fun matching_tokens (Token x, Token y) = (x = y)
   165   | matching_tokens (IdentSy _, IdentSy _) = true
   166   | matching_tokens (LongIdentSy _, LongIdentSy _) = true
   167   | matching_tokens (VarSy _, VarSy _) = true
   168   | matching_tokens (TFreeSy _, TFreeSy _) = true
   169   | matching_tokens (TVarSy _, TVarSy _) = true
   170   | matching_tokens (NumSy _, NumSy _) = true
   171   | matching_tokens (XNumSy _, XNumSy _) = true
   172   | matching_tokens (StrSy _, StrSy _) = true
   173   | matching_tokens (EndToken, EndToken) = true
   174   | matching_tokens _ = false;
   175 
   176 
   177 (* valued_token *)
   178 
   179 fun valued_token (Token _) = false
   180   | valued_token (IdentSy _) = true
   181   | valued_token (LongIdentSy _) = true
   182   | valued_token (VarSy _) = true
   183   | valued_token (TFreeSy _) = true
   184   | valued_token (TVarSy _) = true
   185   | valued_token (NumSy _) = true
   186   | valued_token (XNumSy _) = true
   187   | valued_token (StrSy _) = true
   188   | valued_token EndToken = false;
   189 
   190 
   191 (* predef_term *)
   192 
   193 fun predef_term "id" = SOME (IdentSy "id")
   194   | predef_term "longid" = SOME (LongIdentSy "longid")
   195   | predef_term "var" = SOME (VarSy "var")
   196   | predef_term "tid" = SOME (TFreeSy "tid")
   197   | predef_term "tvar" = SOME (TVarSy "tvar")
   198   | predef_term "num" = SOME (NumSy "num")
   199   | predef_term "xnum" = SOME (XNumSy "xnum")
   200   | predef_term "xstr" = SOME (StrSy "xstr")
   201   | predef_term _ = NONE;
   202 
   203 
   204 (* xstr tokens *)
   205 
   206 fun lex_err msg prfx (cs, _) =
   207   "Inner lexical error: " ^ msg ^ " at " ^ quote (prfx ^ Symbol.beginning 10 cs);
   208 
   209 val scan_chr =
   210   $$ "\\" |-- $$ "'" ||
   211   Scan.one (fn s => s <> "\\" andalso s <> "'" andalso Symbol.is_regular s) ||
   212   $$ "'" --| Scan.ahead (~$$ "'");
   213 
   214 val scan_str =
   215   $$ "'" |-- $$ "'" |-- !! (lex_err "missing end of string" "''")
   216     (Scan.repeat scan_chr --| $$ "'" --| $$ "'");
   217 
   218 
   219 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   220 
   221 fun explode_xstr str =
   222   (case Scan.read Symbol.stopper scan_str (Symbol.explode str) of
   223     SOME cs => cs
   224   | _ => error ("Inner lexical error: literal string expected at " ^ quote str));
   225 
   226 
   227 (* nested comments *)
   228 
   229 val scan_cmt =
   230   Scan.depend (fn d => $$ "(" ^^ $$ "*" >> pair (d + 1)) ||
   231   Scan.depend (fn 0 => Scan.fail | d => $$ "*" ^^ $$ ")" >> pair (d - 1)) ||
   232   Scan.lift ($$ "*" --| Scan.ahead (~$$ ")")) ||
   233   Scan.lift (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s));
   234 
   235 val scan_comment =
   236   $$ "(" -- $$ "*" -- !! (lex_err "missing end of comment" "(*")
   237     (Scan.pass 0 (Scan.repeat scan_cmt) -- $$ "*" -- $$ ")")
   238   >> K ();
   239 
   240 
   241 
   242 (** tokenize **)
   243 
   244 fun tokenize lex xids chs =
   245   let
   246     fun token kind cs = (kind, implode cs);
   247 
   248     val scan_xid =
   249       if xids then $$ "_" ::: scan_id || scan_id
   250       else scan_id;
   251 
   252     val scan_num = scan_hex || scan_bin || scan_int;
   253 
   254     val scan_val =
   255       scan_tvar >> token TVarSy ||
   256       scan_var >> token VarSy ||
   257       scan_tid >> token TFreeSy ||
   258       scan_num >> token NumSy ||
   259       $$ "#" ::: scan_num >> token XNumSy ||
   260       scan_longid >> token LongIdentSy ||
   261       scan_xid >> token IdentSy;
   262 
   263     val scan_lit = Scan.literal lex >> token Token;
   264 
   265     val scan_token =
   266       scan_comment >> K NONE ||
   267       Scan.max (op <= o pairself snd) scan_lit scan_val >> (fn (tk, s) => SOME (tk s)) ||
   268       scan_str >> (SOME o StrSy o implode_xstr) ||
   269       Scan.one Symbol.is_blank >> K NONE;
   270   in
   271     (case Scan.error (Scan.finite Symbol.stopper (Scan.repeat scan_token)) chs of
   272       (toks, []) => map_filter I toks
   273     | (_, cs) => error ("Inner lexical error at: " ^ quote (implode cs)))
   274   end;
   275 
   276 
   277 
   278 (** scan variables **)
   279 
   280 (* scan_indexname *)
   281 
   282 local
   283 
   284 fun scan_vname chrs =
   285   let
   286     fun nat n [] = n
   287       | nat n (c :: cs) = nat (n * 10 + (ord c - ord "0")) cs;
   288 
   289     fun idxname cs ds = (implode (rev cs), nat 0 ds);
   290     fun chop_idx [] ds = idxname [] ds
   291       | chop_idx (cs as (_ :: "\\<^isub>" :: _)) ds = idxname cs ds
   292       | chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds
   293       | chop_idx (c :: cs) ds =
   294           if Symbol.is_digit c then chop_idx cs (c :: ds)
   295           else idxname (c :: cs) ds;
   296 
   297     val scan = scan_id -- Scan.optional ($$ "." |-- scan_nat >> nat 0) ~1;
   298   in
   299     (case scan chrs of
   300       ((cs, ~1), cs') => (chop_idx (rev cs) [], cs')
   301     | ((cs, i), cs') => ((implode cs, i), cs'))
   302   end;
   303 
   304 in
   305 
   306 val scan_indexname =
   307      $$ "'" |-- scan_vname >> (fn (x, i) => ("'" ^ x, i))
   308   || scan_vname;
   309 
   310 end;
   311 
   312 
   313 (* indexname *)
   314 
   315 fun read_indexname s =
   316   (case Scan.read Symbol.stopper scan_indexname (Symbol.explode s) of
   317     SOME xi => xi
   318   | _ => error ("Lexical error in variable name: " ^ quote s));
   319 
   320 
   321 (* read_var *)
   322 
   323 fun const c = Const (c, dummyT);
   324 fun free x = Free (x, dummyT);
   325 fun var xi = Var (xi, dummyT);
   326 
   327 fun read_var str =
   328   let
   329     val scan =
   330       $$ "?" |-- scan_indexname --| Scan.ahead (Scan.one Symbol.is_eof) >> var ||
   331       Scan.many Symbol.is_regular >> (free o implode);
   332   in the (Scan.read Symbol.stopper scan (Symbol.explode str)) end;
   333 
   334 
   335 (* read_variable *)
   336 
   337 fun read_variable str =
   338   let val scan = $$ "?" |-- scan_indexname || scan_indexname
   339   in Scan.read Symbol.stopper scan (Symbol.explode str) end;
   340 
   341 
   342 (* specific identifiers *)
   343 
   344 val constN = "\\<^const>";
   345 val fixedN = "\\<^fixed>";
   346 
   347 
   348 (* read numbers *)
   349 
   350 local
   351 
   352 fun nat cs =
   353   Option.map (#1 o Library.read_int)
   354     (Scan.read Symbol.stopper scan_nat cs);
   355 
   356 in
   357 
   358 val read_nat = nat o Symbol.explode;
   359 
   360 fun read_int s =
   361   (case Symbol.explode s of
   362     "-" :: cs => Option.map ~ (nat cs)
   363   | cs => nat cs);
   364 
   365 end;
   366 
   367 
   368 (* read_xnum: hex/bin/decimal *)
   369 
   370 local
   371 
   372 val ten = ord "0" + 10;
   373 val a = ord "a";
   374 val A = ord "A";
   375 val _ = a > A orelse sys_error "Bad ASCII";
   376 
   377 fun remap_hex c =
   378   let val x = ord c in
   379     if x >= a then chr (x - a + ten)
   380     else if x >= A then chr (x - A + ten)
   381     else c
   382   end;
   383 
   384 fun leading_zeros ["0"] = 0
   385   | leading_zeros ("0" :: cs) = 1 + leading_zeros cs
   386   | leading_zeros _ = 0;
   387 
   388 in
   389 
   390 fun read_xnum str =
   391   let
   392     val (sign, radix, digs) =
   393       (case Symbol.explode (perhaps (try (unprefix "#")) str) of
   394         "0" :: "x" :: cs => (1, 16, map remap_hex cs)
   395       | "0" :: "b" :: cs => (1, 2, cs)
   396       | "-" :: cs => (~1, 10, cs)
   397       | cs => (1, 10, cs));
   398     val value = sign * #1 (Library.read_radix_int radix digs);
   399   in {radix = radix, leading_zeros = leading_zeros digs, value = value} end;
   400 
   401 end;
   402 
   403 end;