src/Pure/Syntax/lexicon.ML
changeset 4703 a50ab39756db
parent 4587 6bce9ef27d7e
child 4902 8fbccead3695
equal deleted inserted replaced
4702:ffbaf431665d 4703:a50ab39756db
     1 (*  Title:      Pure/Syntax/lexicon.ML
     1 (*  Title:      Pure/Syntax/lexicon.ML
     2     ID:         $Id$
     2     ID:         $Id$
     3     Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
     3     Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
     4 
     4 
     5 Scanner combinators and the main lexer (for terms and types).
     5 Lexer for the inner Isabelle syntax (terms and types).
     6 *)
     6 *)
     7 
       
     8 infix 5 -- ^^;
       
     9 infix 3 >>;
       
    10 infix 0 ||;
       
    11 
       
    12 signature SCANNER =
       
    13 sig
       
    14   exception LEXICAL_ERROR
       
    15   val >> : ('a -> 'b * 'c) * ('b -> 'd) -> 'a -> 'd * 'c
       
    16   val || : ('a -> 'b) * ('a -> 'b) -> 'a -> 'b
       
    17   val -- : ('a -> 'b * 'c) * ('c -> 'd * 'e) -> 'a -> ('b * 'd) * 'e
       
    18   val ^^ : ('a -> string * 'b) * ('b -> string * 'c) -> 'a -> string * 'c
       
    19   val $$ : ''a -> ''a list -> ''a * ''a list
       
    20   val scan_empty: 'a list -> 'b list * 'a list
       
    21   val scan_one: ('a -> bool) -> 'a list -> 'a * 'a list
       
    22   val scan_any: ('a -> bool) -> 'a list -> 'a list * 'a list
       
    23   val scan_any1: ('a -> bool) -> 'a list -> 'a list * 'a list
       
    24   val scan_end: 'a list -> 'b list * 'a list
       
    25   val optional: ('a -> 'b * 'a) -> 'a -> 'b option * 'a
       
    26   val repeat: ('a -> 'b * 'a) -> 'a -> 'b list * 'a
       
    27   val repeat1: ('a -> 'b * 'a) -> 'a -> 'b list * 'a
       
    28   val max_of: ('a -> ('b * string) * 'a) -> ('a -> ('b * string) * 'a)
       
    29     -> 'a -> ('b * string) option * 'a
       
    30   val scan_id: string list -> string * string list
       
    31   val scan_longid: string list -> string * string list
       
    32   val scan_tid: string list -> string * string list
       
    33   val scan_nat: string list -> string * string list
       
    34   type lexicon
       
    35   val dest_lexicon: lexicon -> string list
       
    36   val empty_lexicon: lexicon
       
    37   val extend_lexicon: lexicon -> string list -> lexicon
       
    38   val make_lexicon: string list -> lexicon
       
    39   val merge_lexicons: lexicon -> lexicon -> lexicon
       
    40   val scan_literal: lexicon -> string list -> string * string list
       
    41 end;
       
    42 
     7 
    43 signature LEXICON0 =
     8 signature LEXICON0 =
    44 sig
     9 sig
    45   val is_identifier: string -> bool
    10   val is_identifier: string -> bool
    46   val implode_xstr: string list -> string
    11   val implode_xstr: string list -> string
    47   val explode_xstr: string -> string list
    12   val explode_xstr: string -> string list
       
    13   val scan_id: string list -> string * string list
       
    14   val scan_longid: string list -> string * string list
       
    15   val scan_var: string list -> string * string list
       
    16   val scan_tid: string list -> string * string list
       
    17   val scan_nat: string list -> string * string list
       
    18   val scan_int: string list -> string * string list
    48   val string_of_vname: indexname -> string
    19   val string_of_vname: indexname -> string
    49   val string_of_vname': indexname -> string
    20   val string_of_vname': indexname -> string
    50   val scan_varname: string list -> indexname * string list
    21   val indexname: string list -> indexname
    51   val scan_var: string -> term
    22   val read_var: string -> term
    52   val const: string -> term
    23   val const: string -> term
    53   val free: string -> term
    24   val free: string -> term
    54   val var: indexname -> term
    25   val var: indexname -> term
    55   val read_var: string -> string * int
       
    56 end;
    26 end;
    57 
    27 
    58 signature LEXICON =
    28 signature LEXICON =
    59 sig
    29 sig
    60   include SCANNER
       
    61   include LEXICON0
    30   include LEXICON0
    62   val is_xid: string -> bool
    31   val is_xid: string -> bool
    63   val is_tid: string -> bool
    32   val is_tid: string -> bool
    64   datatype token =
    33   datatype token =
    65     Token of string |
    34     Token of string |
    82   val display_token: token -> string
    51   val display_token: token -> string
    83   val matching_tokens: token * token -> bool
    52   val matching_tokens: token * token -> bool
    84   val token_assoc: (token option * 'a list) list * token -> 'a list
    53   val token_assoc: (token option * 'a list) list * token -> 'a list
    85   val valued_token: token -> bool
    54   val valued_token: token -> bool
    86   val predef_term: string -> token option
    55   val predef_term: string -> token option
    87   val tokenize: lexicon -> bool -> string list -> token list
    56   val tokenize: Scan.lexicon -> bool -> string list -> token list
    88 end;
    57 end;
    89 
    58 
    90 structure Lexicon : LEXICON =
    59 structure Lexicon : LEXICON =
    91 struct
    60 struct
    92 
    61 
    93 
    62 
    94 (** is_identifier etc. **)
    63 (** is_identifier etc. **)
    95 
    64 
    96 fun is_ident [] = false
    65 fun is_ident [] = false
    97   | is_ident (c :: cs) = is_letter c andalso forall is_letdig cs;
    66   | is_ident (c :: cs) = Symbol.is_letter c andalso forall Symbol.is_letdig cs;
    98 
    67 
    99 val is_identifier = is_ident o explode;
    68 val is_identifier = is_ident o Symbol.explode;
   100 
    69 
   101 fun is_xid s =
    70 fun is_xid s =
   102   (case explode s of
    71   (case Symbol.explode s of
   103     "_" :: cs => is_ident cs
    72     "_" :: cs => is_ident cs
   104   | cs => is_ident cs);
    73   | cs => is_ident cs);
   105 
    74 
   106 fun is_tid s =
    75 fun is_tid s =
   107   (case explode s of
    76   (case Symbol.explode s of
   108     "'" :: cs => is_ident cs
    77     "'" :: cs => is_ident cs
   109   | _ => false);
    78   | _ => false);
   110 
    79 
   111 
    80 
   112 
    81 
       
    82 (** basic scanners **)
       
    83 
       
    84 val scan_letter_letdigs = Scan.one Symbol.is_letter -- Scan.any Symbol.is_letdig >> op ::;
       
    85 val scan_digits1 = Scan.any1 Symbol.is_digit;
       
    86 
       
    87 val scan_id = scan_letter_letdigs >> implode;
       
    88 val scan_longid = scan_id ^^ (Scan.repeat1 ($$ "." ^^ scan_id) >> implode);
       
    89 val scan_tid = $$ "'" ^^ scan_id;
       
    90 
       
    91 val scan_nat = scan_digits1 >> implode;
       
    92 val scan_int = $$ "~" ^^ scan_nat || scan_nat;
       
    93 
       
    94 val scan_id_nat = scan_id ^^ Scan.optional ($$ "." ^^ scan_nat) "";
       
    95 val scan_var = $$ "?" ^^ scan_id_nat;
       
    96 
       
    97 
       
    98 
   113 (** string_of_vname **)
    99 (** string_of_vname **)
   114 
   100 
   115 fun string_of_vname (x, i) =
   101 fun string_of_vname (x, i) =
   116   let
   102   let
   117     val si = string_of_int i;
   103     val si = string_of_int i;
       
   104     val dot = Symbol.is_digit (last_elem (Symbol.explode x)) handle _ => true;
   118   in
   105   in
   119     (if is_digit (last_elem (explode x)) then "?" ^ x ^ "." ^ si
   106     if dot then "?" ^ x ^ "." ^ si
   120      else if i = 0 then "?" ^ x
   107     else if i = 0 then "?" ^ x
   121      else "?" ^ x ^ si)
   108     else "?" ^ x ^ si
   122     handle LIST _ => "?NULL_VARIABLE." ^ si
       
   123   end;
   109   end;
   124 
   110 
   125 fun string_of_vname' (xi as (x, i)) =
   111 fun string_of_vname' (x, ~1) = x
   126   if i < 0 then x else string_of_vname xi;
   112   | string_of_vname' xi = string_of_vname xi;
   127 
   113 
   128 
   114 
   129 
   115 
   130 (** datatype token **)
   116 (** datatype token **)
   131 
   117 
   229   | predef_term "xnum" = Some (NumSy "xnum")
   215   | predef_term "xnum" = Some (NumSy "xnum")
   230   | predef_term "xstr" = Some (StrSy "xstr")
   216   | predef_term "xstr" = Some (StrSy "xstr")
   231   | predef_term _ = None;
   217   | predef_term _ = None;
   232 
   218 
   233 
   219 
   234 (*  xstr tokens *)
   220 (* xstr tokens *)
   235 
   221 
   236 fun implode_xstr cs = enclose "''" "''" (implode cs);
   222 val scan_chr =
       
   223   $$ "\\" |-- Scan.one Symbol.not_eof ||
       
   224   Scan.one (not_equal "'" andf Symbol.not_eof) ||
       
   225   $$ "'" --| Scan.ahead (Scan.one (not_equal "'"));
       
   226 
       
   227 val scan_str =
       
   228   $$ "'" |-- $$ "'" |--
       
   229     !! (fn cs => "Inner lexical error: malformed literal string at " ^ quote ("''" ^ beginning cs))
       
   230       (Scan.repeat scan_chr --| $$ "'" --| $$ "'");
       
   231 
       
   232 
       
   233 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   237 
   234 
   238 fun explode_xstr str =
   235 fun explode_xstr str =
   239   rev (tl (tl (rev (tl (tl (explode str))))));
   236   #1 (Scan.error (Scan.finite Symbol.eof scan_str) (Symbol.explode str));
   240 
       
   241 
       
   242 
       
   243 (** datatype lexicon **)
       
   244 
       
   245 datatype lexicon =
       
   246   Empty |
       
   247   Branch of string * string * lexicon * lexicon * lexicon;
       
   248 
       
   249 val no_token = "";
       
   250 
       
   251 
       
   252 (* dest_lexicon *)
       
   253 
       
   254 fun dest_lexicon Empty = []
       
   255   | dest_lexicon (Branch (_, "", lt, eq, gt)) =
       
   256       dest_lexicon eq @ dest_lexicon lt @ dest_lexicon gt
       
   257   | dest_lexicon (Branch (_, str, lt, eq, gt)) =
       
   258       str :: (dest_lexicon eq @ dest_lexicon lt @ dest_lexicon gt);
       
   259 
       
   260 
       
   261 (* empty, extend, make, merge lexicons *)
       
   262 
       
   263 val empty_lexicon = Empty;
       
   264 
       
   265 fun extend_lexicon lexicon strs =
       
   266   let
       
   267     fun ext (lex, s) =
       
   268       let
       
   269         fun add (Branch (d, a, lt, eq, gt)) (chs as c :: cs) =
       
   270               if c < d then Branch (d, a, add lt chs, eq, gt)
       
   271               else if c > d then Branch (d, a, lt, eq, add gt chs)
       
   272               else Branch (d, if null cs then s else a, lt, add eq cs, gt)
       
   273           | add Empty [c] =
       
   274               Branch (c, s, Empty, Empty, Empty)
       
   275           | add Empty (c :: cs) =
       
   276               Branch (c, no_token, Empty, add Empty cs, Empty)
       
   277           | add lex [] = lex;
       
   278 
       
   279         val cs = explode s;
       
   280       in
       
   281         if exists is_blank cs then
       
   282           sys_error ("extend_lexicon: blank in delimiter " ^ quote s)
       
   283         else add lex cs
       
   284       end;
       
   285   in
       
   286     foldl ext (lexicon, strs \\ dest_lexicon lexicon)
       
   287   end;
       
   288 
       
   289 val make_lexicon = extend_lexicon empty_lexicon;
       
   290 
       
   291 fun merge_lexicons lex1 lex2 =
       
   292   let
       
   293     val strs1 = dest_lexicon lex1;
       
   294     val strs2 = dest_lexicon lex2;
       
   295   in
       
   296     if strs2 subset strs1 then lex1
       
   297     else if strs1 subset strs2 then lex2
       
   298     else extend_lexicon lex1 strs2
       
   299   end;
       
   300 
       
   301 
       
   302 
       
   303 (** scanners **)
       
   304 
       
   305 exception LEXICAL_ERROR;
       
   306 
       
   307 
       
   308 (* scanner combinators *)
       
   309 
       
   310 fun (scan >> f) cs = apfst f (scan cs);
       
   311 
       
   312 fun (scan1 || scan2) cs = scan1 cs handle LEXICAL_ERROR => scan2 cs;
       
   313 
       
   314 fun (scan1 -- scan2) cs =
       
   315   let
       
   316     val (x, cs') = scan1 cs;
       
   317     val (y, cs'') = scan2 cs';
       
   318   in
       
   319     ((x, y), cs'')
       
   320   end;
       
   321 
       
   322 fun (scan1 ^^ scan2) = scan1 -- scan2 >> op ^;
       
   323 
       
   324 
       
   325 (* generic scanners *)
       
   326 
       
   327 fun $$ _ [] = raise LEXICAL_ERROR
       
   328   | $$ a (c :: cs) =
       
   329       if a = c then (c, cs) else raise LEXICAL_ERROR;
       
   330 
       
   331 fun scan_empty cs = ([], cs);
       
   332 
       
   333 fun scan_one _ [] = raise LEXICAL_ERROR
       
   334   | scan_one pred (c :: cs) =
       
   335       if pred c then (c, cs) else raise LEXICAL_ERROR;
       
   336 
       
   337 fun scan_any _ [] = ([], [])
       
   338   | scan_any pred (chs as c :: cs) =
       
   339       if pred c then apfst (cons c) (scan_any pred cs)
       
   340       else ([], chs);
       
   341 
       
   342 fun scan_any1 pred = scan_one pred -- scan_any pred >> op ::;
       
   343 
       
   344 fun scan_rest cs = (cs, []);
       
   345 
       
   346 fun scan_end [] = ([], [])
       
   347   | scan_end _ = raise LEXICAL_ERROR;
       
   348 
       
   349 fun optional scan = scan >> Some || scan_empty >> K None;
       
   350 
       
   351 fun repeat scan cs = (scan -- repeat scan >> op :: || scan_empty) cs;
       
   352 
       
   353 fun repeat1 scan = scan -- repeat scan >> op ::;
       
   354 
       
   355 fun max_of scan1 scan2 cs =
       
   356   (case (optional scan1 cs, optional scan2 cs) of
       
   357     (tok1, (None, _)) => tok1
       
   358   | ((None, _), tok2) => tok2
       
   359   | (tok1 as (Some (_, s1), _), tok2 as (Some (_, s2), _)) =>
       
   360       if size s1 >= size s2 then tok1 else tok2);
       
   361 
       
   362 
       
   363 (* other scanners *)
       
   364 
       
   365 val scan_letter_letdigs = scan_one is_letter -- scan_any is_letdig >> op ::;
       
   366 
       
   367 val scan_digits1 = scan_any1 is_digit;
       
   368 
       
   369 val scan_id = scan_letter_letdigs >> implode;
       
   370 
       
   371 val scan_longid = scan_id ^^ (repeat1 ($$ "." ^^ scan_id) >> implode);
       
   372 
       
   373 val scan_id_nat =
       
   374   scan_id ^^ ($$ "." ^^ (scan_digits1 >> implode) || scan_empty >> K "");
       
   375 
       
   376 val scan_tid = $$ "'" ^^ scan_id;
       
   377 
       
   378 val scan_nat = scan_digits1 >> implode;
       
   379 
       
   380 val scan_int = $$ "~" ^^ scan_nat || scan_nat;
       
   381 
       
   382 
       
   383 (* scan_literal *)
       
   384 
       
   385 fun scan_literal lex chrs =
       
   386   let
       
   387     fun scan_lit _ s_cs [] = s_cs
       
   388       | scan_lit Empty s_cs _ = s_cs
       
   389       | scan_lit (Branch (d, a, lt, eq, gt)) s_cs (chs as c :: cs) =
       
   390           if c < d then scan_lit lt s_cs chs
       
   391           else if c > d then scan_lit gt s_cs chs
       
   392           else scan_lit eq (if a = no_token then s_cs else Some (a, cs)) cs;
       
   393   in
       
   394     (case scan_lit lex None chrs of
       
   395       None => raise LEXICAL_ERROR
       
   396     | Some s_cs => s_cs)
       
   397   end;
       
   398 
   237 
   399 
   238 
   400 
   239 
   401 (** tokenize **)
   240 (** tokenize **)
   402 
   241 
   404   let
   243   let
   405     val scan_xid =
   244     val scan_xid =
   406       if xids then $$ "_" ^^ scan_id || scan_id
   245       if xids then $$ "_" ^^ scan_id || scan_id
   407       else scan_id;
   246       else scan_id;
   408 
   247 
   409     val scan_lit = scan_literal lex >> pair Token;
       
   410 
       
   411     val scan_val =
   248     val scan_val =
   412       $$ "?" ^^ $$ "'" ^^ scan_id_nat >> pair TVarSy ||
   249       $$ "?" ^^ $$ "'" ^^ scan_id_nat >> pair TVarSy ||
   413       $$ "?" ^^ scan_id_nat >> pair VarSy ||
   250       scan_var >> pair VarSy ||
   414       $$ "'" ^^ scan_id >> pair TFreeSy ||
   251       scan_tid >> pair TFreeSy ||
   415       $$ "#" ^^ scan_int >> pair NumSy ||
   252       $$ "#" ^^ scan_int >> pair NumSy ||		(* FIXME remove "#" *)
   416       scan_longid >> pair LongIdentSy ||
   253       scan_longid >> pair LongIdentSy ||
   417       scan_xid >> pair IdentSy;
   254       scan_xid >> pair IdentSy;
   418 
   255 
   419     fun scan_str ("'" :: "'" :: cs) = ([], cs)
   256     val scan_lit = Scan.literal lex >> (pair Token o implode);
   420       | scan_str ("\\" :: c :: cs) = apfst (cons c) (scan_str cs)
   257 
   421       | scan_str (c :: cs) = apfst (cons c) (scan_str cs)
   258     val scan_token =
   422       | scan_str [] = raise ERROR;
   259       Scan.max (op <= o pairself snd) scan_lit scan_val >> (fn (tk, s) => Some (tk s)) ||
   423 
   260       scan_str >> (Some o StrSy o implode_xstr) ||
   424 
   261       Scan.one Symbol.is_blank >> K None;
   425     fun scan (rev_toks, []) = rev (EndToken :: rev_toks)
       
   426       | scan (rev_toks, chs as "'" :: "'" :: cs) =
       
   427           let
       
   428             val (cs', cs'') = scan_str cs handle ERROR =>
       
   429               error ("Lexical error: missing quotes at end of string " ^
       
   430                 quote (implode chs));
       
   431           in
       
   432             scan (StrSy (implode_xstr cs') :: rev_toks, cs'')
       
   433           end
       
   434       | scan (rev_toks, chs as c :: cs) =
       
   435           if is_blank c then scan (rev_toks, cs)
       
   436           else
       
   437             (case max_of scan_lit scan_val chs of
       
   438               (None, _) => error ("Lexical error at: " ^ quote (implode chs))
       
   439             | (Some (tk, s), chs') => scan (tk s :: rev_toks, chs'));
       
   440   in
   262   in
   441     scan ([], chs)
   263     (case Scan.error (Scan.finite Symbol.eof (Scan.repeat scan_token)) chs of
       
   264       (toks, []) => mapfilter I toks @ [EndToken]
       
   265     | (_, cs) => error ("Inner lexical error at: " ^ quote (implode cs)))
   442   end;
   266   end;
   443 
   267 
   444 
   268 
   445 
   269 
   446 (** scan variables **)
   270 (** scan variables **)
   450 fun scan_vname chrs =
   274 fun scan_vname chrs =
   451   let
   275   let
   452     fun nat_of_chs n [] = n
   276     fun nat_of_chs n [] = n
   453       | nat_of_chs n (c :: cs) = nat_of_chs (n * 10 + (ord c - ord "0")) cs;
   277       | nat_of_chs n (c :: cs) = nat_of_chs (n * 10 + (ord c - ord "0")) cs;
   454 
   278 
   455     val nat_of = nat_of_chs 0;
   279     val nat = nat_of_chs 0;
   456 
   280 
   457     fun split_vname chs =
   281     fun split_vname chs =
   458       let val (cs, ds) = take_suffix is_digit chs
   282       let val (cs, ds) = take_suffix Symbol.is_digit chs
   459       in (implode cs, nat_of ds) end
   283       in (implode cs, nat ds) end
   460 
   284 
   461     val scan =
   285     val scan =
   462       scan_letter_letdigs --
   286       scan_letter_letdigs -- Scan.optional ($$ "." |-- scan_digits1 >> nat) ~1;
   463         ($$ "." -- scan_digits1 >> (nat_of o #2) || scan_empty >> K ~1);
       
   464   in
   287   in
   465     (case scan chrs of
   288     (case scan chrs of
   466       ((cs, ~1), cs') => (split_vname cs, cs')
   289       ((cs, ~1), cs') => (split_vname cs, cs')
   467     | ((cs, i), cs') => ((implode cs, i), cs'))
   290     | ((cs, i), cs') => ((implode cs, i), cs'))
   468   end;
   291   end;
   469 
   292 
   470 
   293 
   471 (* scan_varname *)
   294 (* indexname *)
   472 
   295 
   473 fun scan_varname chs =
   296 fun indexname cs =
   474   scan_vname chs handle LEXICAL_ERROR
   297   (case Scan.error (Scan.finite Symbol.eof (Scan.option scan_vname)) cs of
   475     => error ("scan_varname: bad varname " ^ quote (implode chs));
   298     (Some xi, []) => xi
   476 
   299   | _ => error ("Lexical error in variable name: " ^ quote (implode cs)));
   477 
   300 
   478 (* scan_var *)
   301 
       
   302 (* read_var *)
   479 
   303 
   480 fun const c = Const (c, dummyT);
   304 fun const c = Const (c, dummyT);
   481 fun free x = Free (x, dummyT);
   305 fun free x = Free (x, dummyT);
   482 fun var xi = Var (xi, dummyT);
   306 fun var xi = Var (xi, dummyT);
   483 
   307 
   484 fun scan_var str =
   308 fun read_var str =
   485   let
   309   let
   486     fun tvar (x, i) = var ("'" ^ x, i);
   310     fun tvar (x, i) = var ("'" ^ x, i);
   487 
   311 
   488     val scan =
   312     val scan =
   489       $$ "?" -- $$ "'" -- scan_vname -- scan_end >> (tvar o #2 o #1) ||
   313       $$ "?" |-- $$ "'" |-- scan_vname >> tvar ||
   490       $$ "?" -- scan_vname -- scan_end >> (var o #2 o #1) ||
   314       $$ "?" |-- scan_vname >> var ||
   491       scan_rest >> (free o implode);
   315       Scan.any Symbol.not_eof >> (free o implode);
   492   in
   316   in
   493     #1 (scan (explode str))
   317     #1 (Scan.error (Scan.finite Symbol.eof scan) (Symbol.explode str))
   494   end;
   318   end;
   495 
   319 
   496 
   320 
   497 (* read_var *)
       
   498 
       
   499 fun read_var str =
       
   500   let val scan = $$ "?" -- scan_vname -- scan_end >> (#2 o #1) in
       
   501     #1 (scan (explode str)) handle LEXICAL_ERROR
       
   502       => error ("Bad variable " ^ quote str)
       
   503   end;
       
   504 
       
   505 
       
   506 end;
   321 end;