src/Pure/Syntax/lexicon.ML
changeset 27773 a52166b228b9
parent 27121 38367dbccae5
child 27800 df444ddeff56
equal deleted inserted replaced
27772:b933c997eab3 27773:a52166b228b9
     8 signature LEXICON0 =
     8 signature LEXICON0 =
     9 sig
     9 sig
    10   val is_identifier: string -> bool
    10   val is_identifier: string -> bool
    11   val is_ascii_identifier: string -> bool
    11   val is_ascii_identifier: string -> bool
    12   val is_tid: string -> bool
    12   val is_tid: string -> bool
       
    13   val scan_id: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    14   val scan_longid: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    15   val scan_tid: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    16   val scan_nat: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    17   val scan_int: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    18   val scan_hex: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    19   val scan_bin: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    20   val scan_var: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
       
    21   val scan_tvar: SymbolPos.T list -> SymbolPos.T list * SymbolPos.T list
    13   val implode_xstr: string list -> string
    22   val implode_xstr: string list -> string
    14   val explode_xstr: string -> string list
    23   val explode_xstr: string -> string list
    15   val scan_id: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    16   val scan_longid: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    17   val scan_tid: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    18   val scan_nat: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    19   val scan_int: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    20   val scan_hex: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    21   val scan_bin: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    22   val scan_var: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    23   val scan_tvar: Symbol.symbol list -> Symbol.symbol list * Symbol.symbol list
       
    24   val read_indexname: string -> indexname
    24   val read_indexname: string -> indexname
    25   val read_var: string -> term
    25   val read_var: string -> term
    26   val read_variable: string -> indexname option
    26   val read_variable: string -> indexname option
    27   val const: string -> term
    27   val const: string -> term
    28   val free: string -> term
    28   val free: string -> term
    59   val str_of_token: token -> string
    59   val str_of_token: token -> string
    60   val display_token: token -> string
    60   val display_token: token -> string
    61   val matching_tokens: token * token -> bool
    61   val matching_tokens: token * token -> bool
    62   val valued_token: token -> bool
    62   val valued_token: token -> bool
    63   val predef_term: string -> token option
    63   val predef_term: string -> token option
    64   val tokenize: Scan.lexicon -> bool -> string list -> token list
    64   val tokenize: Scan.lexicon -> bool -> SymbolPos.T list -> token list
    65 end;
    65 end;
    66 
    66 
    67 structure Lexicon: LEXICON =
    67 structure Lexicon: LEXICON =
    68 struct
    68 struct
    69 
    69 
    87 
    87 
    88 
    88 
    89 
    89 
    90 (** basic scanners **)
    90 (** basic scanners **)
    91 
    91 
    92 val scan_id = Scan.one Symbol.is_letter ::: Scan.many Symbol.is_letdig;
    92 open BasicSymbolPos;
    93 val scan_longid = scan_id @@@ (Scan.repeat1 ($$ "." ::: scan_id) >> flat);
    93 
    94 val scan_tid = $$ "'" ::: scan_id;
    94 fun !!! msg = SymbolPos.!!! ("Inner lexical error: " ^ msg);
    95 
    95 
    96 val scan_nat = Scan.many1 Symbol.is_digit;
    96 val scan_id = Scan.one (Symbol.is_letter o symbol) ::: Scan.many (Symbol.is_letdig o symbol);
    97 val scan_int = $$ "-" ::: scan_nat || scan_nat;
    97 val scan_longid = scan_id @@@ (Scan.repeat1 ($$$ "." @@@ scan_id) >> flat);
    98 val scan_hex = $$ "0" ::: $$ "x" ::: Scan.many1 Symbol.is_ascii_hex;
    98 val scan_tid = $$$ "'" @@@ scan_id;
    99 val scan_bin = $$ "0" ::: $$ "b" ::: Scan.many1 (fn s => s = "0" orelse s = "1");
    99 
   100 
   100 val scan_nat = Scan.many1 (Symbol.is_digit o symbol);
   101 val scan_id_nat = scan_id @@@ Scan.optional ($$ "." ::: scan_nat) [];
   101 val scan_int = $$$ "-" @@@ scan_nat || scan_nat;
   102 val scan_var = $$ "?" ::: scan_id_nat;
   102 val scan_hex = $$$ "0" @@@ $$$ "x" @@@ Scan.many1 (Symbol.is_ascii_hex o symbol);
   103 val scan_tvar = $$ "?" ::: $$ "'" ::: scan_id_nat;
   103 val scan_bin = $$$ "0" @@@ $$$ "b" @@@ Scan.many1 (fn (s, _) => s = "0" orelse s = "1");
       
   104 
       
   105 val scan_id_nat = scan_id @@@ Scan.optional ($$$ "." @@@ scan_nat) [];
       
   106 val scan_var = $$$ "?" @@@ scan_id_nat;
       
   107 val scan_tvar = $$$ "?" @@@ $$$ "'" @@@ scan_id_nat;
   104 
   108 
   105 
   109 
   106 
   110 
   107 (** datatype token **)
   111 (** datatype token **)
   108 
   112 
   201   | predef_term _ = NONE;
   205   | predef_term _ = NONE;
   202 
   206 
   203 
   207 
   204 (* xstr tokens *)
   208 (* xstr tokens *)
   205 
   209 
   206 fun lex_err msg prfx (cs, _) =
       
   207   "Inner lexical error: " ^ msg ^ " at " ^ quote (prfx ^ Symbol.beginning 10 cs);
       
   208 
       
   209 val scan_chr =
   210 val scan_chr =
   210   $$ "\\" |-- $$ "'" ||
   211   $$$ "\\" |-- $$$ "'" ||
   211   Scan.one (fn s => s <> "\\" andalso s <> "'" andalso Symbol.is_regular s) ||
   212   Scan.one ((fn s => s <> "\\" andalso s <> "'" andalso Symbol.is_regular s) o symbol) >> single ||
   212   $$ "'" --| Scan.ahead (~$$ "'");
   213   $$$ "'" --| Scan.ahead (~$$$ "'");
   213 
   214 
   214 val scan_str =
   215 val scan_str =
   215   $$ "'" |-- $$ "'" |-- !! (lex_err "missing end of string" "''")
   216   $$$ "'" |-- $$$ "'" |-- !!! "missing end of string"
   216     (Scan.repeat scan_chr --| $$ "'" --| $$ "'");
   217     ((Scan.repeat scan_chr >> flat) --| $$$ "'" --| $$$ "'");
   217 
   218 
   218 
   219 
   219 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   220 fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'" | c => c) cs));
   220 
   221 
   221 fun explode_xstr str =
   222 fun explode_xstr str =
   222   (case Scan.read Symbol.stopper scan_str (Symbol.explode str) of
   223   (case Scan.read SymbolPos.stopper scan_str (SymbolPos.explode (str, Position.none)) of
   223     SOME cs => cs
   224     SOME cs => map symbol cs
   224   | _ => error ("Inner lexical error: literal string expected at " ^ quote str));
   225   | _ => error ("Inner lexical error: literal string expected at " ^ quote str));
   225 
   226 
   226 
   227 
   227 (* nested comments *)
       
   228 
       
   229 val scan_cmt =
       
   230   Scan.depend (fn d => $$ "(" ^^ $$ "*" >> pair (d + 1)) ||
       
   231   Scan.depend (fn 0 => Scan.fail | d => $$ "*" ^^ $$ ")" >> pair (d - 1)) ||
       
   232   Scan.lift ($$ "*" --| Scan.ahead (~$$ ")")) ||
       
   233   Scan.lift (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s));
       
   234 
       
   235 val scan_comment =
       
   236   $$ "(" -- $$ "*" -- !! (lex_err "missing end of comment" "(*")
       
   237     (Scan.pass 0 (Scan.repeat scan_cmt) -- $$ "*" -- $$ ")")
       
   238   >> K ();
       
   239 
       
   240 
       
   241 
       
   242 (** tokenize **)
   228 (** tokenize **)
   243 
   229 
   244 fun tokenize lex xids chs =
   230 fun tokenize lex xids inp =
   245   let
   231   let
   246     fun token kind cs = (kind, implode cs);
   232     fun token kind ss = (kind, #1 (SymbolPos.implode ss));
   247 
   233 
   248     val scan_xid =
   234     val scan_xid =
   249       if xids then $$ "_" ::: scan_id || scan_id
   235       if xids then $$$ "_" @@@ scan_id || scan_id
   250       else scan_id;
   236       else scan_id;
   251 
   237 
   252     val scan_num = scan_hex || scan_bin || scan_int;
   238     val scan_num = scan_hex || scan_bin || scan_int;
   253 
   239 
   254     val scan_val =
   240     val scan_val =
   255       scan_tvar >> token TVarSy ||
   241       scan_tvar >> token TVarSy ||
   256       scan_var >> token VarSy ||
   242       scan_var >> token VarSy ||
   257       scan_tid >> token TFreeSy ||
   243       scan_tid >> token TFreeSy ||
   258       scan_num >> token NumSy ||
   244       scan_num >> token NumSy ||
   259       $$ "#" ::: scan_num >> token XNumSy ||
   245       $$$ "#" @@@ scan_num >> token XNumSy ||
   260       scan_longid >> token LongIdentSy ||
   246       scan_longid >> token LongIdentSy ||
   261       scan_xid >> token IdentSy;
   247       scan_xid >> token IdentSy;
   262 
   248 
   263     val scan_lit = Scan.literal lex >> token Token;
   249     val scan_lit = Scan.literal lex >> token Token;
   264 
   250 
   265     val scan_token =
   251     val scan_token =
   266       scan_comment >> K NONE ||
   252       SymbolPos.scan_comment !!! >> K NONE ||
   267       Scan.max (op <= o pairself snd) scan_lit scan_val >> (fn (tk, s) => SOME (tk s)) ||
   253       Scan.max (op <= o pairself snd) scan_lit scan_val >> (fn (tk, s) => SOME (tk s)) ||
   268       scan_str >> (SOME o StrSy o implode_xstr) ||
   254       scan_str >> (SOME o StrSy o implode_xstr o map symbol) ||
   269       Scan.one Symbol.is_blank >> K NONE;
   255       Scan.one (Symbol.is_blank o symbol) >> K NONE;
   270   in
   256   in
   271     (case Scan.error (Scan.finite Symbol.stopper (Scan.repeat scan_token)) chs of
   257     (case Scan.error
       
   258         (Scan.finite SymbolPos.stopper (Scan.repeat scan_token)) inp of
   272       (toks, []) => map_filter I toks
   259       (toks, []) => map_filter I toks
   273     | (_, cs) => error ("Inner lexical error at: " ^ quote (implode cs)))
   260     | (_, cs) =>
       
   261         let val (s, (pos, _)) = SymbolPos.implode cs
       
   262         in error ("Inner lexical error at: " ^ quote s ^ Position.str_of pos) end)
   274   end;
   263   end;
   275 
   264 
   276 
   265 
   277 
   266 
   278 (** scan variables **)
   267 (** scan variables **)
   279 
   268 
   280 (* scan_indexname *)
   269 (* scan_indexname *)
   281 
   270 
   282 local
   271 local
   283 
   272 
   284 fun scan_vname chrs =
   273 val scan_vname =
   285   let
   274   let
   286     fun nat n [] = n
   275     fun nat n [] = n
   287       | nat n (c :: cs) = nat (n * 10 + (ord c - ord "0")) cs;
   276       | nat n (c :: cs) = nat (n * 10 + (ord c - ord "0")) cs;
   288 
   277 
   289     fun idxname cs ds = (implode (rev cs), nat 0 ds);
   278     fun idxname cs ds = (implode (rev cs), nat 0 ds);
   292       | chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds
   281       | chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds
   293       | chop_idx (c :: cs) ds =
   282       | chop_idx (c :: cs) ds =
   294           if Symbol.is_digit c then chop_idx cs (c :: ds)
   283           if Symbol.is_digit c then chop_idx cs (c :: ds)
   295           else idxname (c :: cs) ds;
   284           else idxname (c :: cs) ds;
   296 
   285 
   297     val scan = scan_id -- Scan.optional ($$ "." |-- scan_nat >> nat 0) ~1;
   286     val scan = (scan_id >> map symbol) --
       
   287       Scan.optional ($$$ "." |-- scan_nat >> (nat 0 o map symbol)) ~1;
   298   in
   288   in
   299     (case scan chrs of
   289     scan >>
   300       ((cs, ~1), cs') => (chop_idx (rev cs) [], cs')
   290       (fn (cs, ~1) => chop_idx (rev cs) []
   301     | ((cs, i), cs') => ((implode cs, i), cs'))
   291         | (cs, i) => (implode cs, i))
   302   end;
   292   end;
   303 
   293 
   304 in
   294 in
   305 
   295 
   306 val scan_indexname =
   296 val scan_indexname = $$$ "'" |-- scan_vname >> (fn (x, i) => ("'" ^ x, i)) || scan_vname;
   307      $$ "'" |-- scan_vname >> (fn (x, i) => ("'" ^ x, i))
       
   308   || scan_vname;
       
   309 
   297 
   310 end;
   298 end;
   311 
   299 
   312 
   300 
   313 (* indexname *)
   301 (* indexname *)
   314 
   302 
   315 fun read_indexname s =
   303 fun read_indexname s =
   316   (case Scan.read Symbol.stopper scan_indexname (Symbol.explode s) of
   304   (case Scan.read SymbolPos.stopper scan_indexname (SymbolPos.explode (s, Position.none)) of
   317     SOME xi => xi
   305     SOME xi => xi
   318   | _ => error ("Lexical error in variable name: " ^ quote s));
   306   | _ => error ("Lexical error in variable name: " ^ quote s));
   319 
   307 
   320 
   308 
   321 (* read_var *)
   309 (* read_var *)
   325 fun var xi = Var (xi, dummyT);
   313 fun var xi = Var (xi, dummyT);
   326 
   314 
   327 fun read_var str =
   315 fun read_var str =
   328   let
   316   let
   329     val scan =
   317     val scan =
   330       $$ "?" |-- scan_indexname --| Scan.ahead (Scan.one Symbol.is_eof) >> var ||
   318       $$$ "?" |-- scan_indexname --| Scan.ahead (Scan.one SymbolPos.is_eof) >> var ||
   331       Scan.many Symbol.is_regular >> (free o implode);
   319       Scan.many (Symbol.is_regular o symbol) >> (free o implode o map symbol);
   332   in the (Scan.read Symbol.stopper scan (Symbol.explode str)) end;
   320   in the (Scan.read SymbolPos.stopper scan (SymbolPos.explode (str, Position.none))) end;
   333 
   321 
   334 
   322 
   335 (* read_variable *)
   323 (* read_variable *)
   336 
   324 
   337 fun read_variable str =
   325 fun read_variable str =
   338   let val scan = $$ "?" |-- scan_indexname || scan_indexname
   326   let val scan = $$$ "?" |-- scan_indexname || scan_indexname
   339   in Scan.read Symbol.stopper scan (Symbol.explode str) end;
   327   in Scan.read SymbolPos.stopper scan (SymbolPos.explode (str, Position.none)) end;
   340 
   328 
   341 
   329 
   342 (* specific identifiers *)
   330 (* specific identifiers *)
   343 
   331 
   344 val constN = "\\<^const>";
   332 val constN = "\\<^const>";
   348 (* read numbers *)
   336 (* read numbers *)
   349 
   337 
   350 local
   338 local
   351 
   339 
   352 fun nat cs =
   340 fun nat cs =
   353   Option.map (#1 o Library.read_int)
   341   Option.map (#1 o Library.read_int o map symbol)
   354     (Scan.read Symbol.stopper scan_nat cs);
   342     (Scan.read SymbolPos.stopper scan_nat cs);
   355 
   343 
   356 in
   344 in
   357 
   345 
   358 val read_nat = nat o Symbol.explode;
   346 fun read_nat s = nat (SymbolPos.explode (s, Position.none));
   359 
   347 
   360 fun read_int s =
   348 fun read_int s =
   361   (case Symbol.explode s of
   349   (case SymbolPos.explode (s, Position.none) of
   362     "-" :: cs => Option.map ~ (nat cs)
   350     ("-", _) :: cs => Option.map ~ (nat cs)
   363   | cs => nat cs);
   351   | cs => nat cs);
   364 
   352 
   365 end;
   353 end;
   366 
   354 
   367 
   355