src/Pure/Isar/token.ML
author wenzelm
Thu Aug 23 17:46:03 2012 +0200 (2012-08-23)
changeset 48911 5debc3e4fa81
parent 48905 04576657cf94
child 48992 0518bf89c777
permissions -rw-r--r--
tuned messages: end-of-input rarely means physical end-of-file from the past;
     1 (*  Title:      Pure/Isar/token.ML
     2     Author:     Markus Wenzel, TU Muenchen
     3 
     4 Outer token syntax for Isabelle/Isar.
     5 *)
     6 
     7 signature TOKEN =
     8 sig
     9   datatype kind =
    10     Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    11     Nat | Float | String | AltString | Verbatim | Space | Comment | InternalValue |
    12     Error of string | Sync | EOF
    13   type file = {src_path: Path.T, text: string, pos: Position.T}
    14   datatype value =
    15     Text of string | Typ of typ | Term of term | Fact of thm list |
    16     Attribute of morphism -> attribute | Files of file list
    17   type T
    18   val str_of_kind: kind -> string
    19   val position_of: T -> Position.T
    20   val end_position_of: T -> Position.T
    21   val pos_of: T -> string
    22   val eof: T
    23   val is_eof: T -> bool
    24   val not_eof: T -> bool
    25   val not_sync: T -> bool
    26   val stopper: T Scan.stopper
    27   val kind_of: T -> kind
    28   val is_kind: kind -> T -> bool
    29   val keyword_with: (string -> bool) -> T -> bool
    30   val ident_with: (string -> bool) -> T -> bool
    31   val is_command: T -> bool
    32   val is_name: T -> bool
    33   val is_proper: T -> bool
    34   val is_semicolon: T -> bool
    35   val is_comment: T -> bool
    36   val is_begin_ignore: T -> bool
    37   val is_end_ignore: T -> bool
    38   val is_error: T -> bool
    39   val is_space: T -> bool
    40   val is_blank: T -> bool
    41   val is_newline: T -> bool
    42   val source_of: T -> string
    43   val source_position_of: T -> Symbol_Pos.text * Position.T
    44   val content_of: T -> string
    45   val unparse: T -> string
    46   val text_of: T -> string * string
    47   val get_files: T -> file list option
    48   val put_files: file list -> T -> T
    49   val get_value: T -> value option
    50   val map_value: (value -> value) -> T -> T
    51   val mk_text: string -> T
    52   val mk_typ: typ -> T
    53   val mk_term: term -> T
    54   val mk_fact: thm list -> T
    55   val mk_attribute: (morphism -> attribute) -> T
    56   val assignable: T -> T
    57   val assign: value option -> T -> unit
    58   val closure: T -> T
    59   val ident_or_symbolic: string -> bool
    60   val source_proper: (T, 'a) Source.source -> (T, (T, 'a) Source.source) Source.source
    61   val source': {do_recover: bool option} -> (unit -> Scan.lexicon * Scan.lexicon) ->
    62     (Symbol_Pos.T, 'a) Source.source -> (T, (Symbol_Pos.T, 'a) Source.source) Source.source
    63   val source: {do_recover: bool option} -> (unit -> Scan.lexicon * Scan.lexicon) ->
    64     Position.T -> (Symbol.symbol, 'a) Source.source -> (T,
    65       (Symbol_Pos.T, Position.T * (Symbol.symbol, 'a) Source.source) Source.source) Source.source
    66   val read_antiq: Scan.lexicon -> (T list -> 'a * T list) -> Symbol_Pos.T list * Position.T -> 'a
    67 end;
    68 
    69 structure Token: TOKEN =
    70 struct
    71 
    72 (** tokens **)
    73 
    74 (* token values *)
    75 
    76 (*The value slot assigns an (optional) internal value to a token,
    77   usually as a side-effect of special scanner setup (see also
    78   args.ML).  Note that an assignable ref designates an intermediate
    79   state of internalization -- it is NOT meant to persist.*)
    80 
    81 type file = {src_path: Path.T, text: string, pos: Position.T};
    82 
    83 datatype value =
    84   Text of string |
    85   Typ of typ |
    86   Term of term |
    87   Fact of thm list |
    88   Attribute of morphism -> attribute |
    89   Files of file list;
    90 
    91 datatype slot =
    92   Slot |
    93   Value of value option |
    94   Assignable of value option Unsynchronized.ref;
    95 
    96 
    97 (* datatype token *)
    98 
    99 datatype kind =
   100   Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
   101   Nat | Float | String | AltString | Verbatim | Space | Comment | InternalValue |
   102   Error of string | Sync | EOF;
   103 
   104 datatype T = Token of (Symbol_Pos.text * Position.range) * (kind * string) * slot;
   105 
   106 val str_of_kind =
   107  fn Command => "command"
   108   | Keyword => "keyword"
   109   | Ident => "identifier"
   110   | LongIdent => "long identifier"
   111   | SymIdent => "symbolic identifier"
   112   | Var => "schematic variable"
   113   | TypeIdent => "type variable"
   114   | TypeVar => "schematic type variable"
   115   | Nat => "natural number"
   116   | Float => "floating-point number"
   117   | String => "string"
   118   | AltString => "back-quoted string"
   119   | Verbatim => "verbatim text"
   120   | Space => "white space"
   121   | Comment => "comment text"
   122   | InternalValue => "internal value"
   123   | Error _ => "bad input"
   124   | Sync => "sync marker"
   125   | EOF => "end-of-input";
   126 
   127 
   128 (* position *)
   129 
   130 fun position_of (Token ((_, (pos, _)), _, _)) = pos;
   131 fun end_position_of (Token ((_, (_, pos)), _, _)) = pos;
   132 
   133 val pos_of = Position.str_of o position_of;
   134 
   135 
   136 (* control tokens *)
   137 
   138 fun mk_eof pos = Token (("", (pos, Position.none)), (EOF, ""), Slot);
   139 val eof = mk_eof Position.none;
   140 
   141 fun is_eof (Token (_, (EOF, _), _)) = true
   142   | is_eof _ = false;
   143 
   144 val not_eof = not o is_eof;
   145 
   146 fun not_sync (Token (_, (Sync, _), _)) = false
   147   | not_sync _ = true;
   148 
   149 val stopper =
   150   Scan.stopper (fn [] => eof | toks => mk_eof (end_position_of (List.last toks))) is_eof;
   151 
   152 
   153 (* kind of token *)
   154 
   155 fun kind_of (Token (_, (k, _), _)) = k;
   156 fun is_kind k (Token (_, (k', _), _)) = k = k';
   157 
   158 val is_command = is_kind Command;
   159 val is_name = is_kind Ident orf is_kind SymIdent orf is_kind String orf is_kind Nat;
   160 
   161 fun keyword_with pred (Token (_, (Keyword, x), _)) = pred x
   162   | keyword_with _ _ = false;
   163 
   164 fun ident_with pred (Token (_, (Ident, x), _)) = pred x
   165   | ident_with _ _ = false;
   166 
   167 fun is_proper (Token (_, (Space, _), _)) = false
   168   | is_proper (Token (_, (Comment, _), _)) = false
   169   | is_proper _ = true;
   170 
   171 fun is_semicolon (Token (_, (Keyword, ";"), _)) = true
   172   | is_semicolon _ = false;
   173 
   174 fun is_comment (Token (_, (Comment, _), _)) = true
   175   | is_comment _ = false;
   176 
   177 fun is_begin_ignore (Token (_, (Comment, "<"), _)) = true
   178   | is_begin_ignore _ = false;
   179 
   180 fun is_end_ignore (Token (_, (Comment, ">"), _)) = true
   181   | is_end_ignore _ = false;
   182 
   183 fun is_error (Token (_, (Error _, _), _)) = true
   184   | is_error _ = false;
   185 
   186 
   187 (* blanks and newlines -- space tokens obey lines *)
   188 
   189 fun is_space (Token (_, (Space, _), _)) = true
   190   | is_space _ = false;
   191 
   192 fun is_blank (Token (_, (Space, x), _)) = not (String.isSuffix "\n" x)
   193   | is_blank _ = false;
   194 
   195 fun is_newline (Token (_, (Space, x), _)) = String.isSuffix "\n" x
   196   | is_newline _ = false;
   197 
   198 
   199 (* token content *)
   200 
   201 fun source_of (Token ((source, (pos, _)), (_, x), _)) =
   202   if YXML.detect x then x
   203   else
   204     YXML.string_of
   205       (XML.Elem (Isabelle_Markup.token (Position.properties_of pos), [XML.Text source]));
   206 
   207 fun source_position_of (Token ((source, (pos, _)), _, _)) = (source, pos);
   208 
   209 fun content_of (Token (_, (_, x), _)) = x;
   210 
   211 
   212 (* unparse *)
   213 
   214 fun unparse (Token (_, (kind, x), _)) =
   215   (case kind of
   216     String => Symbol_Pos.quote_string_qq x
   217   | AltString => Symbol_Pos.quote_string_bq x
   218   | Verbatim => enclose "{*" "*}" x
   219   | Comment => enclose "(*" "*)" x
   220   | Sync => ""
   221   | EOF => ""
   222   | _ => x);
   223 
   224 fun text_of tok =
   225   if is_semicolon tok then ("terminator", "")
   226   else
   227     let
   228       val k = str_of_kind (kind_of tok);
   229       val s = unparse tok;
   230     in
   231       if s = "" then (k, "")
   232       else if size s < 40 andalso not (exists_string (fn c => c = "\n") s) then (k ^ " " ^ s, "")
   233       else (k, s)
   234     end;
   235 
   236 
   237 
   238 (** associated values **)
   239 
   240 (* inlined file content *)
   241 
   242 fun get_files (Token (_, _, Value (SOME (Files files)))) = SOME files
   243   | get_files _ = NONE;
   244 
   245 fun put_files files (Token (x, y, Slot)) = Token (x, y, Value (SOME (Files files)))
   246   | put_files _ tok =
   247       raise Fail ("Cannot put inlined files here" ^ Position.str_of (position_of tok));
   248 
   249 
   250 (* access values *)
   251 
   252 fun get_value (Token (_, _, Value v)) = v
   253   | get_value _ = NONE;
   254 
   255 fun map_value f (Token (x, y, Value (SOME v))) = Token (x, y, Value (SOME (f v)))
   256   | map_value _ tok = tok;
   257 
   258 
   259 (* make values *)
   260 
   261 fun mk_value k v = Token ((k, Position.no_range), (InternalValue, k), Value (SOME v));
   262 
   263 val mk_text = mk_value "<text>" o Text;
   264 val mk_typ = mk_value "<typ>" o Typ;
   265 val mk_term = mk_value "<term>" o Term;
   266 val mk_fact = mk_value "<fact>" o Fact;
   267 val mk_attribute = mk_value "<attribute>" o Attribute;
   268 
   269 
   270 (* static binding *)
   271 
   272 (*1st stage: make empty slots assignable*)
   273 fun assignable (Token (x, y, Slot)) = Token (x, y, Assignable (Unsynchronized.ref NONE))
   274   | assignable tok = tok;
   275 
   276 (*2nd stage: assign values as side-effect of scanning*)
   277 fun assign v (Token (_, _, Assignable r)) = r := v
   278   | assign _ _ = ();
   279 
   280 (*3rd stage: static closure of final values*)
   281 fun closure (Token (x, y, Assignable (Unsynchronized.ref v))) = Token (x, y, Value v)
   282   | closure tok = tok;
   283 
   284 
   285 
   286 (** scanners **)
   287 
   288 open Basic_Symbol_Pos;
   289 
   290 val err_prefix = "Outer lexical error: ";
   291 
   292 fun !!! msg = Symbol_Pos.!!! (fn () => err_prefix ^ msg);
   293 
   294 
   295 (* scan symbolic idents *)
   296 
   297 val is_sym_char = member (op =) (raw_explode "!#$%&*+-/<=>?@^_|~");
   298 
   299 val scan_symid =
   300   Scan.many1 (is_sym_char o Symbol_Pos.symbol) ||
   301   Scan.one (Symbol.is_symbolic o Symbol_Pos.symbol) >> single;
   302 
   303 fun is_symid str =
   304   (case try Symbol.explode str of
   305     SOME [s] => Symbol.is_symbolic s orelse is_sym_char s
   306   | SOME ss => forall is_sym_char ss
   307   | _ => false);
   308 
   309 fun ident_or_symbolic "begin" = false
   310   | ident_or_symbolic ":" = true
   311   | ident_or_symbolic "::" = true
   312   | ident_or_symbolic s = Lexicon.is_identifier s orelse is_symid s;
   313 
   314 
   315 (* scan verbatim text *)
   316 
   317 val scan_verb =
   318   $$$ "*" --| Scan.ahead (~$$$ "}") ||
   319   Scan.one (fn (s, _) => s <> "*" andalso Symbol.is_regular s) >> single;
   320 
   321 val scan_verbatim =
   322   (Symbol_Pos.scan_pos --| $$$ "{" --| $$$ "*") -- !!! "missing end of verbatim text"
   323     (Symbol_Pos.change_prompt
   324       ((Scan.repeat scan_verb >> flat) -- ($$$ "*" |-- $$$ "}" |-- Symbol_Pos.scan_pos)));
   325 
   326 val recover_verbatim =
   327   $$$ "{" @@@ $$$ "*" @@@ (Scan.repeat scan_verb >> flat);
   328 
   329 
   330 (* scan space *)
   331 
   332 fun space_symbol (s, _) = Symbol.is_blank s andalso s <> "\n";
   333 
   334 val scan_space =
   335   Scan.many1 space_symbol @@@ Scan.optional ($$$ "\n") [] ||
   336   Scan.many space_symbol @@@ $$$ "\n";
   337 
   338 
   339 (* scan comment *)
   340 
   341 val scan_comment =
   342   Symbol_Pos.scan_pos -- (Symbol_Pos.scan_comment_body !!! -- Symbol_Pos.scan_pos);
   343 
   344 
   345 
   346 (** token sources **)
   347 
   348 fun source_proper src = src |> Source.filter is_proper;
   349 
   350 local
   351 
   352 fun token_leq ((_, syms1), (_, syms2)) = length syms1 <= length syms2;
   353 
   354 fun token k ss =
   355   Token ((Symbol_Pos.implode ss, Symbol_Pos.range ss), (k, Symbol_Pos.content ss), Slot);
   356 
   357 fun token_range k (pos1, (ss, pos2)) =
   358   Token (Symbol_Pos.implode_range pos1 pos2 ss, (k, Symbol_Pos.content ss), Slot);
   359 
   360 fun scan (lex1, lex2) = !!! "bad input"
   361   (Symbol_Pos.scan_string_qq err_prefix >> token_range String ||
   362     Symbol_Pos.scan_string_bq err_prefix >> token_range AltString ||
   363     scan_verbatim >> token_range Verbatim ||
   364     scan_comment >> token_range Comment ||
   365     scan_space >> token Space ||
   366     Scan.one (Symbol.is_sync o Symbol_Pos.symbol) >> (token Sync o single) ||
   367     (Scan.max token_leq
   368       (Scan.max token_leq
   369         (Scan.literal lex2 >> pair Command)
   370         (Scan.literal lex1 >> pair Keyword))
   371       (Lexicon.scan_longid >> pair LongIdent ||
   372         Lexicon.scan_id >> pair Ident ||
   373         Lexicon.scan_var >> pair Var ||
   374         Lexicon.scan_tid >> pair TypeIdent ||
   375         Lexicon.scan_tvar >> pair TypeVar ||
   376         Lexicon.scan_float >> pair Float ||
   377         Lexicon.scan_nat >> pair Nat ||
   378         scan_symid >> pair SymIdent) >> uncurry token));
   379 
   380 fun recover msg =
   381   (Symbol_Pos.recover_string_qq ||
   382     Symbol_Pos.recover_string_bq ||
   383     recover_verbatim ||
   384     Symbol_Pos.recover_comment ||
   385     Scan.one (Symbol.is_regular o Symbol_Pos.symbol) >> single)
   386   >> (single o token (Error msg));
   387 
   388 in
   389 
   390 fun source' {do_recover} get_lex =
   391   Source.source Symbol_Pos.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
   392     (Option.map (rpair recover) do_recover);
   393 
   394 fun source do_recover get_lex pos src =
   395   Symbol_Pos.source pos src
   396   |> source' do_recover get_lex;
   397 
   398 end;
   399 
   400 
   401 (* read_antiq *)
   402 
   403 fun read_antiq lex scan (syms, pos) =
   404   let
   405     fun err msg = cat_error msg ("Malformed antiquotation" ^ Position.str_of pos ^ ":\n" ^
   406       "@{" ^ Symbol_Pos.content syms ^ "}");
   407 
   408     val res =
   409       Source.of_list syms
   410       |> source' {do_recover = NONE} (K (lex, Scan.empty_lexicon))
   411       |> source_proper
   412       |> Source.source stopper (Scan.error (Scan.bulk scan)) NONE
   413       |> Source.exhaust;
   414   in (case res of [x] => x | _ => err "") handle ERROR msg => err msg end;
   415 
   416 end;