src/Pure/Isar/outer_lex.ML
author wenzelm
Sat Jul 28 20:40:27 2007 +0200 (2007-07-28)
changeset 24022 ab76c73b3b58
parent 23788 54ce229dc858
child 24577 c6acb6d79757
permissions -rw-r--r--
tuned;
     1 (*  Title:      Pure/Isar/outer_lex.ML
     2     ID:         $Id$
     3     Author:     Markus Wenzel, TU Muenchen
     4 
     5 Outer lexical syntax for Isabelle/Isar.
     6 *)
     7 
     8 signature OUTER_LEX =
     9 sig
    10   datatype token_kind =
    11     Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | Nat |
    12     String | AltString | Verbatim | Space | Comment | Malformed | Error of string | Sync | EOF
    13   eqtype token
    14   val str_of_kind: token_kind -> string
    15   val stopper: token * (token -> bool)
    16   val not_sync: token -> bool
    17   val not_eof: token -> bool
    18   val position_of: token -> Position.T
    19   val pos_of: token -> string
    20   val kind_of: token -> token_kind
    21   val is_kind: token_kind -> token -> bool
    22   val keyword_with: (string -> bool) -> token -> bool
    23   val ident_with: (string -> bool) -> token -> bool
    24   val is_proper: token -> bool
    25   val is_semicolon: token -> bool
    26   val is_comment: token -> bool
    27   val is_begin_ignore: token -> bool
    28   val is_end_ignore: token -> bool
    29   val is_blank: token -> bool
    30   val is_newline: token -> bool
    31   val unparse: token -> string
    32   val text_of: token -> string * string
    33   val val_of: token -> string
    34   val is_sid: string -> bool
    35   val !!! : string -> (Position.T * 'a -> 'b) -> Position.T * 'a -> 'b
    36   val incr_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c)
    37   val keep_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c)
    38   val scan_blank: Position.T * Symbol.symbol list
    39     -> Symbol.symbol * (Position.T * Symbol.symbol list)
    40   val scan_string: Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list)
    41   val scan: (Scan.lexicon * Scan.lexicon) ->
    42     Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)
    43   val source: bool option -> (unit -> Scan.lexicon * Scan.lexicon) ->
    44     Position.T -> (Symbol.symbol, 'a) Source.source ->
    45     (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source
    46   val source_proper: (token, 'a) Source.source ->
    47     (token, (token, 'a) Source.source) Source.source
    48   val make_lexicon: string list -> Scan.lexicon
    49 end;
    50 
    51 structure OuterLex: OUTER_LEX =
    52 struct
    53 
    54 
    55 (** tokens **)
    56 
    57 (* datatype token *)
    58 
    59 datatype token_kind =
    60   Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar | Nat |
    61   String | AltString | Verbatim | Space | Comment | Malformed | Error of string | Sync | EOF;
    62 
    63 datatype token = Token of Position.T * (token_kind * string);
    64 
    65 val str_of_kind =
    66  fn Command => "command"
    67   | Keyword => "keyword"
    68   | Ident => "identifier"
    69   | LongIdent => "long identifier"
    70   | SymIdent => "symbolic identifier"
    71   | Var => "schematic variable"
    72   | TypeIdent => "type variable"
    73   | TypeVar => "schematic type variable"
    74   | Nat => "number"
    75   | String => "string"
    76   | AltString => "back-quoted string"
    77   | Verbatim => "verbatim text"
    78   | Space => "white space"
    79   | Comment => "comment text"
    80   | Malformed => "malformed symbolic character"
    81   | Error _ => "bad input"
    82   | Sync => "sync marker"
    83   | EOF => "end-of-file";
    84 
    85 
    86 (* control tokens *)
    87 
    88 val eof = Token (Position.none, (EOF, ""));
    89 
    90 fun is_eof (Token (_, (EOF, _))) = true
    91   | is_eof _ = false;
    92 
    93 val stopper = (eof, is_eof);
    94 val not_eof = not o is_eof;
    95 
    96 
    97 fun not_sync (Token (_, (Sync, _))) = false
    98   | not_sync _ = true;
    99 
   100 
   101 (* get position *)
   102 
   103 fun position_of (Token (pos, _)) = pos;
   104 val pos_of = Position.str_of o position_of;
   105 
   106 
   107 (* kind of token *)
   108 
   109 fun kind_of (Token (_, (k, _))) = k;
   110 
   111 fun is_kind k (Token (_, (k', _))) = k = k';
   112 
   113 fun keyword_with pred (Token (_, (Keyword, x))) = pred x
   114   | keyword_with _ _ = false;
   115 
   116 fun ident_with pred (Token (_, (Ident, x))) = pred x
   117   | ident_with _ _ = false;
   118 
   119 fun is_proper (Token (_, (Space, _))) = false
   120   | is_proper (Token (_, (Comment, _))) = false
   121   | is_proper _ = true;
   122 
   123 fun is_semicolon (Token (_, (Keyword, ";"))) = true
   124   | is_semicolon _ = false;
   125 
   126 fun is_comment (Token (_, (Comment, _))) = true
   127   | is_comment _ = false;
   128 
   129 fun is_begin_ignore (Token (_, (Comment, "<"))) = true
   130   | is_begin_ignore _ = false;
   131 
   132 fun is_end_ignore (Token (_, (Comment, ">"))) = true
   133   | is_end_ignore _ = false;
   134 
   135 
   136 (* blanks and newlines -- space tokens obey lines *)
   137 
   138 fun is_blank (Token (_, (Space, x))) = not (String.isSuffix "\n" x)
   139   | is_blank _ = false;
   140 
   141 fun is_newline (Token (_, (Space, x))) = String.isSuffix "\n" x
   142   | is_newline _ = false;
   143 
   144 
   145 (* token content *)
   146 
   147 fun escape q =
   148   implode o map (fn s => if s = q orelse s = "\\" then "\\" ^ s else s) o Symbol.explode;
   149 
   150 fun unparse (Token (_, (kind, x))) =
   151   (case kind of
   152     String => x |> quote o escape "\""
   153   | AltString => x |> enclose "`" "`" o escape "`"
   154   | Verbatim => x |> enclose "{*" "*}"
   155   | Comment => x |> enclose "(*" "*)"
   156   | Malformed => Output.escape_malformed x
   157   | Sync => ""
   158   | EOF => ""
   159   | _ => x);
   160 
   161 fun text_of tok =
   162   if is_semicolon tok then ("terminator", "")
   163   else
   164     let
   165       val k = str_of_kind (kind_of tok);
   166       val s = unparse tok;
   167     in
   168       if s = "" then (k, "")
   169       else if size s < 40 andalso not (exists_string (fn c => c = "\n") s) then (k ^ " " ^ s, "")
   170       else (k, s)
   171     end;
   172 
   173 fun val_of (Token (_, (_, x))) = x;
   174 
   175 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
   176 
   177 
   178 
   179 (** scanners **)
   180 
   181 fun change_prompt scan = Scan.prompt "# " scan;
   182 
   183 
   184 (* diagnostics *)
   185 
   186 fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs;
   187 fun !!! msg scan = Scan.!! (lex_err (K msg)) scan;
   188 
   189 
   190 (* line numbering *)
   191 
   192 fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos));
   193 val keep_line = Scan.lift;
   194 
   195 val scan_blank =
   196   incr_line ($$ "\n") ||
   197   keep_line (Scan.one Symbol.is_blank);
   198 
   199 
   200 (* scan symbolic idents *)
   201 
   202 val is_sym_char = member (op =) (explode "!#$%&*+-/<=>?@^_|~");
   203 
   204 val scan_symid =
   205   Scan.many1 is_sym_char >> implode ||
   206   Scan.one Symbol.is_symbolic;
   207 
   208 fun is_symid str =
   209   (case try Symbol.explode str of
   210     SOME [s] => Symbol.is_symbolic s orelse is_sym_char s
   211   | SOME ss => forall is_sym_char ss
   212   | _ => false);
   213 
   214 fun is_sid "begin" = false
   215   | is_sid ":" = true
   216   | is_sid "::" = true
   217   | is_sid s = is_symid s orelse Syntax.is_identifier s;
   218 
   219 
   220 (* scan strings *)
   221 
   222 local
   223 
   224 fun scan_str q =
   225   scan_blank ||
   226   keep_line ($$ "\\") |-- !!! "bad escape character in string"
   227       (scan_blank || keep_line ($$ q || $$ "\\")) ||
   228   keep_line (Scan.one (fn s => s <> q andalso s <> "\\" andalso Symbol.is_regular s));
   229 
   230 fun scan_strs q =
   231   keep_line ($$ q) |--
   232     !!! "missing quote at end of string"
   233       (change_prompt ((Scan.repeat (scan_str q) >> implode) --| keep_line ($$ q)));
   234 
   235 in
   236 
   237 val scan_string = scan_strs "\"";
   238 val scan_alt_string = scan_strs "`";
   239 
   240 end;
   241 
   242 
   243 (* scan verbatim text *)
   244 
   245 val scan_verb =
   246   scan_blank ||
   247   keep_line ($$ "*" --| Scan.ahead (~$$ "}")) ||
   248   keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s));
   249 
   250 val scan_verbatim =
   251   keep_line ($$ "{" -- $$ "*") |--
   252     !!! "missing end of verbatim text"
   253       (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}")));
   254 
   255 
   256 (* scan space *)
   257 
   258 fun is_space s = Symbol.is_blank s andalso s <> "\n";
   259 
   260 val scan_space =
   261   (keep_line (Scan.many1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" ||
   262     keep_line (Scan.many is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c);
   263 
   264 
   265 (* scan nested comments *)
   266 
   267 val scan_cmt =
   268   Scan.lift scan_blank ||
   269   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   270   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   271   Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
   272   Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
   273 
   274 val scan_comment =
   275   keep_line ($$ "(" -- $$ "*") |--
   276     !!! "missing end of comment"
   277       (change_prompt
   278         (Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")")));
   279 
   280 
   281 (* scan malformed symbols *)
   282 
   283 local
   284 
   285 val scan_mal =
   286   scan_blank ||
   287   keep_line (Scan.one Symbol.is_regular);
   288 
   289 in
   290 
   291 val scan_malformed =
   292   keep_line ($$ Symbol.malformed) |--
   293     change_prompt (Scan.repeat scan_mal >> implode)
   294       --| keep_line (Scan.option ($$ Symbol.end_malformed));
   295 
   296 end;
   297 
   298 
   299 (* scan token *)
   300 
   301 fun scan (lex1, lex2) =
   302   let
   303     val scanner = Scan.state :|-- (fn pos =>
   304       let
   305         fun token k x = Token (pos, (k, x));
   306         fun sync _ = token Sync Symbol.sync;
   307       in
   308         scan_string >> token String ||
   309         scan_alt_string >> token AltString ||
   310         scan_verbatim >> token Verbatim ||
   311         scan_space >> token Space ||
   312         scan_comment >> token Comment ||
   313         scan_malformed >> token Malformed ||
   314         keep_line (Scan.one Symbol.is_sync >> sync) ||
   315         keep_line (Scan.max token_leq
   316           (Scan.max token_leq
   317             (Scan.literal lex1 >> (token Keyword o implode))
   318             (Scan.literal lex2 >> (token Command o implode)))
   319           (Syntax.scan_longid >> token LongIdent ||
   320             Syntax.scan_id >> token Ident ||
   321             Syntax.scan_var >> token Var ||
   322             Syntax.scan_tid >> token TypeIdent ||
   323             Syntax.scan_tvar >> token TypeVar ||
   324             Syntax.scan_nat >> token Nat ||
   325             scan_symid >> token SymIdent))
   326       end);
   327   in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
   328 
   329 
   330 (* token sources *)
   331 
   332 local
   333 
   334 val is_junk = (not o Symbol.is_blank) andf Symbol.is_regular;
   335 
   336 fun recover msg = Scan.state :|-- (fn pos =>
   337   keep_line (Scan.many is_junk) >> (fn xs => [Token (pos, (Error msg, implode xs))]));
   338 
   339 in
   340 
   341 fun source do_recover get_lex pos src =
   342   Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
   343     (Option.map (rpair recover) do_recover) src;
   344 
   345 end;
   346 
   347 fun source_proper src = src |> Source.filter is_proper;
   348 
   349 
   350 (* lexicons *)
   351 
   352 val make_lexicon = Scan.make_lexicon o map Symbol.explode;
   353 
   354 end;