src/Pure/Isar/outer_lex.ML
author wenzelm
Fri Apr 07 17:36:25 2000 +0200 (2000-04-07 ago)
changeset 8681 957a5fe9b212
parent 8663 38d7ec8ef683
child 8807 0046be1769f9
permissions -rw-r--r--
apply etc.: comments;
     1 (*  Title:      Pure/Isar/outer_lex.ML
     2     ID:         $Id$
     3     Author:     Markus Wenzel, TU Muenchen
     4 
     5 Outer lexical syntax for Isabelle/Isar.
     6 *)
     7 
     8 signature OUTER_LEX =
     9 sig
    10   datatype token_kind =
    11     Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    12     Nat | String | Verbatim | Space | Comment | Sync | EOF
    13   type token
    14   val str_of_kind: token_kind -> string
    15   val stopper: token * (token -> bool)
    16   val not_sync: token -> bool
    17   val not_eof: token -> bool
    18   val position_of: token -> Position.T
    19   val pos_of: token -> string
    20   val is_kind: token_kind -> token -> bool
    21   val keyword_with: (string -> bool) -> token -> bool
    22   val name_of: token -> string
    23   val is_proper: token -> bool
    24   val is_begin_ignore: token -> bool
    25   val is_end_ignore: token -> bool
    26   val is_newline: token -> bool
    27   val is_indent: token -> bool
    28   val val_of: token -> string
    29   val is_sid: string -> bool
    30   val scan: (Scan.lexicon * Scan.lexicon) ->
    31     Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)
    32   val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) ->
    33     Position.T -> (Symbol.symbol, 'a) Source.source ->
    34     (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source
    35 end;
    36 
    37 structure OuterLex: OUTER_LEX =
    38 struct
    39 
    40 
    41 (** tokens **)
    42 
    43 (* datatype token *)
    44 
    45 datatype token_kind =
    46   Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    47   Nat | String | Verbatim | Space | Comment | Sync | EOF;
    48 
    49 datatype token = Token of Position.T * (token_kind * string);
    50 
    51 val str_of_kind =
    52  fn Command => "command"
    53   | Keyword => "keyword"
    54   | Ident => "identifier"
    55   | LongIdent => "long identifier"
    56   | SymIdent => "symbolic identifier"
    57   | Var => "schematic variable"
    58   | TypeIdent => "type variable"
    59   | TypeVar => "schematic type variable"
    60   | Nat => "number"
    61   | String => "string"
    62   | Verbatim => "verbatim text"
    63   | Space => "white space"
    64   | Comment => "comment text"
    65   | Sync => "sync marker"
    66   | EOF => "end-of-file";
    67 
    68 
    69 (* sync token *)
    70 
    71 fun not_sync (Token (_, (Sync, _))) = false
    72   | not_sync _ = true;
    73 
    74 
    75 (* eof token *)
    76 
    77 val eof = Token (Position.none, (EOF, ""));
    78 
    79 fun is_eof (Token (_, (EOF, _))) = true
    80   | is_eof _ = false;
    81 
    82 val stopper = (eof, is_eof);
    83 val not_eof = not o is_eof;
    84 
    85 
    86 (* get position *)
    87 
    88 fun position_of (Token (pos, _)) = pos;
    89 val pos_of = Position.str_of o position_of;
    90 
    91 
    92 (* kind of token *)
    93 
    94 fun is_kind k (Token (_, (k', _))) = k = k';
    95 
    96 fun keyword_with pred (Token (_, (Keyword, x))) = pred x
    97   | keyword_with _ _ = false;
    98 
    99 fun name_of (Token (_, (k, _))) = str_of_kind k;
   100 
   101 fun is_proper (Token (_, (Space, _))) = false
   102   | is_proper (Token (_, (Comment, _))) = false
   103   | is_proper _ = true;
   104 
   105 fun is_begin_ignore (Token (_, (Comment, "<"))) = true
   106   | is_begin_ignore _ = false;
   107 
   108 fun is_end_ignore (Token (_, (Comment, ">"))) = true
   109   | is_end_ignore _ = false;
   110 
   111 
   112 (* newline and indentations (note that space tokens obey lines) *)
   113 
   114 fun is_newline (Token (_, (Space, "\n"))) = true
   115   | is_newline _ = false;
   116 
   117 fun is_indent (Token (_, (Space, s))) =
   118       let val n = size s in n > 0 andalso String.substring (s, n - 1, 1) <> "\n" end
   119   | is_indent _ = false;
   120 
   121 
   122 (* value of token *)
   123 
   124 fun val_of (Token (_, (_, x))) = x;
   125 
   126 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
   127 
   128 
   129 
   130 (** scanners **)
   131 
   132 fun change_prompt scan = Scan.prompt "# " scan;
   133 
   134 
   135 (* diagnostics *)
   136 
   137 fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs;
   138 
   139 
   140 (* line numbering *)
   141 
   142 fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos));
   143 val keep_line = Scan.lift;
   144 
   145 val scan_blank =
   146   incr_line ($$ "\n") ||
   147   keep_line (Scan.one Symbol.is_blank);
   148 
   149 
   150 (* scan symbolic idents *)
   151 
   152 val sym_chars = explode "!#$%&*+-/:<=>?@^_`|~";
   153 fun is_sym_char s = s mem sym_chars;
   154 
   155 val scan_symid =
   156   Scan.any1 is_sym_char >> implode ||
   157   Scan.one Symbol.is_symbolic;
   158 
   159 fun is_symid str =
   160   (case try Symbol.explode str of
   161     Some [s] => Symbol.is_symbolic s orelse is_sym_char s
   162   | Some ss => forall is_sym_char ss
   163   | _ => false);
   164 
   165 val is_sid = is_symid orf Syntax.is_identifier;
   166 
   167 
   168 (* scan strings *)
   169 
   170 val scan_str =
   171   scan_blank ||
   172   keep_line ($$ "\\" |-- Scan.one (Symbol.not_sync andf Symbol.not_eof)) ||
   173   keep_line (Scan.one (not_equal "\\" andf not_equal "\"" andf
   174     Symbol.not_sync andf Symbol.not_eof));
   175 
   176 val scan_string =
   177   keep_line ($$ "\"") |--
   178     !! (lex_err (K "missing quote at end of string"))
   179       (change_prompt ((Scan.repeat scan_str >> implode) --| keep_line ($$ "\"")));
   180 
   181 
   182 (* scan verbatim text *)
   183 
   184 val scan_verb =
   185   scan_blank ||
   186   keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal "}"))) ||
   187   keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof));
   188 
   189 val scan_verbatim =
   190   keep_line ($$ "{" -- $$ "*") |--
   191     !! (lex_err (K "missing end of verbatim text"))
   192       (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}")));
   193 
   194 
   195 (* scan space *)
   196 
   197 val is_space = Symbol.is_blank andf not_equal "\n";
   198 
   199 val scan_space =
   200   (keep_line (Scan.any1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" ||
   201     keep_line (Scan.any is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c);
   202 
   203 
   204 (* scan nested comments *)
   205 
   206 val scan_cmt =
   207   Scan.lift scan_blank ||
   208   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   209   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   210   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
   211   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof)));
   212 
   213 val scan_comment =
   214   keep_line ($$ "(" -- $$ "*") |--
   215     !! (lex_err (K "missing end of comment"))
   216       (change_prompt
   217         (Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")")));
   218 
   219 
   220 (* scan token *)
   221 
   222 fun scan (lex1, lex2) (pos, cs) =
   223   let
   224     fun token k x = Token (pos, (k, x));
   225     fun sync _ = token Sync Symbol.sync;
   226 
   227     val scanner =
   228       scan_string >> token String ||
   229       scan_verbatim >> token Verbatim ||
   230       scan_space >> token Space ||
   231       scan_comment >> token Comment ||
   232       keep_line (Scan.one Symbol.is_sync >> sync) ||
   233       keep_line (Scan.max token_leq
   234         (Scan.max token_leq
   235           (Scan.literal lex1 >> (token Keyword o implode))
   236           (Scan.literal lex2 >> (token Command o implode)))
   237         (Syntax.scan_longid >> token LongIdent ||
   238           Syntax.scan_id >> token Ident ||
   239           Syntax.scan_var >> token Var ||
   240           Syntax.scan_tid >> token TypeIdent ||
   241           Syntax.scan_tvar >> token TypeVar ||
   242           Syntax.scan_nat >> token Nat ||
   243           scan_symid >> token SymIdent));
   244   in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning cs))) scanner (pos, cs) end;
   245 
   246 
   247 (* source of (proper) tokens *)
   248 
   249 val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof;
   250 fun recover xs = keep_line (Scan.any is_junk) xs;
   251 
   252 fun source do_recover get_lex pos src =
   253   Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
   254     (if do_recover then Some recover else None) src;
   255 
   256 
   257 end;