src/Pure/Isar/outer_lex.ML
author wenzelm
Sun Oct 03 15:51:38 1999 +0200 (1999-10-03 ago)
changeset 7682 46de8064c93c
parent 7477 c7caea1ce78c
child 7902 10fd5d922c97
permissions -rw-r--r--
added Space, Comment token kinds (keep actual text);
source: do not filter proper;
     1 (*  Title:      Pure/Isar/outer_lex.ML
     2     ID:         $Id$
     3     Author:     Markus Wenzel, TU Muenchen
     4 
     5 Outer lexical syntax for Isabelle/Isar.
     6 *)
     7 
     8 signature OUTER_LEX =
     9 sig
    10   datatype token_kind =
    11     Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    12     Nat | String | Verbatim | Space | Comment | Sync | EOF
    13   type token
    14   val str_of_kind: token_kind -> string
    15   val stopper: token * (token -> bool)
    16   val not_sync: token -> bool
    17   val not_eof: token -> bool
    18   val position_of: token -> Position.T
    19   val pos_of: token -> string
    20   val is_kind: token_kind -> token -> bool
    21   val keyword_with: (string -> bool) -> token -> bool
    22   val name_of: token -> string
    23   val is_proper: token -> bool
    24   val val_of: token -> string
    25   val is_sid: string -> bool
    26   val scan: (Scan.lexicon * Scan.lexicon) ->
    27     Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)
    28   val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) ->
    29     Position.T -> (Symbol.symbol, 'a) Source.source ->
    30     (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source
    31 end;
    32 
    33 structure OuterLex: OUTER_LEX =
    34 struct
    35 
    36 
    37 (** tokens **)
    38 
    39 (* datatype token *)
    40 
    41 datatype token_kind =
    42   Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    43   Nat | String | Verbatim | Space | Comment | Sync | EOF;
    44 
    45 datatype token = Token of Position.T * (token_kind * string);
    46 
    47 val str_of_kind =
    48  fn Command => "command"
    49   | Keyword => "keyword"
    50   | Ident => "identifier"
    51   | LongIdent => "long identifier"
    52   | SymIdent => "symbolic identifier"
    53   | Var => "schematic variable"
    54   | TypeIdent => "type variable"
    55   | TypeVar => "schematic type variable"
    56   | Nat => "number"
    57   | String => "string"
    58   | Verbatim => "verbatim text"
    59   | Space => "white space"
    60   | Comment => "comment text"
    61   | Sync => "sync marker"
    62   | EOF => "end-of-file";
    63 
    64 
    65 (* sync token *)
    66 
    67 fun not_sync (Token (_, (Sync, _))) = false
    68   | not_sync _ = true;
    69 
    70 
    71 (* eof token *)
    72 
    73 val eof = Token (Position.none, (EOF, ""));
    74 
    75 fun is_eof (Token (_, (EOF, _))) = true
    76   | is_eof _ = false;
    77 
    78 val stopper = (eof, is_eof);
    79 val not_eof = not o is_eof;
    80 
    81 
    82 (* get position *)
    83 
    84 fun position_of (Token (pos, _)) = pos;
    85 val pos_of = Position.str_of o position_of;
    86 
    87 
    88 (* kind of token *)
    89 
    90 fun is_kind k (Token (_, (k', _))) = k = k';
    91 
    92 fun keyword_with pred (Token (_, (Keyword, x))) = pred x
    93   | keyword_with _ _ = false;
    94 
    95 fun name_of (Token (_, (k, _))) = str_of_kind k;
    96 
    97 fun is_proper (Token (_, (Space, _))) = false
    98   | is_proper (Token (_, (Comment, _))) = false
    99   | is_proper _ = true;
   100 
   101 
   102 (* value of token *)
   103 
   104 fun val_of (Token (_, (_, x))) = x;
   105 
   106 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
   107 
   108 
   109 
   110 (** scanners **)
   111 
   112 fun change_prompt scan = Scan.prompt "# " scan;
   113 
   114 
   115 (* diagnostics *)
   116 
   117 fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs;
   118 
   119 
   120 (* line numbering *)
   121 
   122 fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos));
   123 val keep_line = Scan.lift;
   124 
   125 val scan_blank =
   126   incr_line ($$ "\n") ||
   127   keep_line (Scan.one Symbol.is_blank);
   128 
   129 
   130 (* scan symbolic idents *)
   131 
   132 val sym_chars = explode "!#$%&*+-/:<=>?@^_`|~";
   133 fun is_sym_char s = s mem sym_chars;
   134 
   135 val scan_symid = Scan.any1 is_sym_char >> implode;
   136 
   137 fun is_symid s = s <> "" andalso forall is_sym_char (Symbol.explode s);
   138 val is_sid = is_symid orf Syntax.is_identifier;
   139 
   140 
   141 (* scan strings *)
   142 
   143 val scan_str =
   144   scan_blank ||
   145   keep_line ($$ "\\" |-- Scan.one (Symbol.not_sync andf Symbol.not_eof)) ||
   146   keep_line (Scan.one (not_equal "\\" andf not_equal "\"" andf
   147     Symbol.not_sync andf Symbol.not_eof));
   148 
   149 val scan_string =
   150   keep_line ($$ "\"") |--
   151     !! (lex_err (K "missing quote at end of string"))
   152       (change_prompt ((Scan.repeat scan_str >> implode) --| keep_line ($$ "\"")));
   153 
   154 
   155 (* scan verbatim text *)
   156 
   157 val scan_verb =
   158   scan_blank ||
   159   keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal "}"))) ||
   160   keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof));
   161 
   162 val scan_verbatim =
   163   keep_line ($$ "{" -- $$ "*") |--
   164     !! (lex_err (K "missing end of verbatim text"))
   165       (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}")));
   166 
   167 
   168 (* scan space *)
   169 
   170 val is_space = Symbol.is_blank andf not_equal "\n";
   171 
   172 val scan_space =
   173   (keep_line (Scan.any1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" ||
   174     keep_line (Scan.any is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c);
   175 
   176 
   177 (* scan nested comments *)
   178 
   179 val scan_cmt =
   180   Scan.lift scan_blank ||
   181   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   182   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   183   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
   184   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof)));
   185 
   186 val scan_comment =
   187   keep_line ($$ "(" -- $$ "*") |--
   188     !! (lex_err (K "missing end of comment"))
   189       (change_prompt
   190         (Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")")));
   191 
   192 
   193 (* scan token *)
   194 
   195 fun scan (lex1, lex2) (pos, cs) =
   196   let
   197     fun token k x = Token (pos, (k, x));
   198     fun sync _ = token Sync Symbol.sync;
   199 
   200     val scanner =
   201       scan_string >> token String ||
   202       scan_verbatim >> token Verbatim ||
   203       scan_space >> token Space ||
   204       scan_comment >> token Comment ||
   205       keep_line (Scan.one Symbol.is_sync >> sync) ||
   206       keep_line (Scan.max token_leq
   207         (Scan.max token_leq
   208           (Scan.literal lex1 >> (token Keyword o implode))
   209           (Scan.literal lex2 >> (token Command o implode)))
   210         (Syntax.scan_longid >> token LongIdent ||
   211           Syntax.scan_id >> token Ident ||
   212           Syntax.scan_var >> token Var ||
   213           Syntax.scan_tid >> token TypeIdent ||
   214           Syntax.scan_tvar >> token TypeVar ||
   215           Syntax.scan_nat >> token Nat ||
   216           scan_symid >> token SymIdent));
   217   in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning cs))) scanner (pos, cs) end;
   218 
   219 
   220 (* source of (proper) tokens *)
   221 
   222 val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof;
   223 fun recover xs = keep_line (Scan.any1 is_junk) xs;
   224 
   225 fun source do_recover get_lex pos src =
   226   Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
   227     (if do_recover then Some recover else None) src;
   228 
   229 
   230 end;