src/Pure/Isar/outer_lex.ML
author wenzelm
Sun Sep 26 16:42:14 1999 +0200 (1999-09-26 ago)
changeset 7613 fe818734c387
parent 7477 c7caea1ce78c
child 7682 46de8064c93c
permissions -rw-r--r--
help: unknown theory context;
     1 (*  Title:      Pure/Isar/outer_lex.ML
     2     ID:         $Id$
     3     Author:     Markus Wenzel, TU Muenchen
     4 
     5 Outer lexical syntax for Isabelle/Isar.
     6 *)
     7 
     8 signature OUTER_LEX =
     9 sig
    10   datatype token_kind =
    11     Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    12     Nat | String | Verbatim | Ignore | Sync | EOF
    13   type token
    14   val str_of_kind: token_kind -> string
    15   val stopper: token * (token -> bool)
    16   val not_sync: token -> bool
    17   val not_eof: token -> bool
    18   val position_of: token -> Position.T
    19   val pos_of: token -> string
    20   val is_kind: token_kind -> token -> bool
    21   val keyword_with: (string -> bool) -> token -> bool
    22   val name_of: token -> string
    23   val is_proper: token -> bool
    24   val val_of: token -> string
    25   val is_sid: string -> bool
    26   val scan: (Scan.lexicon * Scan.lexicon) ->
    27     Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)
    28   val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) ->
    29     Position.T -> (Symbol.symbol, 'a) Source.source ->
    30     (token, (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source) Source.source
    31 end;
    32 
    33 structure OuterLex: OUTER_LEX =
    34 struct
    35 
    36 
    37 (** tokens **)
    38 
    39 (* datatype token *)
    40 
    41 datatype token_kind =
    42   Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    43   Nat | String | Verbatim | Ignore | Sync | EOF;
    44 
    45 datatype token = Token of Position.T * (token_kind * string);
    46 
    47 val str_of_kind =
    48  fn Command => "command"
    49   | Keyword => "keyword"
    50   | Ident => "identifier"
    51   | LongIdent => "long identifier"
    52   | SymIdent => "symbolic identifier"
    53   | Var => "schematic variable"
    54   | TypeIdent => "type variable"
    55   | TypeVar => "schematic type variable"
    56   | Nat => "number"
    57   | String => "string"
    58   | Verbatim => "verbatim text"
    59   | Ignore => "ignored text"
    60   | Sync => "sync marker"
    61   | EOF => "end-of-file";
    62 
    63 
    64 (* sync token *)
    65 
    66 fun not_sync (Token (_, (Sync, _))) = false
    67   | not_sync _ = true;
    68 
    69 
    70 (* eof token *)
    71 
    72 val eof = Token (Position.none, (EOF, ""));
    73 
    74 fun is_eof (Token (_, (EOF, _))) = true
    75   | is_eof _ = false;
    76 
    77 val stopper = (eof, is_eof);
    78 val not_eof = not o is_eof;
    79 
    80 
    81 (* get position *)
    82 
    83 fun position_of (Token (pos, _)) = pos;
    84 val pos_of = Position.str_of o position_of;
    85 
    86 
    87 (* kind of token *)
    88 
    89 fun is_kind k (Token (_, (k', _))) = k = k';
    90 
    91 fun keyword_with pred (Token (_, (Keyword, x))) = pred x
    92   | keyword_with _ _ = false;
    93 
    94 fun name_of (Token (_, (k, _))) = str_of_kind k;
    95 
    96 fun is_proper (Token (_, (Ignore, _))) = false
    97   | is_proper _ = true;
    98 
    99 
   100 (* value of token *)
   101 
   102 fun val_of (Token (_, (_, x))) = x;
   103 
   104 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
   105 
   106 
   107 
   108 (** scanners **)
   109 
   110 fun change_prompt scan = Scan.prompt "# " scan;
   111 
   112 
   113 (* diagnostics *)
   114 
   115 fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs;
   116 
   117 
   118 (* line numbering *)
   119 
   120 fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos));
   121 val keep_line = Scan.lift;
   122 
   123 val scan_blank =
   124   incr_line ($$ "\n") ||
   125   keep_line (Scan.one Symbol.is_blank);
   126 
   127 
   128 (* scan symbolic idents *)
   129 
   130 val sym_chars = explode "!#$%&*+-/:<=>?@^_`|~";
   131 fun is_sym_char s = s mem sym_chars;
   132 
   133 val scan_symid = Scan.any1 is_sym_char >> implode;
   134 
   135 fun is_symid s = s <> "" andalso forall is_sym_char (Symbol.explode s);
   136 val is_sid = is_symid orf Syntax.is_identifier;
   137 
   138 
   139 (* scan strings *)
   140 
   141 val scan_str =
   142   scan_blank >> K Symbol.space ||
   143   keep_line ($$ "\\" |-- Scan.one (Symbol.not_sync andf Symbol.not_eof)) ||
   144   keep_line (Scan.one (not_equal "\\" andf not_equal "\"" andf
   145     Symbol.not_sync andf Symbol.not_eof));
   146 
   147 val scan_string =
   148   keep_line ($$ "\"") |--
   149     !! (lex_err (K "missing quote at end of string"))
   150       (change_prompt ((Scan.repeat scan_str >> implode) --| keep_line ($$ "\"")));
   151 
   152 
   153 (* scan verbatim text *)
   154 
   155 val scan_verb =
   156   scan_blank ||
   157   keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal "}"))) ||
   158   keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof));
   159 
   160 val scan_verbatim =
   161   keep_line ($$ "{" -- $$ "*") |--
   162     !! (lex_err (K "missing end of verbatim text"))
   163       (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}")));
   164 
   165 
   166 (* scan space *)
   167 
   168 val is_space = Symbol.is_blank andf not_equal "\n";
   169 
   170 val scan_space =
   171   keep_line (Scan.any1 is_space) |-- Scan.optional (incr_line ($$ "\n")) "" ||
   172   keep_line (Scan.any is_space) |-- incr_line ($$ "\n");
   173 
   174 
   175 (* scan nested comments *)
   176 
   177 val scan_cmt =
   178   Scan.lift scan_blank ||
   179   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   180   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   181   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
   182   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof)));
   183 
   184 val scan_comment =
   185   keep_line ($$ "(" -- $$ "*") |--
   186     !! (lex_err (K "missing end of comment"))
   187       (change_prompt
   188         (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K ""));
   189 
   190 
   191 (* scan token *)
   192 
   193 fun scan (lex1, lex2) (pos, cs) =
   194   let
   195     fun token k x = Token (pos, (k, x));
   196     fun ignore _ = token Ignore "";
   197     fun sync _ = token Sync Symbol.sync;
   198 
   199     val scanner =
   200       scan_string >> token String ||
   201       scan_verbatim >> token Verbatim ||
   202       scan_space >> ignore ||
   203       scan_comment >> ignore ||
   204       keep_line (Scan.one Symbol.is_sync >> sync) ||
   205       keep_line (Scan.max token_leq
   206         (Scan.max token_leq
   207           (Scan.literal lex1 >> (token Keyword o implode))
   208           (Scan.literal lex2 >> (token Command o implode)))
   209         (Syntax.scan_longid >> token LongIdent ||
   210           Syntax.scan_id >> token Ident ||
   211           Syntax.scan_var >> token Var ||
   212           Syntax.scan_tid >> token TypeIdent ||
   213           Syntax.scan_tvar >> token TypeVar ||
   214           Syntax.scan_nat >> token Nat ||
   215           scan_symid >> token SymIdent));
   216   in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning cs))) scanner (pos, cs) end;
   217 
   218 
   219 (* source of (proper) tokens *)
   220 
   221 val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof;
   222 fun recover xs = keep_line (Scan.any1 is_junk) xs;
   223 
   224 fun source do_recover get_lex pos src =
   225   Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
   226     (if do_recover then Some recover else None) src
   227   |> Source.filter is_proper;
   228 
   229 
   230 end;