src/Pure/Isar/outer_lex.ML
author wenzelm
Wed Dec 05 03:18:03 2001 +0100 (2001-12-05 ago)
changeset 12385 389d11fb62c8
parent 10748 74ed77fa5310
child 14729 0e987111a17e
permissions -rw-r--r--
removed unused functionality (weight etc.);
     1 (*  Title:      Pure/Isar/outer_lex.ML
     2     ID:         $Id$
     3     Author:     Markus Wenzel, TU Muenchen
     4     License:    GPL (GNU GENERAL PUBLIC LICENSE)
     5 
     6 Outer lexical syntax for Isabelle/Isar.
     7 *)
     8 
     9 signature OUTER_LEX =
    10 sig
    11   datatype token_kind =
    12     Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    13     Nat | String | Verbatim | Space | Comment | Sync | Malformed | EOF
    14   type token
    15   val str_of_kind: token_kind -> string
    16   val stopper: token * (token -> bool)
    17   val not_sync: token -> bool
    18   val not_eof: token -> bool
    19   val position_of: token -> Position.T
    20   val pos_of: token -> string
    21   val is_kind: token_kind -> token -> bool
    22   val keyword_with: (string -> bool) -> token -> bool
    23   val name_of: token -> string
    24   val is_proper: token -> bool
    25   val is_semicolon: token -> bool
    26   val is_begin_ignore: token -> bool
    27   val is_end_ignore: token -> bool
    28   val is_newline: token -> bool
    29   val is_indent: token -> bool
    30   val val_of: token -> string
    31   val is_sid: string -> bool
    32   val !!! : string -> (Position.T * 'a -> 'b) -> Position.T * 'a -> 'b
    33   val incr_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c)
    34   val keep_line: ('a -> 'b * 'c) -> Position.T * 'a -> 'b * (Position.T * 'c)
    35   val scan_blank: Position.T * Symbol.symbol list
    36     -> Symbol.symbol * (Position.T * Symbol.symbol list)
    37   val scan_string: Position.T * Symbol.symbol list -> string * (Position.T * Symbol.symbol list)
    38   val scan: (Scan.lexicon * Scan.lexicon) ->
    39     Position.T * Symbol.symbol list -> token * (Position.T * Symbol.symbol list)
    40   val source: bool -> (unit -> (Scan.lexicon * Scan.lexicon)) ->
    41     Position.T -> (Symbol.symbol, 'a) Source.source ->
    42     (token, Position.T * (Symbol.symbol, 'a) Source.source) Source.source
    43   val source_proper: (token, 'a) Source.source -> (token, (token, 'a) Source.source) Source.source
    44   val make_lexicon: string list -> Scan.lexicon
    45 end;
    46 
    47 structure OuterLex: OUTER_LEX =
    48 struct
    49 
    50 
    51 (** tokens **)
    52 
    53 (* datatype token *)
    54 
    55 datatype token_kind =
    56   Command | Keyword | Ident | LongIdent | SymIdent | Var | TypeIdent | TypeVar |
    57   Nat | String | Verbatim | Space | Comment | Sync | Malformed | EOF;
    58 
    59 datatype token = Token of Position.T * (token_kind * string);
    60 
    61 val str_of_kind =
    62  fn Command => "command"
    63   | Keyword => "keyword"
    64   | Ident => "identifier"
    65   | LongIdent => "long identifier"
    66   | SymIdent => "symbolic identifier"
    67   | Var => "schematic variable"
    68   | TypeIdent => "type variable"
    69   | TypeVar => "schematic type variable"
    70   | Nat => "number"
    71   | String => "string"
    72   | Verbatim => "verbatim text"
    73   | Space => "white space"
    74   | Comment => "comment text"
    75   | Sync => "sync marker"
    76   | Malformed => "bad input"
    77   | EOF => "end-of-file";
    78 
    79 
    80 (* control tokens *)
    81 
    82 fun not_sync (Token (_, (Sync, _))) = false
    83   | not_sync _ = true;
    84 
    85 val malformed = Token (Position.none, (Malformed, ""));
    86 
    87 
    88 (* eof token *)
    89 
    90 val eof = Token (Position.none, (EOF, ""));
    91 
    92 fun is_eof (Token (_, (EOF, _))) = true
    93   | is_eof _ = false;
    94 
    95 val stopper = (eof, is_eof);
    96 val not_eof = not o is_eof;
    97 
    98 
    99 (* get position *)
   100 
   101 fun position_of (Token (pos, _)) = pos;
   102 val pos_of = Position.str_of o position_of;
   103 
   104 
   105 (* kind of token *)
   106 
   107 fun is_kind k (Token (_, (k', _))) = k = k';
   108 
   109 fun keyword_with pred (Token (_, (Keyword, x))) = pred x
   110   | keyword_with _ _ = false;
   111 
   112 fun is_proper (Token (_, (Space, _))) = false
   113   | is_proper (Token (_, (Comment, _))) = false
   114   | is_proper _ = true;
   115 
   116 fun is_semicolon (Token (_, (Keyword, ";"))) = true
   117   | is_semicolon _ = false;
   118 
   119 fun is_begin_ignore (Token (_, (Comment, "<"))) = true
   120   | is_begin_ignore _ = false;
   121 
   122 fun is_end_ignore (Token (_, (Comment, ">"))) = true
   123   | is_end_ignore _ = false;
   124 
   125 
   126 (* newline and indentations (note that space tokens obey lines) *)
   127 
   128 fun is_newline (Token (_, (Space, "\n"))) = true
   129   | is_newline _ = false;
   130 
   131 fun is_indent (Token (_, (Space, s))) =
   132       let val n = size s in n > 0 andalso String.substring (s, n - 1, 1) <> "\n" end
   133   | is_indent _ = false;
   134 
   135 
   136 (* name and value of token *)
   137 
   138 fun name_of (tok as Token (_, (k, x))) =
   139   if is_semicolon tok then "terminator"
   140   else if x = "" then str_of_kind k
   141   else str_of_kind k ^ " " ^ quote x;
   142 
   143 fun val_of (Token (_, (_, x))) = x;
   144 
   145 fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
   146 
   147 
   148 
   149 (** scanners **)
   150 
   151 fun change_prompt scan = Scan.prompt "# " scan;
   152 
   153 
   154 (* diagnostics *)
   155 
   156 fun lex_err msg ((pos, cs), _) = "Outer lexical error" ^ Position.str_of pos ^ ": " ^ msg cs;
   157 fun !!! msg scan = Scan.!! (lex_err (K msg)) scan;
   158 
   159 
   160 (* line numbering *)
   161 
   162 fun incr_line scan = Scan.depend (fn pos => scan >> pair (Position.inc pos));
   163 val keep_line = Scan.lift;
   164 
   165 val scan_blank =
   166   incr_line ($$ "\n") ||
   167   keep_line (Scan.one Symbol.is_blank);
   168 
   169 
   170 (* scan symbolic idents *)
   171 
   172 val sym_chars = explode "!#$%&*+-/:<=>?@^_`|~";
   173 fun is_sym_char s = s mem sym_chars;
   174 
   175 val scan_symid =
   176   Scan.any1 is_sym_char >> implode ||
   177   Scan.one Symbol.is_symbolic;
   178 
   179 fun is_symid str =
   180   (case try Symbol.explode str of
   181     Some [s] => Symbol.is_symbolic s orelse is_sym_char s
   182   | Some ss => forall is_sym_char ss
   183   | _ => false);
   184 
   185 val is_sid = is_symid orf Syntax.is_identifier;
   186 
   187 
   188 (* scan strings *)
   189 
   190 val scan_str =
   191   scan_blank ||
   192   keep_line ($$ "\\") |-- !!! "bad escape character in string"
   193       (scan_blank || keep_line ($$ "\"" || $$ "\\")) ||
   194   keep_line (Scan.one (not_equal "\\" andf not_equal "\"" andf
   195     Symbol.not_sync andf Symbol.not_eof));
   196 
   197 val scan_string =
   198   keep_line ($$ "\"") |--
   199     !!! "missing quote at end of string"
   200       (change_prompt ((Scan.repeat scan_str >> implode) --| keep_line ($$ "\"")));
   201 
   202 
   203 (* scan verbatim text *)
   204 
   205 val scan_verb =
   206   scan_blank ||
   207   keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal "}"))) ||
   208   keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof));
   209 
   210 val scan_verbatim =
   211   keep_line ($$ "{" -- $$ "*") |--
   212     !!! "missing end of verbatim text"
   213       (change_prompt ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "*" -- $$ "}")));
   214 
   215 
   216 (* scan space *)
   217 
   218 val is_space = Symbol.is_blank andf not_equal "\n";
   219 
   220 val scan_space =
   221   (keep_line (Scan.any1 is_space) -- Scan.optional (incr_line ($$ "\n")) "" ||
   222     keep_line (Scan.any is_space) -- incr_line ($$ "\n")) >> (fn (cs, c) => implode cs ^ c);
   223 
   224 
   225 (* scan nested comments *)
   226 
   227 val scan_cmt =
   228   Scan.lift scan_blank ||
   229   Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
   230   Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
   231   Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) ||
   232   Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_sync andf Symbol.not_eof)));
   233 
   234 val scan_comment =
   235   keep_line ($$ "(" -- $$ "*") |--
   236     !!! "missing end of comment"
   237       (change_prompt
   238         (Scan.pass 0 (Scan.repeat scan_cmt >> implode) --| keep_line ($$ "*" -- $$ ")")));
   239 
   240 
   241 (* scan token *)
   242 
   243 fun scan (lex1, lex2) =
   244   let
   245     val scanner = Scan.state :-- (fn pos =>
   246       let
   247         fun token k x = Token (pos, (k, x));
   248         fun sync _ = token Sync Symbol.sync;
   249       in
   250         scan_string >> token String ||
   251         scan_verbatim >> token Verbatim ||
   252         scan_space >> token Space ||
   253         scan_comment >> token Comment ||
   254         keep_line (Scan.one Symbol.is_sync >> sync) ||
   255         keep_line (Scan.max token_leq
   256           (Scan.max token_leq
   257             (Scan.literal lex1 >> (token Keyword o implode))
   258             (Scan.literal lex2 >> (token Command o implode)))
   259           (Syntax.scan_longid >> token LongIdent ||
   260             Syntax.scan_id >> token Ident ||
   261             Syntax.scan_var >> token Var ||
   262             Syntax.scan_tid >> token TypeIdent ||
   263             Syntax.scan_tvar >> token TypeVar ||
   264             Syntax.scan_nat >> token Nat ||
   265             scan_symid >> token SymIdent))
   266       end) >> #2;
   267   in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning cs))) scanner end;
   268 
   269 
   270 (* token sources *)
   271 
   272 val is_junk = (not o Symbol.is_blank) andf Symbol.not_sync andf Symbol.not_eof;
   273 fun recover xs = (keep_line (Scan.any is_junk) >> K [malformed]) xs;
   274 
   275 fun source do_recover get_lex pos src =
   276   Source.source' pos Symbol.stopper (Scan.bulk (fn xs => scan (get_lex ()) xs))
   277     (if do_recover then Some recover else None) src;
   278 
   279 fun source_proper src = src |> Source.filter is_proper;
   280 
   281 
   282 (* lexicons *)
   283 
   284 val make_lexicon = Scan.make_lexicon o map Symbol.explode;
   285 
   286 end;