4705
|
1 |
(*  Title:      Pure/Thy/thy_scan.ML
    ID:         $Id$
    Author:     Markus Wenzel, TU Muenchen

Lexer for the outer Isabelle syntax.

TODO:
  - old vs. new: interpreted strings, no 'ML', var!?;
*)
|
|
10 |
|
|
11 |
signature THY_SCAN =
sig
  (*classification of outer syntax tokens*)
  datatype token_kind =
      Keyword
    | Ident
    | LongIdent
    | Var
    | TypeVar
    | Nat
    | String
    | Verbatim
    | Ignore
    | EOF
  (*printable description of a token kind*)
  val name_of_kind: token_kind -> string
  (*tokenize lex chs: scan the symbol list chs using the keyword lexicon lex;
    each resulting token is a triple of kind, text and line number*)
  val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list
end;
|
388
|
18 |
|
4705
|
19 |
structure ThyScan: THY_SCAN =
|
388
|
20 |
struct
|
|
21 |
|
|
22 |
|
|
23 |
(** token kinds **)
|
|
24 |
|
|
25 |
(*kinds of tokens produced by the lexer; Ignore is used for blanks and
  comments, EOF for the final end-of-file marker*)
datatype token_kind =
    Keyword
  | Ident
  | LongIdent
  | Var
  | TypeVar
  | Nat
  | String
  | Verbatim
  | Ignore
  | EOF;
|
388
|
27 |
|
|
28 |
(*human-readable name of a token kind*)
fun name_of_kind kind =
  (case kind of
    Keyword => "keyword"
  | Ident => "identifier"
  | LongIdent => "long identifier"
  | Var => "schematic variable"
  | TypeVar => "type variable"
  | Nat => "natural number"
  | String => "string"
  | Verbatim => "verbatim text"
  | Ignore => "ignore"
  | EOF => "end-of-file");
|
|
38 |
|
|
39 |
|
|
40 |
|
|
41 |
(** scanners **)
|
|
42 |
|
4705
|
43 |
(* diagnostics *)
|
|
44 |
|
|
45 |
(*lex_err line msg: text of a lexical error message; the line number is
  mentioned only when one is available*)
fun lex_err line msg =
  (case line of
    None => "Outer lexical error: " ^ msg
  | Some n => "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg);
|
388
|
47 |
|
|
48 |
|
4705
|
49 |
(* line numbering *)
|
|
50 |
|
|
51 |
(*successor of an optional line number, computed through the library's apsome
  (presumably an absent number stays absent -- confirm against apsome)*)
fun incr line = apsome (fn k: int => k + 1) line;
|
388
|
52 |
|
4705
|
53 |
(*incr_line scan: wrap scan via Scan.depend so that its result is paired with
  the incremented line number*)
fun incr_line scan =
  Scan.depend (fn line =>
    let val next_line = incr line
    in scan >> pair next_line end);
|
|
54 |
(*keep_line scan: plain Scan.lift, used below for scanners that do not change
  the line number*)
fun keep_line scan = Scan.lift scan;
|
388
|
55 |
|
4705
|
56 |
(*one blank symbol; the newline alternative goes through incr_line, every
  other blank through keep_line*)
val scan_blank =
  let
    val newline = incr_line ($$ "\n");
    val other_blank = keep_line (Scan.one Symbol.is_blank);
  in newline || other_blank end;
|
388
|
59 |
|
|
60 |
|
4705
|
61 |
(* scan ML-style strings *)
|
388
|
62 |
|
4705
|
63 |
(*control escape: a caret followed by one ASCII symbol whose code lies in the
  range 64 to 95*)
val scan_ctrl =
  let
    fun is_ctrl_symbol c =
      Symbol.is_ascii c andalso
        let val code = ord c in 64 <= code andalso code <= 95 end;
  in $$ "^" ^^ Scan.one is_ctrl_symbol end;
|
388
|
65 |
|
4705
|
66 |
(*a single digit symbol*)
fun scan_dig cs = Scan.one Symbol.is_digit cs;
|
388
|
67 |
|
|
68 |
(*escape sequence within a string: a backslash followed by either a character
  escape (n, t, double quote, backslash, control escape, three digits) or a
  run of blanks closed by another backslash; bad_escape builds the message
  handed to !! for the part after the leading backslash*)
val scan_esc =
  let
    fun bad_escape ((line, _), _) = lex_err line "bad escape sequence in string";
    val three_digits = scan_dig ^^ scan_dig ^^ scan_dig;
    val char_escape =
      keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" || scan_ctrl || three_digits);
    val blank_gap = (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\");
  in keep_line ($$ "\\") ^^ !! bad_escape (char_escape || blank_gap) end;
|
388
|
73 |
|
4705
|
74 |
(*one item of string content: an escape sequence, a blank (replaced by
  Symbol.space), or any symbol other than the double quote and end-of-file*)
val scan_str =
  let
    val blank_as_space = scan_blank >> K Symbol.space;
    val plain_symbol = keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof));
  in scan_esc || blank_as_space || plain_symbol end;
|
|
78 |
|
|
79 |
(*ML-style string: opening double quote, content items, closing double quote;
  missing_quote builds the message handed to !! for everything after the
  opening quote*)
val scan_string =
  let
    fun missing_quote ((line, _), _) = lex_err line "missing quote at end of string";
    val quote_mark = keep_line ($$ "\"");
    val contents = Scan.repeat scan_str >> implode;
  in quote_mark ^^ !! missing_quote (contents ^^ quote_mark) end;
|
388
|
83 |
|
|
84 |
|
|
85 |
(* scan verbatim text *)
|
|
86 |
|
4705
|
87 |
(*one item of verbatim text: a blank, a bar that is not directly followed by
  a closing brace, or any symbol other than bar and end-of-file*)
val scan_verb =
  let
    val inner_bar = keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}")));
    val non_bar = keep_line (Scan.one (not_equal "|" andf Symbol.not_eof));
  in scan_blank || inner_bar || non_bar end;
|
|
91 |
|
|
92 |
(*verbatim text enclosed by brace-bar and bar-brace; the delimiters are
  dropped from the result; missing_end builds the message handed to !! for
  everything after the opening delimiter*)
val scan_verbatim =
  let
    fun missing_end ((line, _), _) = lex_err line "missing end of verbatim text";
    val opening = keep_line ($$ "{" -- $$ "|");
    val closing = keep_line ($$ "|" -- $$ "}");
    val body = Scan.repeat scan_verb >> implode;
  in opening |-- !! missing_end (body --| closing) end;
|
388
|
96 |
|
|
97 |
|
|
98 |
(* scan nested comments *)
|
|
99 |
|
4705
|
100 |
(*one item inside a comment, with the nesting depth as additional state:
  a blank, a nested opener (depth + 1), a nested closer (depth - 1, refused
  at depth 0), a star not directly followed by a closing parenthesis, or any
  symbol other than star and end-of-file*)
val scan_cmt =
  let
    val open_nested =
      Scan.depend (fn depth => keep_line ($$ "(" ^^ $$ "*") >> pair (depth + 1));
    val close_nested =
      Scan.depend (fn depth =>
        if depth = 0 then Scan.fail
        else keep_line ($$ "*" ^^ $$ ")") >> pair (depth - 1));
    val lone_star =
      Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")"))));
    val non_star =
      Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof)));
  in Scan.lift scan_blank || open_nested || close_nested || lone_star || non_star end;
|
388
|
106 |
|
4705
|
107 |
(*complete, possibly nested comment; the content is scanned starting from
  depth 0 and the result is always the empty string; missing_end builds the
  message handed to !! for everything after the opener*)
val scan_comment =
  let
    fun missing_end ((line, _), _) = lex_err line "missing end of comment";
    val opening = keep_line ($$ "(" -- $$ "*");
    val closing = keep_line ($$ "*" -- $$ ")");
    val nested_body = Scan.pass 0 (Scan.repeat scan_cmt);
  in opening |-- !! missing_end (nested_body |-- closing >> K "") end;
|
388
|
111 |
|
|
112 |
|
4705
|
113 |
(* scan token *)
|
|
114 |
|
|
115 |
(*token kind line text: build a token triple; an absent line number is
  recorded as 0*)
fun token kind line text =
  (case line of
    None => (kind, text, 0)
  | Some n => (kind, text, n));
|
388
|
117 |
|
4705
|
118 |
(*scan_tok lex (line, cs): scan a single token, tagged with the line number
  that is current before scanning.  Alternatives are tried in this order:
  string, verbatim text, run of blanks (Ignore), comment (Ignore), and finally
  the choice made by Scan.max between a keyword from lex and a name-like token
  (long identifier, identifier, schematic variable, type variable, natural
  number)*)
fun scan_tok lex (state as (line, _)) =
  let
    (*Scan.max compares the token texts by string order; NOTE(review): both
      candidates appear to be prefixes of the same input, in which case string
      order coincides with length order -- confirm against Scan.max*)
    val keyword = Scan.literal lex >> (token Keyword line o implode);
    val name_like =
      Syntax.scan_longid >> token LongIdent line ||
      Syntax.scan_id >> token Ident line ||
      Syntax.scan_var >> token Var line ||
      Syntax.scan_tid >> token TypeVar line ||
      Syntax.scan_nat >> token Nat line;
    val scan_any =
      scan_string >> token String line ||
      scan_verbatim >> token Verbatim line ||
      Scan.repeat1 scan_blank >> (token Ignore line o implode) ||
      scan_comment >> token Ignore line ||
      keep_line
        (Scan.max (fn ((_, a: string, _), (_, b, _)) => a <= b) keyword name_like);
  in scan_any state end;
|
388
|
130 |
|
4705
|
131 |
(*all remaining symbols up to end-of-file, joined into one string*)
fun scan_rest cs = (Scan.any Symbol.not_eof >> implode) cs;
|
388
|
132 |
|
4705
|
133 |
(*scan_token lex input: like scan_tok, except that the keyword "ML" is
  replaced by a single Verbatim token holding the whole rest of the input,
  tagged with the line number of that keyword*)
fun scan_token lex input =
  let
    val result as ((kind, text, line), rest) = scan_tok lex input;
  in
    if kind = Keyword andalso text = "ML"
    then (keep_line scan_rest >> token Verbatim (Some line)) rest
    else result
  end;
|
388
|
137 |
|
|
138 |
|
4705
|
139 |
(* tokenize *)
|
|
140 |
|
6207
|
141 |
(*tokenize lex chs: scan the symbol list chs into tokens, starting at line 1.
  Ignore tokens are removed and a final EOF token carrying the last line
  number is appended.  If scanning stops with input left over, an error
  quoting the beginning of that input is raised*)
fun tokenize lex chs =
  let
    fun is_ignored (Ignore, _, _) = true
      | is_ignored _ = false;
    val scan_all = Scan.repeat (scan_token lex);
    val (toks, (line, rest)) =
      Scan.error (Scan.finite' Symbol.stopper scan_all) (Some 1, chs);
  in
    (case rest of
      [] => filter_out is_ignored toks @ [token EOF line "end-of-file"]
    | _ => error (lex_err line ("Bad input " ^ quote (Symbol.beginning rest))))
  end;
|
|
150 |
|
|
151 |
|
|
152 |
end;
|