1 (* Title: Pure/Thy/thy_scan.ML |
|
2 ID: $Id$ |
|
3 Author: Markus Wenzel, TU Muenchen |
|
4 |
|
5 Lexer for *old-style* outer syntax. |
|
6 *) |
|
7 |
|
8 signature THY_SCAN = |
|
9 sig |
|
10 datatype token_kind = |
|
11 Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF |
|
12 val name_of_kind: token_kind -> string |
|
13 val tokenize: Scan.lexicon -> string list -> (token_kind * string * int) list |
|
14 end; |
|
15 |
|
16 structure ThyScan: THY_SCAN = |
|
17 struct |
|
18 |
|
19 |
|
20 (** token kinds **) |
|
21 |
|
22 datatype token_kind = |
|
23 Keyword | Ident | LongIdent | Var | TypeVar | Nat | String | Verbatim | Ignore | EOF; |
|
24 |
|
25 fun name_of_kind Keyword = "keyword" |
|
26 | name_of_kind Ident = "identifier" |
|
27 | name_of_kind LongIdent = "long identifier" |
|
28 | name_of_kind Var = "schematic variable" |
|
29 | name_of_kind TypeVar = "type variable" |
|
30 | name_of_kind Nat = "natural number" |
|
31 | name_of_kind String = "string" |
|
32 | name_of_kind Verbatim = "verbatim text" |
|
33 | name_of_kind Ignore = "ignore" |
|
34 | name_of_kind EOF = "end-of-file"; |
|
35 |
|
36 |
|
37 |
|
38 (** scanners **) |
|
39 |
|
40 (* diagnostics *) |
|
41 |
|
42 fun lex_err NONE msg = "Outer lexical error: " ^ msg |
|
43 | lex_err (SOME n) msg = "Outer lexical error on line " ^ string_of_int n ^ ": " ^ msg; |
|
44 |
|
45 |
|
46 (* line numbering *) |
|
47 |
|
48 val incr = Option.map (fn n:int => n + 1); |
|
49 |
|
50 fun incr_line scan = Scan.depend (fn n => scan >> pair (incr n)); |
|
51 val keep_line = Scan.lift; |
|
52 |
|
53 val scan_blank = |
|
54 incr_line ($$ "\n") || |
|
55 keep_line (Scan.one Symbol.is_blank); |
|
56 |
|
57 |
|
58 (* scan ML-style strings *) |
|
59 |
|
60 val scan_ctrl = |
|
61 $$ "^" ^^ Scan.one (fn c => Symbol.is_ascii c andalso ord c >= 64 andalso ord c <= 95); |
|
62 |
|
63 val scan_dig = Scan.one Symbol.is_digit; |
|
64 |
|
65 val scan_esc = |
|
66 keep_line ($$ "\\") ^^ !! (fn ((n, _), _) => lex_err n "bad escape sequence in string") |
|
67 (keep_line ($$ "n" || $$ "t" || $$ "\"" || $$ "\\" || |
|
68 scan_ctrl || scan_dig ^^ scan_dig ^^ scan_dig) || |
|
69 (Scan.repeat1 scan_blank >> implode) ^^ keep_line ($$ "\\")); |
|
70 |
|
71 val scan_str = |
|
72 scan_esc || |
|
73 scan_blank >> K Symbol.space || |
|
74 keep_line (Scan.one (not_equal "\"" andf Symbol.not_eof)); |
|
75 |
|
76 val scan_string = |
|
77 keep_line ($$ "\"") ^^ |
|
78 !! (fn ((n, _), _) => lex_err n "missing quote at end of string") |
|
79 ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\"")); |
|
80 |
|
81 |
|
82 (* scan verbatim text *) |
|
83 |
|
84 val scan_verb = |
|
85 scan_blank || |
|
86 keep_line ($$ "|" --| Scan.ahead (Scan.one (not_equal "}"))) || |
|
87 keep_line (Scan.one (not_equal "|" andf Symbol.not_eof)); |
|
88 |
|
89 val scan_verbatim = |
|
90 keep_line ($$ "{" -- $$ "|") |-- |
|
91 !! (fn ((n, _), _) => lex_err n "missing end of verbatim text") |
|
92 ((Scan.repeat scan_verb >> implode) --| keep_line ($$ "|" -- $$ "}")); |
|
93 |
|
94 |
|
95 (* scan nested comments *) |
|
96 |
|
97 val scan_cmt = |
|
98 Scan.lift scan_blank || |
|
99 Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) || |
|
100 Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) || |
|
101 Scan.lift (keep_line ($$ "*" --| Scan.ahead (Scan.one (not_equal ")")))) || |
|
102 Scan.lift (keep_line (Scan.one (not_equal "*" andf Symbol.not_eof))); |
|
103 |
|
104 val scan_comment = |
|
105 keep_line ($$ "(" -- $$ "*") |-- |
|
106 !! (fn ((n, _), _) => lex_err n "missing end of comment") |
|
107 (Scan.pass 0 (Scan.repeat scan_cmt) |-- keep_line ($$ "*" -- $$ ")") >> K ""); |
|
108 |
|
109 |
|
110 (* scan token *) |
|
111 |
|
112 fun token k NONE x = (k, x, 0) |
|
113 | token k (SOME n) x = (k, x, n); |
|
114 |
|
115 fun scan_tok lex (n, cs) = |
|
116 (scan_string >> token String n || |
|
117 scan_verbatim >> token Verbatim n || |
|
118 Scan.repeat1 scan_blank >> (token Ignore n o implode) || |
|
119 scan_comment >> token Ignore n || |
|
120 keep_line (Scan.max (fn ((_, x, _), (_, x', _)) => x <= x') |
|
121 (Scan.literal lex >> (token Keyword n o implode)) |
|
122 (Syntax.scan_longid >> token LongIdent n || |
|
123 Syntax.scan_id >> token Ident n || |
|
124 Syntax.scan_var >> token Var n || |
|
125 Syntax.scan_tid >> token TypeVar n || |
|
126 Syntax.scan_nat >> token Nat n))) (n, cs); |
|
127 |
|
128 val scan_rest = Scan.any Symbol.not_eof >> implode; |
|
129 |
|
130 fun scan_token lex x = |
|
131 (case scan_tok lex x of |
|
132 ((Keyword, "ML", n), x') => (keep_line scan_rest >> token Verbatim (SOME n)) x' |
|
133 | tok_x' => tok_x'); |
|
134 |
|
135 |
|
136 (* tokenize *) |
|
137 |
|
138 fun tokenize lex chs = |
|
139 let |
|
140 val scan_toks = Scan.repeat (scan_token lex); |
|
141 val ignore = fn (Ignore, _, _) => true | _ => false; |
|
142 in |
|
143 (case Scan.error (Scan.finite' Symbol.stopper scan_toks) (SOME 1, chs) of |
|
144 (toks, (n, [])) => filter_out ignore toks @ [token EOF n "end-of-file"] |
|
145 | (_, (n, cs)) => error (lex_err n ("Bad input " ^ quote (Symbol.beginning 10 cs)))) |
|
146 end; |
|
147 |
|
148 |
|
149 end; |
|