author  wenzelm 
Mon, 21 Mar 2011 23:38:32 +0100  
changeset 42046  6341c23baf10 
parent 40525  14a2e686bdac 
child 42048  afd11ca8e018 
permissions  rwrr 
18  1 
(* Title: Pure/Syntax/lexicon.ML 
2 
Author: Tobias Nipkow and Markus Wenzel, TU Muenchen 

0  3 

4703  4 
Lexer for the inner Isabelle syntax (terms and types). 
18  5 
*) 
0  6 

7 
signature LEXICON0 = 

4247
9bba9251bb4d
added implode_xstr: string list > string, explode_xstr: string > string list;
wenzelm
parents:
3828
diff
changeset

8 
sig 
0  9 
val is_identifier: string > bool 
14679  10 
val is_ascii_identifier: string > bool 
20165  11 
val is_tid: string > bool 
30573  12 
val scan_id: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 
13 
val scan_longid: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

14 
val scan_tid: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

15 
val scan_nat: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

16 
val scan_int: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

40290
47f572aff50a
support for floatingpoint tokens in outer syntax (coinciding with inner syntax version);
wenzelm
parents:
39510
diff
changeset

17 
val scan_float: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 
30573  18 
val scan_hex: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 
19 
val scan_bin: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

20 
val scan_var: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

21 
val scan_tvar: Symbol_Pos.T list > Symbol_Pos.T list * Symbol_Pos.T list 

4247
9bba9251bb4d
added implode_xstr: string list > string, explode_xstr: string > string list;
wenzelm
parents:
3828
diff
changeset

22 
val implode_xstr: string list > string 
9bba9251bb4d
added implode_xstr: string list > string, explode_xstr: string > string list;
wenzelm
parents:
3828
diff
changeset

23 
val explode_xstr: string > string list 
20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

24 
val read_indexname: string > indexname 
4703  25 
val read_var: string > term 
15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

26 
val read_variable: string > indexname option 
550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

27 
val const: string > term 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

28 
val free: string > term 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

29 
val var: indexname > term 
5860  30 
val read_nat: string > int option 
20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

31 
val read_int: string > int option 
24630
351a308ab58d
simplified type int (eliminated IntInf.int, integer);
wenzelm
parents:
24583
diff
changeset

32 
val read_xnum: string > {radix: int, leading_zeros: int, value: int} 
28904  33 
val read_float: string > {mant: int, exp: int} 
35428  34 
val mark_class: string > string val unmark_class: string > string 
35 
val mark_type: string > string val unmark_type: string > string 

36 
val mark_const: string > string val unmark_const: string > string 

37 
val mark_fixed: string > string val unmark_fixed: string > string 

38 
val unmark: 

39 
{case_class: string > 'a, 

40 
case_type: string > 'a, 

41 
case_const: string > 'a, 

42 
case_fixed: string > 'a, 

43 
case_default: string > 'a} > string > 'a 

44 
val is_marked: string > bool 

42046
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

45 
val encode_position: Position.T > string 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

46 
val decode_position: string > Position.T option 
4247
9bba9251bb4d
added implode_xstr: string list > string, explode_xstr: string > string list;
wenzelm
parents:
3828
diff
changeset

47 
end; 
0  48 

49 
signature LEXICON = 

4247
9bba9251bb4d
added implode_xstr: string list > string, explode_xstr: string > string list;
wenzelm
parents:
3828
diff
changeset

50 
sig 
18  51 
include LEXICON0 
52 
val is_xid: string > bool 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

53 
datatype token_kind = 
27887  54 
Literal  IdentSy  LongIdentSy  VarSy  TFreeSy  TVarSy  
28904  55 
NumSy  FloatSy  XNumSy  StrSy  Space  Comment  EOF 
27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

56 
datatype token = Token of token_kind * string * Position.range 
27806  57 
val str_of_token: token > string 
58 
val pos_of_token: token > Position.T 

27887  59 
val is_proper: token > bool 
27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

60 
val mk_eof: Position.T > token 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

61 
val eof: token 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

62 
val is_eof: token > bool 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

63 
val stopper: token Scan.stopper 
550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

64 
val idT: typ 
3828  65 
val longidT: typ 
550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

66 
val varT: typ 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

67 
val tidT: typ 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

68 
val tvarT: typ 
237
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
wenzelm
parents:
164
diff
changeset

69 
val terminals: string list 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
wenzelm
parents:
164
diff
changeset

70 
val is_terminal: string > bool 
38237
8b0383334031
prefer Context_Position.report where a proper context is available  notably for "inner" entities;
wenzelm
parents:
35428
diff
changeset

71 
val report_token: Proof.context > token > unit 
39510
d9f5f01faa1b
Syntax.read_asts error: report token ranges within message  no sideeffect here;
wenzelm
parents:
39507
diff
changeset

72 
val reported_token_range: Proof.context > token > string 
18  73 
val matching_tokens: token * token > bool 
74 
val valued_token: token > bool 

237
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
wenzelm
parents:
164
diff
changeset

75 
val predef_term: string > token option 
30573  76 
val tokenize: Scan.lexicon > bool > Symbol_Pos.T list > token list 
4247
9bba9251bb4d
added implode_xstr: string list > string, explode_xstr: string > string list;
wenzelm
parents:
3828
diff
changeset

77 
end; 
0  78 

14679  79 
structure Lexicon: LEXICON = 
0  80 
struct 
81 

18  82 
(** is_identifier etc. **) 
83 

16150  84 
val is_identifier = Symbol.is_ident o Symbol.explode; 
18  85 

14679  86 
fun is_ascii_identifier s = 
87 
let val cs = Symbol.explode s 

16150  88 
in forall Symbol.is_ascii cs andalso Symbol.is_ident cs end; 
14679  89 

18  90 
fun is_xid s = 
4703  91 
(case Symbol.explode s of 
16150  92 
"_" :: cs => Symbol.is_ident cs 
93 
 cs => Symbol.is_ident cs); 

18  94 

330
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
clasohm
parents:
237
diff
changeset

95 
fun is_tid s = 
4703  96 
(case Symbol.explode s of 
16150  97 
"'" :: cs => Symbol.is_ident cs 
18  98 
 _ => false); 
99 

0  100 

101 

4703  102 
(** basic scanners **) 
103 

30573  104 
open Basic_Symbol_Pos; 
27773  105 

30573  106 
fun !!! msg = Symbol_Pos.!!! ("Inner lexical error: " ^ msg); 
27773  107 

40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

108 
val scan_id = 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

109 
Scan.one (Symbol.is_letter o Symbol_Pos.symbol) ::: 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

110 
Scan.many (Symbol.is_letdig o Symbol_Pos.symbol); 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

111 

27773  112 
val scan_longid = scan_id @@@ (Scan.repeat1 ($$$ "." @@@ scan_id) >> flat); 
113 
val scan_tid = $$$ "'" @@@ scan_id; 

4703  114 

40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

115 
val scan_nat = Scan.many1 (Symbol.is_digit o Symbol_Pos.symbol); 
27773  116 
val scan_int = $$$ "" @@@ scan_nat  scan_nat; 
28904  117 
val scan_natdot = scan_nat @@@ $$$ "." @@@ scan_nat; 
118 
val scan_float = $$$ "" @@@ scan_natdot  scan_natdot; 

40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

119 
val scan_hex = $$$ "0" @@@ $$$ "x" @@@ Scan.many1 (Symbol.is_ascii_hex o Symbol_Pos.symbol); 
27773  120 
val scan_bin = $$$ "0" @@@ $$$ "b" @@@ Scan.many1 (fn (s, _) => s = "0" orelse s = "1"); 
4703  121 

27773  122 
val scan_id_nat = scan_id @@@ Scan.optional ($$$ "." @@@ scan_nat) []; 
123 
val scan_var = $$$ "?" @@@ scan_id_nat; 

124 
val scan_tvar = $$$ "?" @@@ $$$ "'" @@@ scan_id_nat; 

4703  125 

126 

127 

18  128 
(** datatype token **) 
0  129 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

130 
datatype token_kind = 
27887  131 
Literal  IdentSy  LongIdentSy  VarSy  TFreeSy  TVarSy  
28904  132 
NumSy  FloatSy  XNumSy  StrSy  Space  Comment  EOF; 
27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

133 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

134 
datatype token = Token of token_kind * string * Position.range; 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

135 

27806  136 
fun str_of_token (Token (_, s, _)) = s; 
137 
fun pos_of_token (Token (_, _, (pos, _))) = pos; 

138 

27887  139 
fun is_proper (Token (Space, _, _)) = false 
140 
 is_proper (Token (Comment, _, _)) = false 

141 
 is_proper _ = true; 

142 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

143 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

144 
(* stopper *) 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

145 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

146 
fun mk_eof pos = Token (EOF, "", (pos, Position.none)); 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

147 
val eof = mk_eof Position.none; 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

148 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

149 
fun is_eof (Token (EOF, _, _)) = true 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

150 
 is_eof _ = false; 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

151 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

152 
val stopper = Scan.stopper (K eof) is_eof; 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

153 

0  154 

237
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
wenzelm
parents:
164
diff
changeset

155 
(* terminal arguments *) 
0  156 

550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

157 
val idT = Type ("id", []); 
3828  158 
val longidT = Type ("longid", []); 
550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

159 
val varT = Type ("var", []); 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

160 
val tidT = Type ("tid", []); 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

161 
val tvarT = Type ("tvar", []); 
0  162 

29156
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

163 
val terminal_kinds = 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

164 
[("id", IdentSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

165 
("longid", LongIdentSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

166 
("var", VarSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

167 
("tid", TFreeSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

168 
("tvar", TVarSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

169 
("num", NumSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

170 
("float_token", FloatSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

171 
("xnum", XNumSy), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

172 
("xstr", StrSy)]; 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

173 

89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

174 
val terminals = map #1 terminal_kinds; 
20664  175 
val is_terminal = member (op =) terminals; 
237
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
wenzelm
parents:
164
diff
changeset

176 

0  177 

27887  178 
(* markup *) 
179 

180 
val token_kind_markup = 

181 
fn Literal => Markup.literal 

182 
 IdentSy => Markup.ident 

183 
 LongIdentSy => Markup.ident 

184 
 VarSy => Markup.var 

185 
 TFreeSy => Markup.tfree 

186 
 TVarSy => Markup.tvar 

29318
6337d1cb2ba0
added numeral, which supercedes num, xnum, float;
wenzelm
parents:
29156
diff
changeset

187 
 NumSy => Markup.numeral 
6337d1cb2ba0
added numeral, which supercedes num, xnum, float;
wenzelm
parents:
29156
diff
changeset

188 
 FloatSy => Markup.numeral 
6337d1cb2ba0
added numeral, which supercedes num, xnum, float;
wenzelm
parents:
29156
diff
changeset

189 
 XNumSy => Markup.numeral 
6337d1cb2ba0
added numeral, which supercedes num, xnum, float;
wenzelm
parents:
29156
diff
changeset

190 
 StrSy => Markup.inner_string 
38474
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
wenzelm
parents:
38237
diff
changeset

191 
 Space => Markup.empty 
27887  192 
 Comment => Markup.inner_comment 
38474
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
wenzelm
parents:
38237
diff
changeset

193 
 EOF => Markup.empty; 
27887  194 

38237
8b0383334031
prefer Context_Position.report where a proper context is available  notably for "inner" entities;
wenzelm
parents:
35428
diff
changeset

195 
fun report_token ctxt (Token (kind, _, (pos, _))) = 
39507
839873937ddd
tuned signature of (Context_)Position.report variants;
wenzelm
parents:
39168
diff
changeset

196 
Context_Position.report ctxt pos (token_kind_markup kind); 
27887  197 

39510
d9f5f01faa1b
Syntax.read_asts error: report token ranges within message  no sideeffect here;
wenzelm
parents:
39507
diff
changeset

198 
fun reported_token_range ctxt tok = 
39168
e3ac771235f7
report token range after inner parse error  often provides important clues about misunderstanding concerning lexical phase;
wenzelm
parents:
38474
diff
changeset

199 
if is_proper tok 
39510
d9f5f01faa1b
Syntax.read_asts error: report token ranges within message  no sideeffect here;
wenzelm
parents:
39507
diff
changeset

200 
then Context_Position.reported_text ctxt (pos_of_token tok) Markup.token_range "" 
d9f5f01faa1b
Syntax.read_asts error: report token ranges within message  no sideeffect here;
wenzelm
parents:
39507
diff
changeset

201 
else ""; 
39168
e3ac771235f7
report token range after inner parse error  often provides important clues about misunderstanding concerning lexical phase;
wenzelm
parents:
38474
diff
changeset

202 

27887  203 

18  204 
(* matching_tokens *) 
0  205 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

206 
fun matching_tokens (Token (Literal, x, _), Token (Literal, y, _)) = x = y 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

207 
 matching_tokens (Token (k, _, _), Token (k', _, _)) = k = k'; 
0  208 

209 

18  210 
(* valued_token *) 
0  211 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

212 
fun valued_token (Token (Literal, _, _)) = false 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

213 
 valued_token (Token (EOF, _, _)) = false 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

214 
 valued_token _ = true; 
0  215 

216 

18  217 
(* predef_term *) 
0  218 

29156
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

219 
fun predef_term s = 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

220 
(case AList.lookup (op =) terminal_kinds s of 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

221 
SOME sy => SOME (Token (sy, s, Position.no_range)) 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

222 
 NONE => NONE); 
0  223 

224 

4703  225 
(* xstr tokens *) 
18  226 

14730  227 
val scan_chr = 
27773  228 
$$$ "\\"  $$$ "'"  
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

229 
Scan.one 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

230 
((fn s => s <> "\\" andalso s <> "'" andalso Symbol.is_regular s) o 
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

231 
Symbol_Pos.symbol) >> single  
27773  232 
$$$ "'"  Scan.ahead (~$$$ "'"); 
14730  233 

234 
val scan_str = 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

235 
$$$ "'" @@@ $$$ "'" @@@ !!! "missing end of string" 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

236 
((Scan.repeat scan_chr >> flat) @@@ $$$ "'" @@@ $$$ "'"); 
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

237 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

238 
val scan_str_body = 
27773  239 
$$$ "'"  $$$ "'"  !!! "missing end of string" 
240 
((Scan.repeat scan_chr >> flat)  $$$ "'"  $$$ "'"); 

14730  241 

0  242 

4703  243 
fun implode_xstr cs = enclose "''" "''" (implode (map (fn "'" => "\\'"  c => c) cs)); 
18  244 

4703  245 
fun explode_xstr str = 
30573  246 
(case Scan.read Symbol_Pos.stopper scan_str_body (Symbol_Pos.explode (str, Position.none)) of 
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

247 
SOME cs => map Symbol_Pos.symbol cs 
5868  248 
 _ => error ("Inner lexical error: literal string expected at " ^ quote str)); 
18  249 

250 

27806  251 

18  252 
(** tokenize **) 
253 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

254 
fun token_leq (Token (_, s1, _), Token (_, s2, _)) = s1 <= s2; 
30573  255 
fun token kind ss = Token (kind, Symbol_Pos.content ss, Symbol_Pos.range ss); 
27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

256 

df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

257 
fun tokenize lex xids syms = 
18  258 
let 
259 
val scan_xid = 

27773  260 
if xids then $$$ "_" @@@ scan_id  scan_id 
18  261 
else scan_id; 
262 

20096  263 
val scan_num = scan_hex  scan_bin  scan_int; 
264 

550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

265 
val scan_val = 
26007
3760d3ff4cce
basic scanners: produce symbol list instead of imploded string;
wenzelm
parents:
24630
diff
changeset

266 
scan_tvar >> token TVarSy  
3760d3ff4cce
basic scanners: produce symbol list instead of imploded string;
wenzelm
parents:
24630
diff
changeset

267 
scan_var >> token VarSy  
3760d3ff4cce
basic scanners: produce symbol list instead of imploded string;
wenzelm
parents:
24630
diff
changeset

268 
scan_tid >> token TFreeSy  
28904  269 
scan_float >> token FloatSy  
26007
3760d3ff4cce
basic scanners: produce symbol list instead of imploded string;
wenzelm
parents:
24630
diff
changeset

270 
scan_num >> token NumSy  
27773  271 
$$$ "#" @@@ scan_num >> token XNumSy  
26007
3760d3ff4cce
basic scanners: produce symbol list instead of imploded string;
wenzelm
parents:
24630
diff
changeset

272 
scan_longid >> token LongIdentSy  
3760d3ff4cce
basic scanners: produce symbol list instead of imploded string;
wenzelm
parents:
24630
diff
changeset

273 
scan_xid >> token IdentSy; 
18  274 

27800
df444ddeff56
datatype token: maintain range, tuned representation;
wenzelm
parents:
27773
diff
changeset

275 
val scan_lit = Scan.literal lex >> token Literal; 
550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

276 

4703  277 
val scan_token = 
30573  278 
Symbol_Pos.scan_comment !!! >> token Comment  
27887  279 
Scan.max token_leq scan_lit scan_val  
280 
scan_str >> token StrSy  

40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

281 
Scan.many1 (Symbol.is_blank o Symbol_Pos.symbol) >> token Space; 
18  282 
in 
27773  283 
(case Scan.error 
30573  284 
(Scan.finite Symbol_Pos.stopper (Scan.repeat scan_token)) syms of 
27887  285 
(toks, []) => toks 
30573  286 
 (_, ss) => error ("Inner lexical error at: " ^ Symbol_Pos.content ss ^ 
287 
Position.str_of (#1 (Symbol_Pos.range ss)))) 

18  288 
end; 
289 

290 

291 

292 
(** scan variables **) 

293 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

294 
(* scan_indexname *) 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

295 

670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

296 
local 
18  297 

27773  298 
val scan_vname = 
18  299 
let 
15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

300 
fun nat n [] = n 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

301 
 nat n (c :: cs) = nat (n * 10 + (ord c  ord "0")) cs; 
18  302 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

303 
fun idxname cs ds = (implode (rev cs), nat 0 ds); 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

304 
fun chop_idx [] ds = idxname [] ds 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

305 
 chop_idx (cs as (_ :: "\\<^isub>" :: _)) ds = idxname cs ds 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

306 
 chop_idx (cs as (_ :: "\\<^isup>" :: _)) ds = idxname cs ds 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

307 
 chop_idx (c :: cs) ds = 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

308 
if Symbol.is_digit c then chop_idx cs (c :: ds) 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

309 
else idxname (c :: cs) ds; 
18  310 

40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

311 
val scan = (scan_id >> map Symbol_Pos.symbol)  
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

312 
Scan.optional ($$$ "."  scan_nat >> (nat 0 o map Symbol_Pos.symbol)) ~1; 
18  313 
in 
27773  314 
scan >> 
315 
(fn (cs, ~1) => chop_idx (rev cs) [] 

316 
 (cs, i) => (implode cs, i)) 

18  317 
end; 
318 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

319 
in 
18  320 

27773  321 
val scan_indexname = $$$ "'"  scan_vname >> (fn (x, i) => ("'" ^ x, i))  scan_vname; 
15443
07f78cc82a73
indexname function now parses type variables as well; changed input
berghofe
parents:
14981
diff
changeset

322 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

323 
end; 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

324 

670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

325 

670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

326 
(* indexname *) 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

327 

20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

328 
fun read_indexname s = 
30573  329 
(case Scan.read Symbol_Pos.stopper scan_indexname (Symbol_Pos.explode (s, Position.none)) of 
15531  330 
SOME xi => xi 
20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

331 
 _ => error ("Lexical error in variable name: " ^ quote s)); 
18  332 

333 

4703  334 
(* read_var *) 
18  335 

550
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

336 
fun const c = Const (c, dummyT); 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

337 
fun free x = Free (x, dummyT); 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

338 
fun var xi = Var (xi, dummyT); 
353eea6ec232
replaced id, var, tid, tvar by idT, varT, tidT, tvarT;
wenzelm
parents:
376
diff
changeset

339 

4703  340 
fun read_var str = 
18  341 
let 
342 
val scan = 

30573  343 
$$$ "?"  scan_indexname  Scan.ahead (Scan.one Symbol_Pos.is_eof) >> var  
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

344 
Scan.many (Symbol.is_regular o Symbol_Pos.symbol) >> (free o implode o map Symbol_Pos.symbol); 
30573  345 
in the (Scan.read Symbol_Pos.stopper scan (Symbol_Pos.explode (str, Position.none))) end; 
15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

346 

670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

347 

670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

348 
(* read_variable *) 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

349 

670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

350 
fun read_variable str = 
27773  351 
let val scan = $$$ "?"  scan_indexname  scan_indexname 
30573  352 
in Scan.read Symbol_Pos.stopper scan (Symbol_Pos.explode (str, Position.none)) end; 
4587  353 

354 

35428  355 
(* logical entities *) 
356 

357 
fun marker s = (prefix s, unprefix s); 

358 

359 
val (mark_class, unmark_class) = marker "\\<^class>"; 

360 
val (mark_type, unmark_type) = marker "\\<^type>"; 

361 
val (mark_const, unmark_const) = marker "\\<^const>"; 

362 
val (mark_fixed, unmark_fixed) = marker "\\<^fixed>"; 

5260  363 

35428  364 
fun unmark {case_class, case_type, case_const, case_fixed, case_default} s = 
365 
(case try unmark_class s of 

366 
SOME c => case_class c 

367 
 NONE => 

368 
(case try unmark_type s of 

369 
SOME c => case_type c 

370 
 NONE => 

371 
(case try unmark_const s of 

372 
SOME c => case_const c 

373 
 NONE => 

374 
(case try unmark_fixed s of 

375 
SOME c => case_fixed c 

376 
 NONE => case_default s)))); 

35262
9ea4445d2ccf
slightly more abstract syntax mark/unmark operations;
wenzelm
parents:
30573
diff
changeset

377 

35428  378 
val is_marked = 
379 
unmark {case_class = K true, case_type = K true, case_const = K true, 

380 
case_fixed = K true, case_default = K false}; 

19002  381 

5260  382 

20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

383 
(* read numbers *) 
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

384 

bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

385 
local 
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

386 

bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

387 
fun nat cs = 
40525
14a2e686bdac
eliminated slightly odd pervasive Symbol_Pos.symbol;
wenzelm
parents:
40290
diff
changeset

388 
Option.map (#1 o Library.read_int o map Symbol_Pos.symbol) 
30573  389 
(Scan.read Symbol_Pos.stopper scan_nat cs); 
5860  390 

20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

391 
in 
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

392 

30573  393 
fun read_nat s = nat (Symbol_Pos.explode (s, Position.none)); 
20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

394 

bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

395 
fun read_int s = 
30573  396 
(case Symbol_Pos.explode (s, Position.none) of 
27773  397 
("", _) :: cs => Option.map ~ (nat cs) 
20313
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

398 
 cs => nat cs); 
bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

399 

bf9101cc4385
renamed Syntax.indexname to Syntax.read_indexname;
wenzelm
parents:
20165
diff
changeset

400 
end; 
5860  401 

402 

20096  403 
(* read_xnum: hex/bin/decimal *) 
9326  404 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

405 
local 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

406 

20096  407 
val ten = ord "0" + 10; 
408 
val a = ord "a"; 

409 
val A = ord "A"; 

35428  410 
val _ = a > A orelse raise Fail "Bad ASCII"; 
20096  411 

412 
fun remap_hex c = 

413 
let val x = ord c in 

414 
if x >= a then chr (x  a + ten) 

415 
else if x >= A then chr (x  A + ten) 

416 
else c 

417 
end; 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

418 

21781  419 
fun leading_zeros ["0"] = 0 
420 
 leading_zeros ("0" :: cs) = 1 + leading_zeros cs 

421 
 leading_zeros _ = 0; 

422 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

423 
in 
15965
f422f8283491
Use of IntInf.int instead of int in most numeric simprocs; avoids
paulson
parents:
15570
diff
changeset

424 

9326  425 
fun read_xnum str = 
426 
let 

20096  427 
val (sign, radix, digs) = 
428 
(case Symbol.explode (perhaps (try (unprefix "#")) str) of 

429 
"0" :: "x" :: cs => (1, 16, map remap_hex cs) 

430 
 "0" :: "b" :: cs => (1, 2, cs) 

431 
 "" :: cs => (~1, 10, cs) 

432 
 cs => (1, 10, cs)); 

29156
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

433 
in 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

434 
{radix = radix, 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

435 
leading_zeros = leading_zeros digs, 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

436 
value = sign * #1 (Library.read_radix_int radix digs)} 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

437 
end; 
9326  438 

15991
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

439 
end; 
670f8e4b5a98
added read_variable: optional question mark on input;
wenzelm
parents:
15965
diff
changeset

440 

28904  441 
fun read_float str = 
442 
let 

443 
val (sign, cs) = 

29156
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

444 
(case Symbol.explode str of 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

445 
"" :: cs => (~1, cs) 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

446 
 cs => (1, cs)); 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

447 
val (intpart, fracpart) = 
28904  448 
(case take_prefix Symbol.is_digit cs of 
29156
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

449 
(intpart, "." :: fracpart) => (intpart, fracpart) 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

450 
 _ => raise Fail "read_float"); 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

451 
in 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

452 
{mant = sign * #1 (Library.read_int (intpart @ fracpart)), 
89f76a58a378
renamed terminal category "float" to "float_token", to avoid name
wenzelm
parents:
28904
diff
changeset

453 
exp = length fracpart} 
42046
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

454 
end; 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

455 

6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

456 

6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

457 
(* positions *) 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

458 

6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

459 
fun encode_position pos = 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

460 
op ^ (YXML.output_markup (Position.markup pos Markup.position)); 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

461 

6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

462 
fun decode_position str = 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

463 
(case YXML.parse_body str handle Fail msg => error msg of 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

464 
[XML.Elem ((name, props), [])] => 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

465 
if name = Markup.positionN then SOME (Position.of_properties props) 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

466 
else NONE 
6341c23baf10
added Lexicon.encode_position, Lexicon.decode_position;
wenzelm
parents:
40525
diff
changeset

467 
 _ => NONE); 
28904  468 

0  469 
end; 