18
|
1 |
(* Title: Pure/Syntax/parser.ML
|
46
|
2 |
ID: $Id$
|
18
|
3 |
Author: Sonia Mahjoub and Markus Wenzel, TU Muenchen
|
|
4 |
|
|
5 |
Isabelle's main parser (used for terms and typs).
|
|
6 |
|
|
7 |
TODO:
|
|
8 |
~1 --> chain_pri
|
|
9 |
rename T, NT
|
|
10 |
improve syntax error
|
|
11 |
fix ambiguous grammar problem
|
|
12 |
*)
|
|
13 |
|
|
14 |
signature PARSER =
|
|
15 |
sig
|
|
16 |
structure XGram: XGRAM
|
|
17 |
structure ParseTree: PARSE_TREE
|
|
18 |
local open XGram ParseTree ParseTree.Lexicon in
|
|
19 |
type gram
|
|
20 |
val empty_gram: gram
|
|
21 |
val extend_gram: gram -> string list -> string prod list -> gram
|
|
22 |
val mk_gram: string list -> string prod list -> gram
|
|
23 |
val parse: gram -> string -> token list -> parsetree
|
|
24 |
end
|
|
25 |
end;
|
|
26 |
|
|
27 |
functor ParserFun(structure Symtab: SYMTAB and XGram: XGRAM
|
|
28 |
and ParseTree: PARSE_TREE)(*: PARSER *) = (* FIXME *)
|
|
29 |
struct
|
|
30 |
|
|
31 |
structure XGram = XGram;
|
|
32 |
structure ParseTree = ParseTree;
|
|
33 |
structure Lexicon = ParseTree.Lexicon;
|
|
34 |
open XGram ParseTree Lexicon;
|
|
35 |
|
|
36 |
|
|
37 |
(** datatype gram **)
|
|
38 |
|
|
39 |
datatype symb = T of token | NT of string * int;
|
|
40 |
|
|
41 |
datatype gram =
|
|
42 |
Gram of string list * (symb list * string * int) list Symtab.table;
|
|
43 |
|
|
44 |
|
|
45 |
(* empty_gram *)
|
|
46 |
|
|
47 |
val empty_gram = Gram ([], Symtab.null);
|
|
48 |
|
|
49 |
|
|
50 |
(* extend_gram *)
|
|
51 |
|
|
52 |
(*prods are stored in reverse order*)
|
|
53 |
|
|
54 |
fun extend_gram (Gram (roots, tab)) new_roots xprods =
|
|
55 |
let
|
|
56 |
fun symb_of (Terminal s) = Some (T (Token s))
|
|
57 |
| symb_of (Nonterminal (s, p)) =
|
|
58 |
(case predef_term s of
|
|
59 |
EndToken => Some (NT (s, p))
|
|
60 |
| tk => Some (T tk))
|
|
61 |
| symb_of _ = None;
|
|
62 |
|
|
63 |
fun prod_of (Prod (s, xsyms, c, p)) = (s, mapfilter symb_of xsyms, c, p);
|
|
64 |
|
|
65 |
fun add_prod (tab, (s, syms, c, p)) =
|
|
66 |
(case Symtab.lookup (tab, s) of
|
|
67 |
None => Symtab.update ((s, [(syms, c, p)]), tab)
|
|
68 |
| Some prods => Symtab.update ((s, (syms, c, p) :: prods), tab));
|
|
69 |
in
|
|
70 |
Gram (new_roots union roots,
|
|
71 |
Symtab.balance (foldl add_prod (tab, map prod_of xprods)))
|
|
72 |
end;
|
|
73 |
|
|
74 |
|
|
75 |
(* mk_gram *)
|
|
76 |
|
|
77 |
val mk_gram = extend_gram empty_gram;
|
|
78 |
|
|
79 |
|
|
80 |
(* get_prods *)
|
|
81 |
|
|
82 |
fun get_prods tab s pred =
|
|
83 |
let
|
|
84 |
fun rfilter [] ys = ys
|
|
85 |
| rfilter (x :: xs) ys = rfilter xs (if pred x then x :: ys else ys);
|
|
86 |
in
|
|
87 |
(case Symtab.lookup (tab, s) of
|
|
88 |
Some prods => rfilter prods []
|
|
89 |
| None => [])
|
|
90 |
end;
|
|
91 |
|
|
92 |
|
|
93 |
|
|
94 |
(** parse **)
|
|
95 |
|
|
96 |
type state =
|
|
97 |
string * int
|
|
98 |
* parsetree list (*before point*)
|
|
99 |
* symb list (*after point*)
|
|
100 |
* string * int;
|
|
101 |
|
|
102 |
type earleystate = state list Array.array;
|
|
103 |
|
|
104 |
|
|
105 |
|
|
106 |
fun getProductions tab A i =
|
|
107 |
get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1);
|
|
108 |
|
|
109 |
fun getProductions' tab A i k =
|
|
110 |
get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1);
|
|
111 |
|
|
112 |
|
|
113 |
|
|
114 |
fun mkState i jj A ([NT(B,~1)],id,~1) =
|
|
115 |
(A,max_pri,[],[NT (B,jj)],id,i)
|
|
116 |
| mkState i jj A (ss,id,j) = (A,j,[],ss,id,i) ;
|
|
117 |
|
|
118 |
|
|
119 |
|
|
120 |
fun conc (t,(k:int)) [] = (None, [(t,k)])
|
|
121 |
| conc (t,k) ((t',k')::ts) =
|
|
122 |
if (t = t')
|
|
123 |
then (Some k', ( if k' >= k
|
|
124 |
then (t',k')::ts
|
|
125 |
else (t,k)::ts )
|
|
126 |
)
|
|
127 |
else let val (n, ts') = conc (t,k) ts
|
|
128 |
in (n, (t',k')::ts')
|
|
129 |
end;
|
|
130 |
|
|
131 |
|
|
132 |
fun update_tree ((B,(i,ts))::used) (A,t) =
|
|
133 |
if (A = B)
|
|
134 |
then
|
|
135 |
let val (n,ts') = conc t ts
|
|
136 |
in ((A,(i,ts'))::used, n)
|
|
137 |
end
|
|
138 |
else
|
|
139 |
let val (used', n) = update_tree used (A,t)
|
|
140 |
in ((B,(i,ts)) :: used', n)
|
|
141 |
end;
|
|
142 |
|
|
143 |
|
|
144 |
|
|
145 |
fun update_index ((B,(i,ts))::used) (A,j) =
|
|
146 |
if (A = B)
|
|
147 |
then (A,(j,ts)) :: used
|
|
148 |
else (B,(i,ts)) :: (update_index used (A,j));
|
|
149 |
|
|
150 |
|
|
151 |
|
|
152 |
fun getS A i Si =
|
|
153 |
filter (fn (_,_,_,(NT(B,j))::_,_,_)
|
|
154 |
=> (A=B andalso j<=i)
|
|
155 |
| _ => false
|
|
156 |
) Si;
|
|
157 |
|
|
158 |
|
|
159 |
|
|
160 |
fun getS' A k n Si =
|
|
161 |
filter (fn (_,_,_,(NT(B,j))::_,_,_) => (A=B andalso
|
|
162 |
j<=k andalso
|
|
163 |
j>n )
|
|
164 |
| _ => false
|
|
165 |
) Si;
|
|
166 |
|
|
167 |
|
|
168 |
|
|
169 |
fun getStates Estate i ii A k =
|
|
170 |
filter (fn (_,_,_,(NT(B,j))::_,_,_)
|
|
171 |
=> (A=B andalso j<=k)
|
|
172 |
| _ => false
|
|
173 |
)
|
|
174 |
(Array.sub (Estate, ii))
|
|
175 |
;
|
|
176 |
|
|
177 |
|
|
178 |
|
|
179 |
(* MMW *)(* FIXME *)
|
|
180 |
fun movedot_term (A,j,ts,T a::sa,id,i) c =
|
|
181 |
if (valued_token c)
|
|
182 |
then (A,j,(ts@[Tip c]),sa,id,i)
|
|
183 |
else (A,j,ts,sa,id,i) ;
|
|
184 |
|
|
185 |
|
|
186 |
|
|
187 |
fun movedot_nonterm ts (A,j,tss,NT(B,k) ::sa,id,i) =
|
|
188 |
(A,j,tss@ts,sa,id,i) ;
|
|
189 |
|
|
190 |
|
|
191 |
|
|
192 |
fun movedot_lambda _ [] = []
|
|
193 |
| movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ((t,ki)::ts) =
|
|
194 |
if k <= ki
|
|
195 |
then (B,j,tss@t,sa,id,i) ::
|
|
196 |
(movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts)
|
|
197 |
else movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts
|
|
198 |
;
|
|
199 |
|
|
200 |
|
|
201 |
|
|
202 |
|
|
203 |
fun PROCESSS Estate grammar i c states =
|
|
204 |
|
|
205 |
let
|
|
206 |
fun processS used [] (Si,Sii) = (Si,Sii)
|
|
207 |
| processS used (S::States) (Si,Sii) =
|
|
208 |
( case S of
|
|
209 |
|
|
210 |
(_,_,_,(NT (A,j))::_,_,_) =>
|
|
211 |
let
|
|
212 |
val (used_neu, States_neu) =
|
|
213 |
( case assoc (used, A) of
|
|
214 |
Some (k,l) => (* A gehoert zu used *)
|
|
215 |
|
|
216 |
if (k <= j) (* Prioritaet wurde
|
|
217 |
behandelt *)
|
|
218 |
then
|
|
219 |
(used, movedot_lambda S l)
|
|
220 |
|
|
221 |
else (* Prioritaet wurde noch nicht
|
|
222 |
behandelt *)
|
|
223 |
let val L =
|
|
224 |
getProductions' grammar A j k
|
|
225 |
val States' = map (mkState i j A) L
|
|
226 |
in
|
|
227 |
(update_index used (A,j),
|
|
228 |
States'@ movedot_lambda S l
|
|
229 |
)
|
|
230 |
end
|
|
231 |
|
|
232 |
| None => (* A gehoert nicht zu used *)
|
|
233 |
let val L = getProductions grammar A j
|
|
234 |
val States' = map (mkState i j A) L
|
|
235 |
in
|
|
236 |
((A,(j,[]))::used, States')
|
|
237 |
end
|
|
238 |
)
|
|
239 |
in
|
|
240 |
processS used_neu (States @ States_neu) (S::Si,Sii)
|
|
241 |
end
|
|
242 |
|
|
243 |
| (_,_,_,(T a)::_,_,_) =>
|
|
244 |
processS used States
|
|
245 |
(S::Si, if (matching_tokens (a, c))
|
|
246 |
then (movedot_term S c)::Sii (* MMW *)(* FIXME *)
|
|
247 |
else Sii
|
|
248 |
)
|
|
249 |
|
|
250 |
|
|
251 |
| (A,k,ts,[],id,j) =>
|
|
252 |
let val tt = if id = ""
|
|
253 |
then ts
|
|
254 |
else [Node(id,ts)]
|
|
255 |
in
|
|
256 |
if (j = i)
|
|
257 |
then
|
|
258 |
let val (used',O) = update_tree used (A,(tt,k))
|
|
259 |
in ( case O of
|
|
260 |
None =>
|
|
261 |
let val Slist = getS A k Si
|
|
262 |
val States' =
|
|
263 |
map (movedot_nonterm tt ) Slist
|
|
264 |
in processS used'
|
|
265 |
(States @ States') (S::Si,Sii)
|
|
266 |
end
|
|
267 |
| Some n =>
|
|
268 |
if (n >= k)
|
|
269 |
then
|
|
270 |
processS used' States (S::Si,Sii)
|
|
271 |
else
|
|
272 |
let val Slist = getS' A k n Si
|
|
273 |
val States' =
|
|
274 |
map (movedot_nonterm tt ) Slist
|
|
275 |
in
|
|
276 |
processS used'
|
|
277 |
(States @ States') (S::Si,Sii)
|
|
278 |
end
|
|
279 |
)
|
|
280 |
end
|
|
281 |
|
|
282 |
else
|
|
283 |
processS used (States @
|
|
284 |
map (movedot_nonterm tt)
|
|
285 |
(getStates Estate i j A k))
|
|
286 |
(S::Si,Sii)
|
|
287 |
end
|
|
288 |
)
|
|
289 |
in
|
|
290 |
processS [] states ([],[])
|
|
291 |
end;
|
|
292 |
|
|
293 |
|
|
294 |
|
|
295 |
fun syntax_error toks =
|
|
296 |
error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
|
|
297 |
|
|
298 |
fun Produce grammar stateset i indata =
|
|
299 |
case (Array.sub (stateset,i)) of
|
|
300 |
[] => syntax_error indata (* MMW *)(* FIXME *)
|
|
301 |
| s =>
|
|
302 |
(case indata of
|
|
303 |
[] => Array.sub (stateset,i)
|
|
304 |
| (c::cs) =>
|
|
305 |
let val (si,sii) =
|
|
306 |
PROCESSS stateset grammar i c s
|
|
307 |
in
|
|
308 |
Array.update (stateset,i,si);
|
|
309 |
Array.update (stateset,i+1,sii);
|
|
310 |
Produce grammar stateset (i+1) cs
|
|
311 |
end
|
|
312 |
);
|
|
313 |
|
|
314 |
|
|
315 |
fun get_trees [] = []
|
|
316 |
| get_trees ((_,_,pt_lst,_,_,_) :: stateset) =
|
|
317 |
let val l = get_trees stateset
|
|
318 |
in case pt_lst of
|
|
319 |
[ptree] => ptree :: l
|
|
320 |
| _ => l
|
|
321 |
end;
|
|
322 |
|
|
323 |
|
|
324 |
|
|
325 |
fun earley grammar startsymbol indata =
|
|
326 |
let val S0 =
|
|
327 |
[("S'",0,[],[NT (startsymbol,0), T EndToken],"",0)]
|
|
328 |
val s = length indata + 1 (* MMW *)(* FIXME was .. + 2 *)
|
|
329 |
val Estate = Array.array (s, [])
|
|
330 |
in Array.update (Estate,0,S0);
|
|
331 |
let val l = Produce grammar Estate 0 indata (* MMW *)(* FIXME was .. @ [EndToken] *)
|
|
332 |
val p_trees = get_trees l
|
|
333 |
in p_trees
|
|
334 |
end
|
|
335 |
end;
|
|
336 |
|
|
337 |
|
|
338 |
(* FIXME demo *)
|
|
339 |
fun parse (Gram (roots, prod_tab)) root toks =
|
|
340 |
if root mem roots then
|
|
341 |
(case earley prod_tab root toks of
|
|
342 |
[] => error "parse: no parse trees"
|
|
343 |
| pt :: _ => pt)
|
|
344 |
else error ("Unparsable category: " ^ root);
|
|
345 |
|
|
346 |
|
|
347 |
end;
|
|
348 |
|