3 Author: Sonia Mahjoub and Markus Wenzel, TU Muenchen |
3 Author: Sonia Mahjoub and Markus Wenzel, TU Muenchen |
4 |
4 |
5 Isabelle's main parser (used for terms and typs). |
5 Isabelle's main parser (used for terms and typs). |
6 |
6 |
7 TODO: |
7 TODO: |
|
8 fix ambiguous grammar problem |
8 ~1 --> chain_pri |
9 ~1 --> chain_pri |
9 rename T, NT |
|
10 improve syntax error |
10 improve syntax error |
11 fix ambiguous grammar problem |
11 extend_gram: remove 'roots' arg |
12 *) |
12 *) |
13 |
13 |
14 signature PARSER = |
14 signature PARSER = |
15 sig |
15 sig |
16 structure XGram: XGRAM |
16 structure Lexicon: LEXICON |
17 structure ParseTree: PARSE_TREE |
17 structure SynExt: SYN_EXT |
18 local open XGram ParseTree ParseTree.Lexicon in |
18 local open Lexicon SynExt SynExt.Ast in |
19 type gram |
19 type gram |
20 val empty_gram: gram |
20 val empty_gram: gram |
21 val extend_gram: gram -> string list -> string prod list -> gram |
21 val extend_gram: gram -> string list -> xprod list -> gram |
22 val mk_gram: string list -> string prod list -> gram |
22 val merge_grams: gram -> gram -> gram |
|
23 val pretty_gram: gram -> Pretty.T list |
|
24 datatype parsetree = |
|
25 Node of string * parsetree list | |
|
26 Tip of token |
23 val parse: gram -> string -> token list -> parsetree |
27 val parse: gram -> string -> token list -> parsetree |
24 end |
28 end |
25 end; |
29 end; |
26 |
30 |
27 functor ParserFun(structure Symtab: SYMTAB and XGram: XGRAM |
31 functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON |
28 and ParseTree: PARSE_TREE)(*: PARSER *) = (* FIXME *) |
32 and SynExt: SYN_EXT)(*: PARSER *) = (* FIXME *) |
29 struct |
33 struct |
30 |
34 |
31 structure XGram = XGram; |
35 structure Pretty = SynExt.Ast.Pretty; |
32 structure ParseTree = ParseTree; |
36 structure Lexicon = Lexicon; |
33 structure Lexicon = ParseTree.Lexicon; |
37 structure SynExt = SynExt; |
34 open XGram ParseTree Lexicon; |
38 open Lexicon SynExt; |
35 |
39 |
36 |
40 |
37 (** datatype gram **) |
41 (** datatype gram **) |
38 |
42 |
39 datatype symb = T of token | NT of string * int; |
43 datatype symb = |
|
44 Terminal of token | |
|
45 Nonterminal of string * int; |
40 |
46 |
41 datatype gram = |
47 datatype gram = |
42 Gram of string list * (symb list * string * int) list Symtab.table; |
48 Gram of (string * (symb list * string * int)) list |
43 |
49 * (symb list * string * int) list Symtab.table; |
44 |
50 |
45 (* empty_gram *) |
51 fun mk_gram prods = Gram (prods, Symtab.make_multi prods); |
46 |
52 |
47 val empty_gram = Gram ([], Symtab.null); |
53 |
48 |
54 (* empty, extend, merge grams *) |
49 |
55 |
50 (* extend_gram *) |
56 val empty_gram = mk_gram []; |
51 |
57 |
52 (*prods are stored in reverse order*) |
58 fun extend_gram (gram1 as Gram (prods1, _)) _ xprods2 = |
53 |
|
54 fun extend_gram (Gram (roots, tab)) new_roots xprods = |
|
55 let |
59 let |
56 fun symb_of (Terminal s) = Some (T (Token s)) |
60 fun symb_of (Delim s) = Some (Terminal (Token s)) |
57 | symb_of (Nonterminal (s, p)) = |
61 | symb_of (Argument (s, p)) = |
58 (case predef_term s of |
62 (case predef_term s of |
59 EndToken => Some (NT (s, p)) |
63 None => Some (Nonterminal (s, p)) |
60 | tk => Some (T tk)) |
64 | Some tk => Some (Terminal tk)) |
61 | symb_of _ = None; |
65 | symb_of _ = None; |
62 |
66 |
63 fun prod_of (Prod (s, xsyms, c, p)) = (s, mapfilter symb_of xsyms, c, p); |
67 fun prod_of (XProd (lhs, xsymbs, const, pri)) = |
64 |
68 (lhs, (mapfilter symb_of xsymbs, const, pri)); |
65 fun add_prod (tab, (s, syms, c, p)) = |
69 |
66 (case Symtab.lookup (tab, s) of |
70 val prods2 = distinct (map prod_of xprods2); |
67 None => Symtab.update ((s, [(syms, c, p)]), tab) |
|
68 | Some prods => Symtab.update ((s, (syms, c, p) :: prods), tab)); |
|
69 in |
71 in |
70 Gram (new_roots union roots, |
72 if prods2 subset prods1 then gram1 |
71 Symtab.balance (foldl add_prod (tab, map prod_of xprods))) |
73 else mk_gram (extend_list prods1 prods2) |
72 end; |
74 end; |
73 |
75 |
74 |
76 fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) = |
75 (* mk_gram *) |
77 if prods2 subset prods1 then gram1 |
76 |
78 else if prods1 subset prods2 then gram2 |
77 val mk_gram = extend_gram empty_gram; |
79 else mk_gram (merge_lists prods1 prods2); |
78 |
80 |
79 |
81 |
80 (* get_prods *) |
82 (* pretty_gram *) |
81 |
83 |
82 fun get_prods tab s pred = |
84 fun pretty_gram (Gram (prods, _)) = |
83 let |
85 let |
84 fun rfilter [] ys = ys |
86 fun pretty_name name = [Pretty.str (name ^ " =")]; |
85 | rfilter (x :: xs) ys = rfilter xs (if pred x then x :: ys else ys); |
87 |
|
88 fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s) |
|
89 | pretty_symb (Terminal tok) = Pretty.str (str_of_token tok) |
|
90 | pretty_symb (Nonterminal (s, p)) = |
|
91 Pretty.str (s ^ "[" ^ string_of_int p ^ "]"); |
|
92 |
|
93 fun pretty_const "" = [] |
|
94 | pretty_const c = [Pretty.str ("=> " ^ quote c)]; |
|
95 |
|
96 fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")]; |
|
97 |
|
98 fun pretty_prod (name, (symbs, const, pri)) = |
|
99 Pretty.block (Pretty.breaks (pretty_name name @ |
|
100 map pretty_symb symbs @ pretty_const const @ pretty_pri pri)); |
86 in |
101 in |
87 (case Symtab.lookup (tab, s) of |
102 map pretty_prod prods |
88 Some prods => rfilter prods [] |
|
89 | None => []) |
|
90 end; |
103 end; |
91 |
104 |
92 |
105 |
93 |
106 |
94 (** parse **) |
107 (** parse **) |
|
108 |
|
109 datatype parsetree = |
|
110 Node of string * parsetree list | |
|
111 Tip of token; |
95 |
112 |
96 type state = |
113 type state = |
97 string * int |
114 string * int |
98 * parsetree list (*before point*) |
115 * parsetree list (*before point*) |
99 * symb list (*after point*) |
116 * symb list (*after point*) |
101 |
118 |
102 type earleystate = state list Array.array; |
119 type earleystate = state list Array.array; |
103 |
120 |
104 |
121 |
105 |
122 |
|
123 fun get_prods tab lhs pred = |
|
124 filter pred (Symtab.lookup_multi (tab, lhs)); |
|
125 |
106 fun getProductions tab A i = |
126 fun getProductions tab A i = |
107 get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1); |
127 get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1); |
108 |
128 |
109 fun getProductions' tab A i k = |
129 fun getProductions' tab A i k = |
110 get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1); |
130 get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1); |
111 |
131 |
112 |
132 |
113 |
133 |
114 fun mkState i jj A ([NT(B,~1)],id,~1) = |
134 fun mkState i jj A ([Nonterminal (B, ~1)], id, ~1) = |
115 (A,max_pri,[],[NT (B,jj)],id,i) |
135 (A, max_pri, [], [Nonterminal (B, jj)], id, i) |
116 | mkState i jj A (ss,id,j) = (A,j,[],ss,id,i) ; |
136 | mkState i jj A (ss, id, j) = (A, j, [], ss, id, i); |
117 |
137 |
118 |
138 |
119 |
139 |
120 fun conc (t,(k:int)) [] = (None, [(t,k)]) |
140 fun conc (t, k:int) [] = (None, [(t, k)]) |
121 | conc (t,k) ((t',k')::ts) = |
141 | conc (t, k) ((t', k') :: ts) = |
122 if (t = t') |
142 if t = t' then |
123 then (Some k', ( if k' >= k |
143 (Some k', if k' >= k then (t', k') :: ts else (t, k) :: ts) |
124 then (t',k')::ts |
|
125 else (t,k)::ts ) |
|
126 ) |
|
127 else let val (n, ts') = conc (t,k) ts |
|
128 in (n, (t',k')::ts') |
|
129 end; |
|
130 |
|
131 |
|
132 fun update_tree ((B,(i,ts))::used) (A,t) = |
|
133 if (A = B) |
|
134 then |
|
135 let val (n,ts') = conc t ts |
|
136 in ((A,(i,ts'))::used, n) |
|
137 end |
|
138 else |
144 else |
139 let val (used', n) = update_tree used (A,t) |
145 let val (n, ts') = conc (t, k) ts |
140 in ((B,(i,ts)) :: used', n) |
146 in (n, (t', k') :: ts') end; |
141 end; |
147 |
142 |
148 |
143 |
149 fun update_tree ((B, (i, ts)) :: used) (A, t) = |
144 |
150 if A = B then |
145 fun update_index ((B,(i,ts))::used) (A,j) = |
151 let val (n, ts') = conc t ts |
146 if (A = B) |
152 in ((A, (i, ts')) :: used, n) end |
147 then (A,(j,ts)) :: used |
153 else |
148 else (B,(i,ts)) :: (update_index used (A,j)); |
154 let val (used', n) = update_tree used (A, t) |
|
155 in ((B, (i, ts)) :: used', n) end; |
|
156 |
|
157 |
|
158 fun update_index ((B, (i, ts)) :: used) (A, j) = |
|
159 if A = B then (A, (j, ts)) :: used |
|
160 else (B, (i, ts)) :: (update_index used (A, j)); |
149 |
161 |
150 |
162 |
151 |
163 |
152 fun getS A i Si = |
164 fun getS A i Si = |
153 filter (fn (_,_,_,(NT(B,j))::_,_,_) |
165 filter |
154 => (A=B andalso j<=i) |
166 (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= i |
155 | _ => false |
167 | _ => false) Si; |
156 ) Si; |
|
157 |
|
158 |
|
159 |
168 |
160 fun getS' A k n Si = |
169 fun getS' A k n Si = |
161 filter (fn (_,_,_,(NT(B,j))::_,_,_) => (A=B andalso |
170 filter |
162 j<=k andalso |
171 (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k andalso j > n |
163 j>n ) |
172 | _ => false) Si; |
164 | _ => false |
|
165 ) Si; |
|
166 |
|
167 |
|
168 |
173 |
169 fun getStates Estate i ii A k = |
174 fun getStates Estate i ii A k = |
170 filter (fn (_,_,_,(NT(B,j))::_,_,_) |
175 filter |
171 => (A=B andalso j<=k) |
176 (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k |
172 | _ => false |
177 | _ => false) |
173 ) |
178 (Array.sub (Estate, ii)); |
174 (Array.sub (Estate, ii)) |
|
175 ; |
|
176 |
|
177 |
179 |
178 |
180 |
179 (* MMW *)(* FIXME *) |
181 (* MMW *)(* FIXME *) |
180 fun movedot_term (A,j,ts,T a::sa,id,i) c = |
182 fun movedot_term (A, j, ts, Terminal a :: sa, id, i) c = |
181 if (valued_token c) |
183 if valued_token c then |
182 then (A,j,(ts@[Tip c]),sa,id,i) |
184 (A, j, (ts @ [Tip c]), sa, id, i) |
183 else (A,j,ts,sa,id,i) ; |
185 else (A, j, ts, sa, id, i); |
184 |
186 |
185 |
187 fun movedot_nonterm ts (A, j, tss, Nonterminal (B, k) :: sa, id, i) = |
186 |
188 (A, j, tss @ ts, sa, id, i); |
187 fun movedot_nonterm ts (A,j,tss,NT(B,k) ::sa,id,i) = |
189 |
188 (A,j,tss@ts,sa,id,i) ; |
190 fun movedot_lambda _ [] = [] |
189 |
191 | movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ((t, ki) :: ts) = |
190 |
192 if k <= ki then |
191 |
193 (B, j, tss @ t, sa, id, i) :: |
192 fun movedot_lambda _ [] = [] |
194 movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts |
193 | movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ((t,ki)::ts) = |
195 else movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts; |
194 if k <= ki |
|
195 then (B,j,tss@t,sa,id,i) :: |
|
196 (movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts) |
|
197 else movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts |
|
198 ; |
|
199 |
|
200 |
196 |
201 |
197 |
202 |
198 |
203 fun PROCESSS Estate grammar i c states = |
199 fun PROCESSS Estate grammar i c states = |
204 |
|
205 let |
200 let |
206 fun processS used [] (Si,Sii) = (Si,Sii) |
201 fun processS used [] (Si, Sii) = (Si, Sii) |
207 | processS used (S::States) (Si,Sii) = |
202 | processS used (S :: States) (Si, Sii) = |
208 ( case S of |
203 (case S of |
209 |
204 (_, _, _, Nonterminal (A, j) :: _, _, _) => |
210 (_,_,_,(NT (A,j))::_,_,_) => |
|
211 let |
205 let |
212 val (used_neu, States_neu) = |
206 val (used_neu, States_neu) = |
213 ( case assoc (used, A) of |
207 (case assoc (used, A) of |
214 Some (k,l) => (* A gehoert zu used *) |
208 Some (k, l) => (*A gehoert zu used*) |
215 |
209 if k <= j (*Prioritaet wurde behandelt*) |
216 if (k <= j) (* Prioritaet wurde |
210 then (used, movedot_lambda S l) |
217 behandelt *) |
211 else (*Prioritaet wurde noch nicht behandelt*) |
218 then |
212 let |
219 (used, movedot_lambda S l) |
213 val L = getProductions' grammar A j k; |
220 |
214 val States' = map (mkState i j A) L; |
221 else (* Prioritaet wurde noch nicht |
215 in |
222 behandelt *) |
216 (update_index used (A, j), States' @ movedot_lambda S l) |
223 let val L = |
217 end |
224 getProductions' grammar A j k |
218 |
225 val States' = map (mkState i j A) L |
219 | None => (*A gehoert nicht zu used*) |
226 in |
220 let |
227 (update_index used (A,j), |
221 val L = getProductions grammar A j; |
228 States'@ movedot_lambda S l |
222 val States' = map (mkState i j A) L; |
229 ) |
223 in |
230 end |
224 ((A, (j, [])) :: used, States') |
231 |
225 end) |
232 | None => (* A gehoert nicht zu used *) |
226 in |
233 let val L = getProductions grammar A j |
227 processS used_neu (States @ States_neu) (S :: Si, Sii) |
234 val States' = map (mkState i j A) L |
228 end |
|
229 | (_, _, _, Terminal a :: _, _, _) => |
|
230 processS used States |
|
231 (S :: Si, |
|
232 if matching_tokens (a, c) then movedot_term S c :: Sii else Sii) |
|
233 (* MMW *)(* FIXME *) |
|
234 |
|
235 | (A, k, ts, [], id, j) => |
|
236 let |
|
237 val tt = if id = "" then ts else [Node (id, ts)] |
|
238 in |
|
239 if j = i then |
|
240 let |
|
241 val (used', O) = update_tree used (A, (tt, k)); |
|
242 in |
|
243 (case O of |
|
244 None => |
|
245 let |
|
246 val Slist = getS A k Si; |
|
247 val States' = map (movedot_nonterm tt) Slist; |
|
248 in |
|
249 processS used' (States @ States') (S :: Si, Sii) |
|
250 end |
|
251 | Some n => |
|
252 if n >= k then |
|
253 processS used' States (S :: Si, Sii) |
|
254 else |
|
255 let |
|
256 val Slist = getS' A k n Si; |
|
257 val States' = map (movedot_nonterm tt) Slist; |
235 in |
258 in |
236 ((A,(j,[]))::used, States') |
259 processS used' (States @ States') (S :: Si, Sii) |
237 end |
260 end) |
238 ) |
261 end |
239 in |
262 else |
240 processS used_neu (States @ States_neu) (S::Si,Sii) |
263 processS used |
241 end |
264 (States @ map (movedot_nonterm tt) (getStates Estate i j A k)) |
242 |
265 (S :: Si, Sii) |
243 | (_,_,_,(T a)::_,_,_) => |
266 end) |
244 processS used States |
|
245 (S::Si, if (matching_tokens (a, c)) |
|
246 then (movedot_term S c)::Sii (* MMW *)(* FIXME *) |
|
247 else Sii |
|
248 ) |
|
249 |
|
250 |
|
251 | (A,k,ts,[],id,j) => |
|
252 let val tt = if id = "" |
|
253 then ts |
|
254 else [Node(id,ts)] |
|
255 in |
|
256 if (j = i) |
|
257 then |
|
258 let val (used',O) = update_tree used (A,(tt,k)) |
|
259 in ( case O of |
|
260 None => |
|
261 let val Slist = getS A k Si |
|
262 val States' = |
|
263 map (movedot_nonterm tt ) Slist |
|
264 in processS used' |
|
265 (States @ States') (S::Si,Sii) |
|
266 end |
|
267 | Some n => |
|
268 if (n >= k) |
|
269 then |
|
270 processS used' States (S::Si,Sii) |
|
271 else |
|
272 let val Slist = getS' A k n Si |
|
273 val States' = |
|
274 map (movedot_nonterm tt ) Slist |
|
275 in |
|
276 processS used' |
|
277 (States @ States') (S::Si,Sii) |
|
278 end |
|
279 ) |
|
280 end |
|
281 |
|
282 else |
|
283 processS used (States @ |
|
284 map (movedot_nonterm tt) |
|
285 (getStates Estate i j A k)) |
|
286 (S::Si,Sii) |
|
287 end |
|
288 ) |
|
289 in |
267 in |
290 processS [] states ([],[]) |
268 processS [] states ([], []) |
291 end; |
269 end; |
292 |
270 |
293 |
271 |
294 |
272 |
295 fun syntax_error toks = |
273 fun syntax_error toks = |
296 error ("Syntax error at: " ^ space_implode " " (map str_of_token toks)); |
274 error ("Syntax error at: " ^ space_implode " " (map str_of_token toks)); |
297 |
275 |
298 fun Produce grammar stateset i indata = |
276 fun produce grammar stateset i indata = |
299 case (Array.sub (stateset,i)) of |
277 (case Array.sub (stateset, i) of |
300 [] => syntax_error indata (* MMW *)(* FIXME *) |
278 [] => syntax_error indata (* MMW *)(* FIXME *) |
301 | s => |
279 | s => |
302 (case indata of |
280 (case indata of |
303 [] => Array.sub (stateset,i) |
281 [] => Array.sub (stateset, i) |
304 | (c::cs) => |
282 | c :: cs => |
305 let val (si,sii) = |
283 let |
306 PROCESSS stateset grammar i c s |
284 val (si, sii) = PROCESSS stateset grammar i c s; |
307 in |
285 in |
308 Array.update (stateset,i,si); |
286 Array.update (stateset, i, si); |
309 Array.update (stateset,i+1,sii); |
287 Array.update (stateset, i + 1, sii); |
310 Produce grammar stateset (i+1) cs |
288 produce grammar stateset (i + 1) cs |
311 end |
289 end)); |
312 ); |
290 |
313 |
291 |
|
292 (*(* FIXME new *) |
|
293 val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None); |
|
294 *) |
314 |
295 |
315 fun get_trees [] = [] |
296 fun get_trees [] = [] |
316 | get_trees ((_,_,pt_lst,_,_,_) :: stateset) = |
297 | get_trees ((_, _, pt_lst, _, _, _) :: stateset) = |
317 let val l = get_trees stateset |
298 let val l = get_trees stateset |
318 in case pt_lst of |
299 in case pt_lst of |
319 [ptree] => ptree :: l |
300 [ptree] => ptree :: l |
320 | _ => l |
301 | _ => l |
321 end; |
302 end; |
322 |
|
323 |
|
324 |
303 |
325 fun earley grammar startsymbol indata = |
304 fun earley grammar startsymbol indata = |
326 let val S0 = |
305 let |
327 [("S'",0,[],[NT (startsymbol,0), T EndToken],"",0)] |
306 val S0 = [("S'", 0, [], [Nonterminal (startsymbol, 0), Terminal EndToken], "", 0)]; |
328 val s = length indata + 1 (* MMW *)(* FIXME was .. + 2 *) |
307 val s = length indata + 1; (* MMW *)(* FIXME was .. + 2 *) |
329 val Estate = Array.array (s, []) |
308 val Estate = Array.array (s, []); |
330 in Array.update (Estate,0,S0); |
309 in |
331 let val l = Produce grammar Estate 0 indata (* MMW *)(* FIXME was .. @ [EndToken] *) |
310 Array.update (Estate, 0, S0); |
332 val p_trees = get_trees l |
311 let |
333 in p_trees |
312 val l = produce grammar Estate 0 indata; (* MMW *)(* FIXME was .. @ [EndToken] *) |
334 end |
313 val p_trees = get_trees l; |
335 end; |
314 in p_trees end |
|
315 end; |
336 |
316 |
337 |
317 |
338 (* FIXME demo *) |
318 (* FIXME demo *) |
339 fun parse (Gram (roots, prod_tab)) root toks = |
319 fun parse (Gram (_, prod_tab)) start toks = |
340 if root mem roots then |
320 (case earley prod_tab start toks of |
341 (case earley prod_tab root toks of |
321 [] => sys_error "parse: no parse trees" |
342 [] => error "parse: no parse trees" |
322 | pt :: _ => pt); |
343 | pt :: _ => pt) |
|
344 else error ("Unparsable category: " ^ root); |
|
345 |
323 |
346 |
324 |
347 end; |
325 end; |
348 |
326 |