0
|
1 |
(* Title: Pure/Syntax/earley0A
|
|
2 |
ID: $Id$
|
|
3 |
Author: Tobias Nipkow
|
|
4 |
|
|
5 |
Changes:
|
|
6 |
PARSER: renamed Prod to prod
|
|
7 |
renamed mk_early_gram to mk_earley_gram
|
|
8 |
*)
|
|
9 |
|
|
10 |
signature PARSER =
|
|
11 |
sig
|
|
12 |
structure XGram: XGRAM
|
|
13 |
structure ParseTree: PARSE_TREE
|
|
14 |
local open XGram ParseTree ParseTree.Lexicon in
|
|
15 |
type Gram
|
|
16 |
val compile_xgram: string list * Token prod list -> Gram
|
|
17 |
val parse: Gram * string * Token list -> ParseTree
|
|
18 |
val parsable: Gram * string -> bool
|
|
19 |
exception SYNTAX_ERR of Token list
|
|
20 |
val print_gram: Gram * Lexicon -> unit
|
|
21 |
end
|
|
22 |
end;
|
|
23 |
|
|
24 |
functor EarleyFun(structure XGram:XGRAM and ParseTree:PARSE_TREE): PARSER =
|
|
25 |
struct
|
|
26 |
|
|
27 |
structure XGram = XGram;
|
|
28 |
structure ParseTree = ParseTree;
|
|
29 |
|
|
30 |
(* Linked lists: *)
|
|
31 |
infix 5 &;
|
|
32 |
datatype 'a LList = nilL | op & of 'a * ('a LListR)
|
|
33 |
withtype 'a LListR = 'a LList ref;
|
|
34 |
|
|
35 |
(* Apply proc to all elements in a linked list *)
|
|
36 |
fun seqll (proc: '_a -> unit) : ('_a LListR -> unit) =
|
|
37 |
let fun seq (ref nilL) = () |
|
|
38 |
seq (ref((a:'_a)&l)) = (proc a; seq l)
|
|
39 |
in seq end;
|
|
40 |
|
|
41 |
fun llist_to_list (ref nilL) = [] |
|
|
42 |
llist_to_list (ref(a&ll)) = a::(llist_to_list ll);
|
|
43 |
|
|
44 |
val len = length;
|
|
45 |
|
|
46 |
local open Array XGram ParseTree ParseTree.Lexicon in
|
|
47 |
nonfix sub;
|
|
48 |
|
|
49 |
fun forA(p: int -> unit, a: 'a array) : unit =
|
|
50 |
let val ub = length a
|
|
51 |
fun step(i) = if i=ub then () else (p(i); step(i+1));
|
|
52 |
in step 0 end;
|
|
53 |
|
|
54 |
fun itA(a0:'a, b: 'b array)(f:'a * 'b -> 'a) : 'a =
|
|
55 |
let val ub = length b
|
|
56 |
fun acc(a,i) = if i=ub then a else acc(f(a,sub(b,i)),i+1)
|
|
57 |
in acc(a0,0) end;
|
|
58 |
|
|
59 |
(*
|
|
60 |
Gram: name of nt -> number, nt number -> productions array,
|
|
61 |
nt number -> list of nt's reachable via copy ops
|
|
62 |
*)
|
|
63 |
|
|
64 |
datatype Symbol = T of Token | NT of int * int
|
|
65 |
and Op = Op of OpSyn * string * int
|
|
66 |
withtype OpSyn = Symbol array
|
|
67 |
and OpListA = Op array * int TokenMap
|
|
68 |
and FastAcc = int TokenMap;
|
|
69 |
|
|
70 |
type Gram = int Symtab.table * OpListA array * int list array;
|
|
71 |
|
|
72 |
fun non_term(Nonterminal(s,_)) = if predef_term(s)=end_token then [s] else []
|
|
73 |
| non_term(_) = [];
|
|
74 |
|
|
75 |
fun non_terms(Prod(_,symbs,_,_)) = flat(map non_term symbs);
|
|
76 |
|
|
77 |
(* mk_pre_grammar converts a list of productions in external format into an
|
|
78 |
internal Gram object. *)
|
|
79 |
val dummyTM:FastAcc = mkTokenMap[];
|
|
80 |
|
|
81 |
fun mk_pre_grammar(prods): Gram =
|
|
82 |
let fun same_res(Prod(t1,_,_,_), Prod(t2,_,_,_)) = t1=t2;
|
|
83 |
val partitioned0 = partition_eq same_res prods;
|
|
84 |
val nts0 = map (fn Prod(ty,_,_,_)::_ => ty) partitioned0;
|
|
85 |
val nts' = distinct(flat(map non_terms prods)) \\ nts0;
|
|
86 |
val nts = nts' @ nts0;
|
|
87 |
val partitioned = (replicate (len nts') []) @ partitioned0;
|
|
88 |
val ntis = nts ~~ (0 upto (len(nts)-1));
|
|
89 |
val tab = foldr Symtab.update (ntis,Symtab.null);
|
|
90 |
|
|
91 |
fun nt_or_vt(s,p) =
|
|
92 |
(case predef_term(s) of
|
|
93 |
end_token => let val Some(i) = Symtab.lookup(tab,s) in NT(i,p) end
|
|
94 |
| tk => T(tk));
|
|
95 |
|
|
96 |
fun mksyn(Terminal(t)) = [T(t)]
|
|
97 |
| mksyn(Nonterminal(t)) = [nt_or_vt t]
|
|
98 |
| mksyn(_) = [];
|
|
99 |
|
|
100 |
fun prod2op(Prod(nt,sy,opn,p)) =
|
|
101 |
let val syA = arrayoflist(flat(map mksyn sy)) in Op(syA,opn,p) end;
|
|
102 |
|
|
103 |
fun mkops prods = (arrayoflist(map prod2op prods),dummyTM);
|
|
104 |
|
|
105 |
val opLA = arrayoflist(map mkops partitioned);
|
|
106 |
|
|
107 |
val subs = array(length opLA,[]) : int list array;
|
|
108 |
fun newcp v (a,Op(syA,_,p)) =
|
|
109 |
if p=copy_pri
|
|
110 |
then let val NT(k,_) = sub(syA,0)
|
|
111 |
in if k mem v then a else k ins a end
|
|
112 |
else a;
|
|
113 |
fun reach v i =
|
|
114 |
let val new = itA ([],#1(sub(opLA,i))) (newcp v)
|
|
115 |
val v' = new union v
|
|
116 |
in flat(map (reach v') new) union v' end;
|
|
117 |
fun rch(i) = update(subs,i,reach[i]i);
|
|
118 |
|
|
119 |
in forA(rch,subs); (tab,opLA,subs) end;
|
|
120 |
|
|
121 |
val RootPref = "__";
|
|
122 |
|
|
123 |
(* Lookahead tables for speeding up parsing. Lkhd is a mapping from
|
|
124 |
nonterminals (in the form of OpList) to sets (lists) of token strings *)
|
|
125 |
|
|
126 |
type Lkhd = Token list list list;
|
|
127 |
|
|
128 |
(* subst_syn(s,k) syn = [ pref k ts | ts is a token string derivable from sy
|
|
129 |
under substitution s ] *)
|
|
130 |
(* This is the general version.
|
|
131 |
fun cross l1 l2 = flat(map (fn e2 => (map (fn e1 => e1@e2) l1)) l2);
|
|
132 |
|
|
133 |
(* pref k [x1,...,xn] is [x1,...,xk] if 0<=k<=n and [x1,...,xn] otherwise *)
|
|
134 |
fun pref 0 l = []
|
|
135 |
| pref _ [] = []
|
|
136 |
| pref k (e::l) = e::(pref (k-1) l);
|
|
137 |
|
|
138 |
fun subst_syn(s:Lkhd,k) =
|
|
139 |
let fun subst(ref(symb & syn)):Token list list =
|
|
140 |
let val l1 = case symb of T t => [[t]] |
|
|
141 |
NT(oplr,_) => let val Some l = assoc(s,!oplr) in l end
|
|
142 |
in distinct(map (pref k) (cross l1 (subst syn))) end |
|
|
143 |
subst _ = [[]]
|
|
144 |
in subst end;
|
|
145 |
*)
|
|
146 |
(* This one is specialized to lookahead 1 and a bit too generous *)
|
|
147 |
fun subst_syn(s:Lkhd,1) syA =
|
|
148 |
let fun subst i = if i = length(syA) then [[]] else
|
|
149 |
case sub(syA,i) of
|
|
150 |
NT(j,_) => let val pre = nth_elem(j,s)
|
|
151 |
in if [] mem pre then (pre \ []) union subst(i+1)
|
|
152 |
else pre end |
|
|
153 |
T(tk) => [[tk]];
|
|
154 |
in subst 0 end;
|
|
155 |
|
|
156 |
(* mk_lkhd(G,k) returns a table which associates with every nonterminal N in
|
|
157 |
G the list of pref k s for all token strings s with N -G->* s *)
|
|
158 |
|
|
159 |
fun mk_lkhd(opLA:OpListA array,k:int):Lkhd =
|
|
160 |
let fun step(s:Lkhd):Lkhd =
|
|
161 |
let fun subst_op(l,Op(sy,_,_)) = subst_syn(s,k)sy union l;
|
|
162 |
fun step2(l,(opA,_)) = l@[itA([],opA)subst_op];
|
|
163 |
in writeln"."; itA([],opLA)step2 end;
|
|
164 |
fun iterate(s:Lkhd):Lkhd = let val s' = step s
|
|
165 |
in if map len s = map len s' then s
|
|
166 |
else iterate s' end
|
|
167 |
in writeln"Computing lookahead tables ...";
|
|
168 |
iterate (replicate (length opLA) []) end;
|
|
169 |
|
|
170 |
(* create look ahead tables *)
|
|
171 |
fun mk_earley_gram(g as (tab,opLA,_):Gram):Gram =
|
|
172 |
let val lkhd = mk_lkhd(opLA,1);
|
|
173 |
fun mk_fa(i):FastAcc =
|
|
174 |
let val opA = #1(sub(opLA,i));
|
|
175 |
fun start(j) = let val Op(sy,_,_) = sub(opA,j);
|
|
176 |
val pre = subst_syn(lkhd,1) sy
|
|
177 |
in (j,if [] mem pre then [] else map hd pre) end;
|
|
178 |
in mkTokenMap(map start (0 upto(length(opA)-1))) end;
|
|
179 |
fun updt(i) = update(opLA,i,(#1(sub(opLA,i)),mk_fa(i)));
|
|
180 |
|
|
181 |
in forA(updt,opLA); g end;
|
|
182 |
|
|
183 |
fun compile_xgram(roots,prods) =
|
|
184 |
let fun mk_root nt = Prod(RootPref^nt,
|
|
185 |
[Nonterminal(nt,0),Terminal(end_token)],"",0);
|
|
186 |
val prods' = (map mk_root roots) @ prods
|
|
187 |
in mk_earley_gram(mk_pre_grammar(prods')) end;
|
|
188 |
|
|
189 |
(* State: nonterminal#, production#, index in production,
|
|
190 |
index of originating state set,
|
|
191 |
parse trees generated so far,
|
|
192 |
*)
|
|
193 |
|
|
194 |
datatype State = St of int * int * int * int * ParseTree list
|
|
195 |
withtype StateSet = State LListR * (State -> unit) LListR;
|
|
196 |
type Compl = State -> unit;
|
|
197 |
type StateSetList = StateSet array;
|
|
198 |
(* Debugging:
|
|
199 |
val print_SL = seqll(fn St(nti,pi,ip,fs,ptl)=>
|
|
200 |
(print_int nti; prs" "; print_int pi; prs" "; print_int ip; prs" ";
|
|
201 |
print_int fs; prs" "; print_int(len ptl); prs"\n"));
|
|
202 |
|
|
203 |
fun print_SS(s1,delr) = (writeln"================="; print_SL s1);
|
|
204 |
|
|
205 |
fun count_ss(ref nilL) = 0
|
|
206 |
| count_ss(ref(_ & ss)) = count_ss(ss)+1;
|
|
207 |
|
|
208 |
fun print_stat(state_sets) =
|
|
209 |
let fun pr i = let val(s1,_)=sub(state_sets,i)
|
|
210 |
in prs" "; print_int(count_ss s1) end;
|
|
211 |
in prs"["; forA(pr,state_sets); prs"]\n" end;
|
|
212 |
*)
|
|
213 |
fun mt_stateS():StateSet = (ref nilL, ref nilL);
|
|
214 |
|
|
215 |
fun mt_states(n):StateSetList = array(n,mt_stateS());
|
|
216 |
|
|
217 |
fun ismt_stateS((ref nilL,_):StateSet) = true | ismt_stateS _ = false;
|
|
218 |
|
|
219 |
fun fst_state((ref(st & _),_): StateSet) = st;
|
|
220 |
|
|
221 |
fun apply_all_states(f,(slr,_):StateSet) = seqll f slr;
|
|
222 |
|
|
223 |
fun add_state(nti,pi,ip,from,ptl,(sllr,delr):StateSet) =
|
|
224 |
let fun add(ref(St(nti',pi',ip',from',_) & rest)) =
|
|
225 |
if nti=nti' andalso pi=pi' andalso ip=ip' andalso from=from'
|
|
226 |
then ()
|
|
227 |
else add rest |
|
|
228 |
add(last as ref nilL) =
|
|
229 |
let val newst = St(nti,pi,ip,from,ptl)
|
|
230 |
in last := newst & ref nilL;
|
|
231 |
seqll (fn compl => compl newst) delr
|
|
232 |
end;
|
|
233 |
in add sllr end;
|
|
234 |
|
|
235 |
fun complete(nti,syA,opn,p,ptl,ss,si as (_,delr):StateSet,opLA,rchA) =
|
|
236 |
let val pt = mk_pt(opn,ptl)
|
|
237 |
fun compl(St(ntj,pj,jp,from,ptl)) =
|
|
238 |
let val Op(syj,_,_) = sub(fst(sub(opLA,ntj)),pj) in
|
|
239 |
if jp=length(syj) then () else
|
|
240 |
case sub(syj,jp) of
|
|
241 |
NT(nt,p') => if p >= p' andalso nti mem sub(rchA,nt)
|
|
242 |
then add_state(ntj,pj,jp+1,from,ptl@[pt], si)
|
|
243 |
else ()
|
|
244 |
| _ => ()
|
|
245 |
end
|
|
246 |
in apply_all_states(compl,ss);
|
|
247 |
if length(syA)=0 (* delayed completion in case of empty production: *)
|
|
248 |
then delr := compl & ref(!delr) else ()
|
|
249 |
end;
|
|
250 |
|
|
251 |
fun predict(tk,isi,si,p',opLA) = fn nti =>
|
|
252 |
let val (opA,tm) = sub(opLA,nti);
|
|
253 |
fun add(pi) = let val opr as Op(syA,_,p) = sub(opA,pi)
|
|
254 |
in if p < p' then () else add_state(nti,pi,0,isi,[],si) end
|
|
255 |
in seq add (applyTokenMap(tm,tk)) end;
|
|
256 |
|
|
257 |
|
|
258 |
fun parsable((tab,_,_):Gram, root:string) =
|
|
259 |
not(Symtab.lookup(tab,RootPref^root) = None);
|
|
260 |
|
|
261 |
exception SYNTAX_ERR of Token list;
|
|
262 |
|
|
263 |
fun unknown c = error("System Error - Trying to parse unknown category "^c);
|
|
264 |
|
|
265 |
fun parse((tab,opLA,rchA):Gram, root:string, tl: Token list): ParseTree =
|
|
266 |
let val tl' = tl@[end_token];
|
|
267 |
val state_sets = mt_states(len tl' +1);
|
|
268 |
val s0 = mt_stateS();
|
|
269 |
val rooti = case Symtab.lookup(tab,RootPref^root) of
|
|
270 |
Some(ri) => ri | None => unknown root;
|
|
271 |
|
|
272 |
fun lr (tl,isi,si,t) =
|
|
273 |
if ismt_stateS(si) then raise SYNTAX_ERR(t::tl) else
|
|
274 |
case tl of
|
|
275 |
[] => () |
|
|
276 |
t::tl =>
|
|
277 |
let val si1 = mt_stateS();
|
|
278 |
fun process(St(nti,pi,ip,from,ptl)) =
|
|
279 |
let val opA = #1(sub(opLA,nti))
|
|
280 |
val Op(syA,opn,p) = sub(opA,pi) in
|
|
281 |
if ip = length(syA)
|
|
282 |
then complete(nti,syA,opn,p,ptl,
|
|
283 |
sub(state_sets,from),si,opLA,rchA)
|
|
284 |
else case sub(syA,ip) of
|
|
285 |
NT(ntj,p) =>
|
|
286 |
seq (predict(t,isi,si,p,opLA)) (sub(rchA,ntj))
|
|
287 |
| T(t') =>
|
|
288 |
if matching_tokens(t,t')
|
|
289 |
then add_state(nti,pi,ip+1,from,
|
|
290 |
if valued_token(t)
|
|
291 |
then ptl@[Tip(t)] else ptl,
|
|
292 |
si1)
|
|
293 |
else () end;
|
|
294 |
|
|
295 |
in apply_all_states(process,si);
|
|
296 |
update(state_sets,isi+1,si1);
|
|
297 |
lr(tl,isi+1,si1,t) end
|
|
298 |
|
|
299 |
in update(state_sets,0,s0);
|
|
300 |
add_state(rooti,0,0,0,[],s0);
|
|
301 |
lr(tl',0,s0,end_token(*dummy*));
|
|
302 |
(*print_stat state_sets;*)
|
|
303 |
let val St(_,_,_,_,[pt]) = fst_state(sub(state_sets,len tl'))
|
|
304 |
in pt end
|
|
305 |
end;
|
|
306 |
|
|
307 |
fun print_gram ((st,opAA,rchA):Gram,lex) =
|
|
308 |
let val tts = Lexicon.name_of_token;
|
|
309 |
val al = map (fn (x,y)=>(y,x)) (Symtab.alist_of st);
|
|
310 |
fun nt i = let val Some(s) = assoc(al,i) in s end;
|
|
311 |
fun print_sy(T(end_token)) = prs". " |
|
|
312 |
print_sy(T(tk)) = (prs(tts tk); prs" ") |
|
|
313 |
print_sy(NT(i,p)) = (prs((nt i)^"[");print_int p;prs"] ");
|
|
314 |
fun print_opA(i) =
|
|
315 |
let val lhs = nt i;
|
|
316 |
val (opA,_)=sub(opAA,i);
|
|
317 |
fun print_op(j) =
|
|
318 |
let val Op(sy,n,p) = sub(opA,j)
|
|
319 |
in prs(lhs^" = "); forA(fn i=>print_sy(sub(sy,i)),sy);
|
|
320 |
if n="" then () else prs(" => \""^n^"\"");
|
|
321 |
prs" (";print_int p;prs")\n"
|
|
322 |
end;
|
|
323 |
in forA(print_op,opA) end;
|
|
324 |
fun print_rch(i) = (print_int i; prs" -> ";
|
|
325 |
print_list("[","]\n",print_int) (sub(rchA,i)))
|
|
326 |
in forA(print_opA,opAA) (*; forA(print_rch,rchA) *) end;
|
|
327 |
|
|
328 |
end;
|
|
329 |
|
|
330 |
end;
|