1 (* Title: Pure/Syntax/earley0A |
1 (* Title: Pure/Syntax/earley0A.ML |
2 ID: $Id$ |
2 ID: $Id$ |
3 Author: Tobias Nipkow |
3 Author: Tobias Nipkow |
4 |
4 |
5 Changes: |
5 WARNING: This file is about to disappear. |
6 PARSER: renamed Prod to prod |
|
7 renamed mk_early_gram to mk_earley_gram |
|
8 *) |
6 *) |
9 |
7 |
10 signature PARSER = |
8 signature PARSER = |
11 sig |
9 sig |
12 structure XGram: XGRAM |
10 structure XGram: XGRAM |
13 structure ParseTree: PARSE_TREE |
11 structure ParseTree: PARSE_TREE |
14 local open XGram ParseTree ParseTree.Lexicon in |
12 local open XGram ParseTree ParseTree.Lexicon in |
15 type Gram |
13 type gram |
16 val compile_xgram: string list * Token prod list -> Gram |
14 val mk_gram: string list -> string prod list -> gram |
17 val parse: Gram * string * Token list -> ParseTree |
15 val parse: gram -> string -> token list -> parsetree |
18 val parsable: Gram * string -> bool |
|
19 exception SYNTAX_ERR of Token list |
|
20 val print_gram: Gram * Lexicon -> unit |
|
21 end |
16 end |
|
17 |
|
18 (* exception SYNTAX_ERROR of token list *) |
|
19 (* val compile_xgram: string list * token prod list -> Gram *) |
|
20 (* val parsable: Gram * string -> bool *) |
|
21 (* val print_gram: Gram * lexicon -> unit *) |
22 end; |
22 end; |
23 |
23 |
24 functor EarleyFun(structure XGram:XGRAM and ParseTree:PARSE_TREE): PARSER = |
24 functor EarleyFun(structure Symtab: SYMTAB and XGram: XGRAM |
|
25 and ParseTree: PARSE_TREE): PARSER = |
25 struct |
26 struct |
26 |
27 |
|
28 structure ParseTree = ParseTree; |
|
29 structure Lexicon = ParseTree.Lexicon; |
27 structure XGram = XGram; |
30 structure XGram = XGram; |
28 structure ParseTree = ParseTree; |
31 open ParseTree Lexicon; |
|
32 |
|
33 |
|
34 (** mk_pt (from parse_tree.ML) **) |
|
35 |
|
36 fun mk_pt ("", [pt]) = pt |
|
37 | mk_pt ("", _) = error "mk_pt: funny copy op in parse tree" |
|
38 | mk_pt (s, ptl) = Node (s, ptl); |
|
39 |
|
40 |
|
41 |
|
42 (** token maps (from lexicon.ML) **) |
|
43 |
|
44 type 'b TokenMap = (token * 'b list) list * 'b list; |
|
45 val first_token = 0; |
|
46 |
|
47 fun int_of_token(Token(tk)) = first_token | |
|
48 int_of_token(IdentSy _) = first_token - 1 | |
|
49 int_of_token(VarSy _) = first_token - 2 | |
|
50 int_of_token(TFreeSy _) = first_token - 3 | |
|
51 int_of_token(TVarSy _) = first_token - 4 | |
|
52 int_of_token(EndToken) = first_token - 5; |
|
53 |
|
54 fun lesstk(s, t) = int_of_token(s) < int_of_token(t) orelse |
|
55 (case (s, t) of (Token(a), Token(b)) => a<b | _ => false); |
|
56 |
|
57 fun mkTokenMap(atll) = |
|
58 let val aill = atll; |
|
59 val dom = sort lesstk (distinct(flat(map snd aill))); |
|
60 val mt = map fst (filter (null o snd) atll); |
|
61 fun mktm(i) = |
|
62 let fun add(l, (a, il)) = if i mem il then a::l else l |
|
63 in (i, foldl add ([], aill)) end; |
|
64 in (map mktm dom, mt) end; |
|
65 |
|
66 fun find_al (i) = |
|
67 let fun find((j, al)::l) = if lesstk(i, j) then [] else |
|
68 if matching_tokens(i, j) then al else find l | |
|
69 find [] = []; |
|
70 in find end; |
|
71 fun applyTokenMap((l, mt), tk:token) = mt@(find_al tk l); |
|
72 |
|
73 |
29 |
74 |
30 (* Linked lists: *) |
75 (* Linked lists: *) |
31 infix 5 &; |
76 infix 5 &; |
32 datatype 'a LList = nilL | op & of 'a * ('a LListR) |
77 datatype 'a LList = nilL | op & of 'a * ('a LListR) |
33 withtype 'a LListR = 'a LList ref; |
78 withtype 'a LListR = 'a LList ref; |
55 let val ub = length b |
100 let val ub = length b |
56 fun acc(a,i) = if i=ub then a else acc(f(a,sub(b,i)),i+1) |
101 fun acc(a,i) = if i=ub then a else acc(f(a,sub(b,i)),i+1) |
57 in acc(a0,0) end; |
102 in acc(a0,0) end; |
58 |
103 |
59 (* |
104 (* |
60 Gram: name of nt -> number, nt number -> productions array, |
105 gram: name of nt -> number, nt number -> productions array, |
61 nt number -> list of nt's reachable via copy ops |
106 nt number -> list of nt's reachable via copy ops |
62 *) |
107 *) |
63 |
108 |
64 datatype Symbol = T of Token | NT of int * int |
109 datatype Symbol = T of token | NT of int * int |
65 and Op = Op of OpSyn * string * int |
110 and Op = Op of OpSyn * string * int |
66 withtype OpSyn = Symbol array |
111 withtype OpSyn = Symbol array |
67 and OpListA = Op array * int TokenMap |
112 and OpListA = Op array * int TokenMap |
68 and FastAcc = int TokenMap; |
113 and FastAcc = int TokenMap; |
69 |
114 |
70 type Gram = int Symtab.table * OpListA array * int list array; |
115 type gram = int Symtab.table * OpListA array * int list array; |
71 |
116 |
72 fun non_term(Nonterminal(s,_)) = if predef_term(s)=end_token then [s] else [] |
117 fun non_term(Nonterminal(s,_)) = if predef_term(s)=EndToken then [s] else [] |
73 | non_term(_) = []; |
118 | non_term(_) = []; |
74 |
119 |
75 fun non_terms(Prod(_,symbs,_,_)) = flat(map non_term symbs); |
120 fun non_terms(Prod(_,symbs,_,_)) = flat(map non_term symbs); |
76 |
121 |
77 (* mk_pre_grammar converts a list of productions in external format into an |
122 (* mk_pre_grammar converts a list of productions in external format into an |
78 internal Gram object. *) |
123 internal gram object. *) |
79 val dummyTM:FastAcc = mkTokenMap[]; |
124 val dummyTM:FastAcc = mkTokenMap[]; |
80 |
125 |
81 fun mk_pre_grammar(prods): Gram = |
126 fun mk_pre_grammar(prods): gram = |
82 let fun same_res(Prod(t1,_,_,_), Prod(t2,_,_,_)) = t1=t2; |
127 let fun same_res(Prod(t1,_,_,_), Prod(t2,_,_,_)) = t1=t2; |
83 val partitioned0 = partition_eq same_res prods; |
128 val partitioned0 = partition_eq same_res prods; |
84 val nts0 = map (fn Prod(ty,_,_,_)::_ => ty) partitioned0; |
129 val nts0 = map (fn Prod(ty,_,_,_)::_ => ty) partitioned0; |
85 val nts' = distinct(flat(map non_terms prods)) \\ nts0; |
130 val nts' = distinct(flat(map non_terms prods)) \\ nts0; |
86 val nts = nts' @ nts0; |
131 val nts = nts' @ nts0; |
88 val ntis = nts ~~ (0 upto (len(nts)-1)); |
133 val ntis = nts ~~ (0 upto (len(nts)-1)); |
89 val tab = foldr Symtab.update (ntis,Symtab.null); |
134 val tab = foldr Symtab.update (ntis,Symtab.null); |
90 |
135 |
91 fun nt_or_vt(s,p) = |
136 fun nt_or_vt(s,p) = |
92 (case predef_term(s) of |
137 (case predef_term(s) of |
93 end_token => let val Some(i) = Symtab.lookup(tab,s) in NT(i,p) end |
138 EndToken => let val Some(i) = Symtab.lookup(tab,s) in NT(i,p) end |
94 | tk => T(tk)); |
139 | tk => T(tk)); |
95 |
140 |
96 fun mksyn(Terminal(t)) = [T(t)] |
141 fun mksyn(Terminal(t)) = [T(t)] |
97 | mksyn(Nonterminal(t)) = [nt_or_vt t] |
142 | mksyn(Nonterminal(t)) = [nt_or_vt t] |
98 | mksyn(_) = []; |
143 | mksyn(_) = []; |
104 |
149 |
105 val opLA = arrayoflist(map mkops partitioned); |
150 val opLA = arrayoflist(map mkops partitioned); |
106 |
151 |
107 val subs = array(length opLA,[]) : int list array; |
152 val subs = array(length opLA,[]) : int list array; |
108 fun newcp v (a,Op(syA,_,p)) = |
153 fun newcp v (a,Op(syA,_,p)) = |
109 if p=copy_pri |
154 if p=chain_pri |
110 then let val NT(k,_) = sub(syA,0) |
155 then let val NT(k,_) = sub(syA,0) |
111 in if k mem v then a else k ins a end |
156 in if k mem v then a else k ins a end |
112 else a; |
157 else a; |
113 fun reach v i = |
158 fun reach v i = |
114 let val new = itA ([],#1(sub(opLA,i))) (newcp v) |
159 let val new = itA ([],#1(sub(opLA,i))) (newcp v) |
121 val RootPref = "__"; |
166 val RootPref = "__"; |
122 |
167 |
123 (* Lookahead tables for speeding up parsing. Lkhd is a mapping from |
168 (* Lookahead tables for speeding up parsing. Lkhd is a mapping from |
124 nonterminals (in the form of OpList) to sets (lists) of token strings *) |
169 nonterminals (in the form of OpList) to sets (lists) of token strings *) |
125 |
170 |
126 type Lkhd = Token list list list; |
171 type Lkhd = token list list list; |
127 |
172 |
128 (* subst_syn(s,k) syn = [ pref k ts | ts is a token string derivable from sy |
173 (* subst_syn(s,k) syn = [ pref k ts | ts is a token string derivable from sy |
129 under substitution s ] *) |
174 under substitution s ] *) |
130 (* This is the general version. |
175 (* This is the general version. |
131 fun cross l1 l2 = flat(map (fn e2 => (map (fn e1 => e1@e2) l1)) l2); |
176 fun cross l1 l2 = flat(map (fn e2 => (map (fn e1 => e1@e2) l1)) l2); |
134 fun pref 0 l = [] |
179 fun pref 0 l = [] |
135 | pref _ [] = [] |
180 | pref _ [] = [] |
136 | pref k (e::l) = e::(pref (k-1) l); |
181 | pref k (e::l) = e::(pref (k-1) l); |
137 |
182 |
138 fun subst_syn(s:Lkhd,k) = |
183 fun subst_syn(s:Lkhd,k) = |
139 let fun subst(ref(symb & syn)):Token list list = |
184 let fun subst(ref(symb & syn)):token list list = |
140 let val l1 = case symb of T t => [[t]] | |
185 let val l1 = case symb of T t => [[t]] | |
141 NT(oplr,_) => let val Some l = assoc(s,!oplr) in l end |
186 NT(oplr,_) => let val Some l = assoc(s,!oplr) in l end |
142 in distinct(map (pref k) (cross l1 (subst syn))) end | |
187 in distinct(map (pref k) (cross l1 (subst syn))) end | |
143 subst _ = [[]] |
188 subst _ = [[]] |
144 in subst end; |
189 in subst end; |
166 else iterate s' end |
211 else iterate s' end |
167 in writeln"Computing lookahead tables ..."; |
212 in writeln"Computing lookahead tables ..."; |
168 iterate (replicate (length opLA) []) end; |
213 iterate (replicate (length opLA) []) end; |
169 |
214 |
170 (* create look ahead tables *) |
215 (* create look ahead tables *) |
171 fun mk_earley_gram(g as (tab,opLA,_):Gram):Gram = |
216 fun mk_earley_gram(g as (tab,opLA,_):gram):gram = |
172 let val lkhd = mk_lkhd(opLA,1); |
217 let val lkhd = mk_lkhd(opLA,1); |
173 fun mk_fa(i):FastAcc = |
218 fun mk_fa(i):FastAcc = |
174 let val opA = #1(sub(opLA,i)); |
219 let val opA = #1(sub(opLA,i)); |
175 fun start(j) = let val Op(sy,_,_) = sub(opA,j); |
220 fun start(j) = let val Op(sy,_,_) = sub(opA,j); |
176 val pre = subst_syn(lkhd,1) sy |
221 val pre = subst_syn(lkhd,1) sy |
180 |
225 |
181 in forA(updt,opLA); g end; |
226 in forA(updt,opLA); g end; |
182 |
227 |
183 fun compile_xgram(roots,prods) = |
228 fun compile_xgram(roots,prods) = |
184 let fun mk_root nt = Prod(RootPref^nt, |
229 let fun mk_root nt = Prod(RootPref^nt, |
185 [Nonterminal(nt,0),Terminal(end_token)],"",0); |
230 [Nonterminal(nt,0),Terminal(EndToken)],"",0); |
186 val prods' = (map mk_root roots) @ prods |
231 val prods' = (map mk_root roots) @ prods |
187 in mk_earley_gram(mk_pre_grammar(prods')) end; |
232 in mk_earley_gram(mk_pre_grammar(prods')) end; |
|
233 |
|
234 |
|
235 (* translate (from xgram.ML) *) |
|
236 |
|
237 fun translate trfn = |
|
238 map |
|
239 (fn Terminal t => Terminal (trfn t) |
|
240 | Nonterminal s => Nonterminal s |
|
241 | Space s => Space s |
|
242 | Bg i => Bg i |
|
243 | Brk i => Brk i |
|
244 | En => En); |
|
245 |
|
246 |
|
247 (* mk_gram (from syntax.ML) *) |
|
248 |
|
249 fun str_to_tok (opl: string prod list): token prod list = |
|
250 map |
|
251 (fn Prod (t, syn, s, pa) => Prod (t, translate Token syn, s, pa)) |
|
252 opl; |
|
253 |
|
254 fun mk_gram roots prods = compile_xgram (roots, str_to_tok prods); |
|
255 |
|
256 |
188 |
257 |
189 (* State: nonterminal#, production#, index in production, |
258 (* State: nonterminal#, production#, index in production, |
190 index of originating state set, |
259 index of originating state set, |
191 parse trees generated so far, |
260 parse trees generated so far, |
192 *) |
261 *) |
193 |
262 |
194 datatype State = St of int * int * int * int * ParseTree list |
263 datatype State = St of int * int * int * int * parsetree list |
195 withtype StateSet = State LListR * (State -> unit) LListR; |
264 withtype StateSet = State LListR * (State -> unit) LListR; |
196 type Compl = State -> unit; |
265 type Compl = State -> unit; |
197 type StateSetList = StateSet array; |
266 type StateSetList = StateSet array; |
198 (* Debugging: |
267 (* Debugging: |
199 val print_SL = seqll(fn St(nti,pi,ip,fs,ptl)=> |
268 val print_SL = seqll(fn St(nti,pi,ip,fs,ptl)=> |
253 fun add(pi) = let val opr as Op(syA,_,p) = sub(opA,pi) |
322 fun add(pi) = let val opr as Op(syA,_,p) = sub(opA,pi) |
254 in if p < p' then () else add_state(nti,pi,0,isi,[],si) end |
323 in if p < p' then () else add_state(nti,pi,0,isi,[],si) end |
255 in seq add (applyTokenMap(tm,tk)) end; |
324 in seq add (applyTokenMap(tm,tk)) end; |
256 |
325 |
257 |
326 |
258 fun parsable((tab,_,_):Gram, root:string) = |
327 (*(* FIXME *) |
|
328 fun parsable((tab,_,_):gram, root:string) = |
259 not(Symtab.lookup(tab,RootPref^root) = None); |
329 not(Symtab.lookup(tab,RootPref^root) = None); |
260 |
330 *) |
261 exception SYNTAX_ERR of Token list; |
331 |
262 |
332 (* exception SYNTAX_ERROR of token list; *) (* FIXME *) |
263 fun unknown c = error("System Error - Trying to parse unknown category "^c); |
333 |
264 |
334 fun unknown c = error ("Unparsable category: " ^ c); |
265 fun parse((tab,opLA,rchA):Gram, root:string, tl: Token list): ParseTree = |
335 |
266 let val tl' = tl@[end_token]; |
336 fun syn_err toks = error ("Syntax error at: " ^ space_implode " " (map str_of_token toks)); |
|
337 |
|
338 fun parse ((tab,opLA,rchA):gram) (root:string) (tl: token list): parsetree = |
|
339 let val tl' = tl; (* FIXME was ..@[EndToken] *) |
267 val state_sets = mt_states(len tl' +1); |
340 val state_sets = mt_states(len tl' +1); |
268 val s0 = mt_stateS(); |
341 val s0 = mt_stateS(); |
269 val rooti = case Symtab.lookup(tab,RootPref^root) of |
342 val rooti = case Symtab.lookup(tab,RootPref^root) of |
270 Some(ri) => ri | None => unknown root; |
343 Some(ri) => ri | None => unknown root; |
271 |
344 |
272 fun lr (tl,isi,si,t) = |
345 fun lr (tl,isi,si,t) = |
273 if ismt_stateS(si) then raise SYNTAX_ERR(t::tl) else |
346 (* if ismt_stateS(si) then raise SYNTAX_ERROR(t::tl) else *) (* FIXME *) |
|
347 if ismt_stateS(si) then syn_err (t::tl) else |
274 case tl of |
348 case tl of |
275 [] => () | |
349 [] => () | |
276 t::tl => |
350 t::tl => |
277 let val si1 = mt_stateS(); |
351 let val si1 = mt_stateS(); |
278 fun process(St(nti,pi,ip,from,ptl)) = |
352 fun process(St(nti,pi,ip,from,ptl)) = |
296 update(state_sets,isi+1,si1); |
370 update(state_sets,isi+1,si1); |
297 lr(tl,isi+1,si1,t) end |
371 lr(tl,isi+1,si1,t) end |
298 |
372 |
299 in update(state_sets,0,s0); |
373 in update(state_sets,0,s0); |
300 add_state(rooti,0,0,0,[],s0); |
374 add_state(rooti,0,0,0,[],s0); |
301 lr(tl',0,s0,end_token(*dummy*)); |
375 lr(tl',0,s0,EndToken(*dummy*)); |
302 (*print_stat state_sets;*) |
376 (*print_stat state_sets;*) |
303 let val St(_,_,_,_,[pt]) = fst_state(sub(state_sets,len tl')) |
377 let val St(_,_,_,_,[pt]) = fst_state(sub(state_sets,len tl')) |
304 in pt end |
378 in pt end |
305 end; |
379 end; |
306 |
380 |
307 fun print_gram ((st,opAA,rchA):Gram,lex) = |
381 |
|
382 (*(* FIXME *) |
|
383 fun print_gram ((st,opAA,rchA):gram,lex) = |
308 let val tts = Lexicon.name_of_token; |
384 let val tts = Lexicon.name_of_token; |
309 val al = map (fn (x,y)=>(y,x)) (Symtab.alist_of st); |
385 val al = map (fn (x,y)=>(y,x)) (Symtab.alist_of st); |
310 fun nt i = let val Some(s) = assoc(al,i) in s end; |
386 fun nt i = let val Some(s) = assoc(al,i) in s end; |
311 fun print_sy(T(end_token)) = prs". " | |
387 fun print_sy(T(EndToken)) = prs". " | |
312 print_sy(T(tk)) = (prs(tts tk); prs" ") | |
388 print_sy(T(tk)) = (prs(tts tk); prs" ") | |
313 print_sy(NT(i,p)) = (prs((nt i)^"[");print_int p;prs"] "); |
389 print_sy(NT(i,p)) = (prs((nt i)^"[");print_int p;prs"] "); |
314 fun print_opA(i) = |
390 fun print_opA(i) = |
315 let val lhs = nt i; |
391 let val lhs = nt i; |
316 val (opA,_)=sub(opAA,i); |
392 val (opA,_)=sub(opAA,i); |