src/Pure/Syntax/earley0A.ML
changeset 0 a5a9c433f639
child 18 c9ec452ff08f
     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/src/Pure/Syntax/earley0A.ML	Thu Sep 16 12:20:38 1993 +0200
     1.3 @@ -0,0 +1,330 @@
     1.4 +(*  Title:      Pure/Syntax/earley0A
     1.5 +    ID:         $Id$
     1.6 +    Author:     Tobias Nipkow
     1.7 +
     1.8 +Changes:
     1.9 +  PARSER: renamed Prod to prod
    1.10 +  renamed mk_early_gram to mk_earley_gram
    1.11 +*)
    1.12 +
    1.13 +signature PARSER =
    1.14 +sig
    1.15 +  structure XGram: XGRAM
    1.16 +  structure ParseTree: PARSE_TREE
    1.17 +  local open XGram ParseTree ParseTree.Lexicon in
    1.18 +  type Gram
    1.19 +  val compile_xgram: string list * Token prod list -> Gram
    1.20 +  val parse: Gram * string * Token list -> ParseTree
    1.21 +  val parsable: Gram * string -> bool
    1.22 +  exception SYNTAX_ERR of Token list
    1.23 +  val print_gram: Gram * Lexicon -> unit
    1.24 +  end
    1.25 +end;
    1.26 +
    1.27 +functor EarleyFun(structure XGram:XGRAM and ParseTree:PARSE_TREE): PARSER =
    1.28 +struct
    1.29 +
    1.30 +structure XGram = XGram;
    1.31 +structure ParseTree = ParseTree;
    1.32 +
    1.33 +(* Linked lists: *)
    1.34 +infix 5 &;
    1.35 +datatype 'a LList = nilL | op & of 'a * ('a LListR)
    1.36 +withtype 'a LListR = 'a LList ref;
    1.37 +
    1.38 +(* Apply proc to all elements in a linked list *)
    1.39 +fun seqll (proc: '_a -> unit) : ('_a LListR -> unit) =
    1.40 +    let fun seq (ref nilL) = () |
    1.41 +            seq (ref((a:'_a)&l)) = (proc a; seq l)
    1.42 +    in seq end;
    1.43 +
    1.44 +fun llist_to_list (ref nilL) = [] |
    1.45 +    llist_to_list (ref(a&ll)) = a::(llist_to_list ll);
    1.46 +
    1.47 +val len = length;
    1.48 +
    1.49 +local open Array XGram ParseTree ParseTree.Lexicon in
    1.50 +nonfix sub;
    1.51 +
    1.52 +fun forA(p: int -> unit, a: 'a array) : unit =
    1.53 +    let val ub = length a
    1.54 +        fun step(i) = if i=ub then () else (p(i); step(i+1));
    1.55 +    in step 0 end;
    1.56 +
    1.57 +fun itA(a0:'a, b: 'b array)(f:'a * 'b -> 'a) : 'a =
    1.58 +    let val ub = length b
    1.59 +        fun acc(a,i) = if i=ub then a else acc(f(a,sub(b,i)),i+1)
    1.60 +    in acc(a0,0) end;
    1.61 +
    1.62 +(*
    1.63 +Gram: name of nt -> number, nt number -> productions array,
    1.64 +      nt number -> list of nt's reachable via copy ops
    1.65 +*)
    1.66 +
    1.67 +datatype Symbol = T of Token | NT of int * int
    1.68 +     and Op = Op of OpSyn * string * int
    1.69 +withtype OpSyn = Symbol array
    1.70 +     and OpListA = Op array * int TokenMap
    1.71 +     and FastAcc = int TokenMap;
    1.72 +
    1.73 +type Gram = int Symtab.table * OpListA array * int list array;
    1.74 +
    1.75 +fun non_term(Nonterminal(s,_)) = if predef_term(s)=end_token then [s] else []
    1.76 +  | non_term(_) = [];
    1.77 +
    1.78 +fun non_terms(Prod(_,symbs,_,_)) = flat(map non_term symbs);
    1.79 +
    1.80 +(* mk_pre_grammar converts a list of productions in external format into an
    1.81 +internal Gram object. *)
    1.82 +val dummyTM:FastAcc = mkTokenMap[];
    1.83 +
    1.84 +fun mk_pre_grammar(prods): Gram =
    1.85 +let fun same_res(Prod(t1,_,_,_), Prod(t2,_,_,_)) = t1=t2;
    1.86 +    val partitioned0 = partition_eq same_res prods;
    1.87 +    val nts0 = map (fn Prod(ty,_,_,_)::_ => ty) partitioned0;
    1.88 +    val nts' = distinct(flat(map non_terms prods)) \\ nts0;
    1.89 +    val nts = nts' @ nts0;
    1.90 +    val partitioned = (replicate (len nts') []) @ partitioned0;
    1.91 +    val ntis = nts ~~ (0 upto (len(nts)-1));
    1.92 +    val tab = foldr Symtab.update (ntis,Symtab.null);
    1.93 +
    1.94 +    fun nt_or_vt(s,p) =
    1.95 +        (case predef_term(s) of
    1.96 +           end_token => let val Some(i) = Symtab.lookup(tab,s) in NT(i,p) end
    1.97 +         | tk => T(tk));
    1.98 +
    1.99 +    fun mksyn(Terminal(t)) = [T(t)]
   1.100 +      | mksyn(Nonterminal(t)) = [nt_or_vt t]
   1.101 +      | mksyn(_) = [];
   1.102 +
   1.103 +    fun prod2op(Prod(nt,sy,opn,p)) =
   1.104 +        let val syA = arrayoflist(flat(map mksyn sy)) in Op(syA,opn,p) end;
   1.105 +
   1.106 +    fun mkops prods = (arrayoflist(map prod2op prods),dummyTM);
   1.107 +
   1.108 +    val opLA = arrayoflist(map mkops partitioned);
   1.109 +
   1.110 +    val subs = array(length opLA,[]) : int list array;
   1.111 +    fun newcp v (a,Op(syA,_,p)) =
   1.112 +        if p=copy_pri
   1.113 +        then let val NT(k,_) = sub(syA,0)
   1.114 +             in if k mem v then a else k ins a end
   1.115 +        else a;
   1.116 +    fun reach v i =
   1.117 +        let val new = itA ([],#1(sub(opLA,i))) (newcp v)
   1.118 +            val v' = new union v
   1.119 +        in flat(map (reach v') new) union v' end;
   1.120 +    fun rch(i) = update(subs,i,reach[i]i);
   1.121 +
   1.122 +in forA(rch,subs); (tab,opLA,subs) end;
   1.123 +
   1.124 +val RootPref = "__";
   1.125 +
   1.126 +(* Lookahead tables for speeding up parsing. Lkhd is a mapping from
   1.127 +nonterminals (in the form of OpList) to sets (lists) of token strings *)
   1.128 +
   1.129 +type Lkhd = Token list list list;
   1.130 +
   1.131 +(* subst_syn(s,k) syn = [ pref k ts | ts is a token string derivable from sy
   1.132 +                                      under substitution s ] *)
   1.133 +(* This is the general version.
   1.134 +fun cross l1 l2 = flat(map (fn e2 => (map (fn e1 => e1@e2) l1)) l2);
   1.135 +
   1.136 +(* pref k [x1,...,xn] is [x1,...,xk] if 0<=k<=n and [x1,...,xn] otherwise *)
   1.137 +fun pref 0 l = []
   1.138 +  | pref _ [] = []
   1.139 +  | pref k (e::l) = e::(pref (k-1) l);
   1.140 +
   1.141 +fun subst_syn(s:Lkhd,k) =
   1.142 +    let fun subst(ref(symb & syn)):Token list list =
   1.143 +              let val l1 = case symb of T t => [[t]] |
   1.144 +                         NT(oplr,_) => let val Some l = assoc(s,!oplr) in l end
   1.145 +              in distinct(map (pref k) (cross l1 (subst syn))) end |
   1.146 +            subst _ = [[]]
   1.147 +    in subst end;
   1.148 +*)
   1.149 +(* This one is specialized to lookahead 1 and a bit too generous *)
   1.150 +fun subst_syn(s:Lkhd,1) syA =
   1.151 +    let fun subst i = if i = length(syA) then [[]] else
   1.152 +              case sub(syA,i) of
   1.153 +                NT(j,_) => let val pre = nth_elem(j,s)
   1.154 +                         in if [] mem pre then (pre \ []) union subst(i+1)
   1.155 +                            else pre end |
   1.156 +               T(tk) => [[tk]];
   1.157 +    in subst 0 end;
   1.158 +
   1.159 +(* mk_lkhd(G,k) returns a table which associates with every nonterminal N in
   1.160 +G the list of pref k s for all token strings s with N -G->* s *)
   1.161 +
   1.162 +fun mk_lkhd(opLA:OpListA array,k:int):Lkhd =
   1.163 +    let fun step(s:Lkhd):Lkhd =
   1.164 +            let fun subst_op(l,Op(sy,_,_)) = subst_syn(s,k)sy union l;
   1.165 +                fun step2(l,(opA,_)) = l@[itA([],opA)subst_op];
   1.166 +            in writeln"."; itA([],opLA)step2 end;
   1.167 +        fun iterate(s:Lkhd):Lkhd = let val s' = step s
   1.168 +              in if map len s = map len s' then s
   1.169 +                 else iterate s' end
   1.170 +    in writeln"Computing lookahead tables ...";
   1.171 +       iterate (replicate (length opLA) []) end;
   1.172 +
   1.173 +(* create look ahead tables *)
   1.174 +fun mk_earley_gram(g as (tab,opLA,_):Gram):Gram =
   1.175 +    let val lkhd = mk_lkhd(opLA,1);
   1.176 +        fun mk_fa(i):FastAcc =
   1.177 +            let val opA = #1(sub(opLA,i));
   1.178 +                fun start(j) = let val Op(sy,_,_) = sub(opA,j);
   1.179 +                                   val pre = subst_syn(lkhd,1) sy
   1.180 +                        in (j,if [] mem pre then [] else map hd pre) end;
   1.181 +            in mkTokenMap(map start (0 upto(length(opA)-1))) end;
   1.182 +        fun updt(i) = update(opLA,i,(#1(sub(opLA,i)),mk_fa(i)));
   1.183 +
   1.184 +    in forA(updt,opLA); g end;
   1.185 +
   1.186 +fun compile_xgram(roots,prods) =
   1.187 +      let fun mk_root nt = Prod(RootPref^nt,
   1.188 +                [Nonterminal(nt,0),Terminal(end_token)],"",0);
   1.189 +          val prods' = (map mk_root roots) @ prods
   1.190 +      in mk_earley_gram(mk_pre_grammar(prods')) end;
   1.191 +
   1.192 +(* State: nonterminal#, production#, index in production,
   1.193 +          index of originating state set,
   1.194 +          parse trees generated so far,
   1.195 +*)
   1.196 +
   1.197 +datatype State = St of int * int * int * int * ParseTree list
   1.198 +withtype StateSet  = State LListR * (State -> unit) LListR;
   1.199 +type Compl = State -> unit;
   1.200 +type StateSetList = StateSet array;
   1.201 +(* Debugging:
   1.202 +val print_SL = seqll(fn St(nti,pi,ip,fs,ptl)=>
   1.203 +(print_int nti; prs" "; print_int pi; prs" "; print_int ip; prs" ";
   1.204 +print_int fs; prs" "; print_int(len ptl); prs"\n"));
   1.205 +
   1.206 +fun print_SS(s1,delr) = (writeln"================="; print_SL s1);
   1.207 +
   1.208 +fun count_ss(ref nilL) = 0
   1.209 +  | count_ss(ref(_ & ss)) = count_ss(ss)+1;
   1.210 +
   1.211 +fun print_stat(state_sets) =
   1.212 +    let fun pr i = let val(s1,_)=sub(state_sets,i)
   1.213 +                in prs" "; print_int(count_ss s1) end;
   1.214 +    in prs"["; forA(pr,state_sets); prs"]\n" end;
   1.215 +*)
   1.216 +fun mt_stateS():StateSet = (ref nilL, ref nilL);
   1.217 +
   1.218 +fun mt_states(n):StateSetList = array(n,mt_stateS());
   1.219 +
   1.220 +fun ismt_stateS((ref nilL,_):StateSet) = true | ismt_stateS _ = false;
   1.221 +
   1.222 +fun fst_state((ref(st & _),_): StateSet) = st;
   1.223 +
   1.224 +fun apply_all_states(f,(slr,_):StateSet) = seqll f slr;
   1.225 +
   1.226 +fun add_state(nti,pi,ip,from,ptl,(sllr,delr):StateSet) =
   1.227 +      let fun add(ref(St(nti',pi',ip',from',_) & rest)) =
   1.228 +                if nti=nti' andalso pi=pi' andalso ip=ip' andalso from=from'
   1.229 +                then ()
   1.230 +                else add rest |
   1.231 +              add(last as ref nilL) =
   1.232 +                let val newst = St(nti,pi,ip,from,ptl)
   1.233 +                in last := newst & ref nilL;
   1.234 +                   seqll (fn compl => compl newst) delr
   1.235 +                end;
   1.236 +      in add sllr end;
   1.237 +
   1.238 +fun complete(nti,syA,opn,p,ptl,ss,si as (_,delr):StateSet,opLA,rchA) =
   1.239 +      let val pt = mk_pt(opn,ptl)
   1.240 +          fun compl(St(ntj,pj,jp,from,ptl)) =
   1.241 +                let val Op(syj,_,_) = sub(fst(sub(opLA,ntj)),pj) in
   1.242 +                if jp=length(syj) then () else
   1.243 +                case sub(syj,jp) of
   1.244 +                  NT(nt,p') => if p >= p' andalso nti mem sub(rchA,nt)
   1.245 +                        then add_state(ntj,pj,jp+1,from,ptl@[pt], si)
   1.246 +                        else ()
   1.247 +                | _ => ()
   1.248 +                end
   1.249 +      in apply_all_states(compl,ss);
   1.250 +         if length(syA)=0 (* delayed completion in case of empty production: *)
   1.251 +         then delr := compl & ref(!delr) else ()
   1.252 +      end;
   1.253 +
   1.254 +fun predict(tk,isi,si,p',opLA) = fn nti =>
   1.255 +    let val (opA,tm) = sub(opLA,nti);
   1.256 +        fun add(pi) = let val opr as Op(syA,_,p) = sub(opA,pi)
   1.257 +                in if p < p' then () else add_state(nti,pi,0,isi,[],si) end
   1.258 +    in seq add (applyTokenMap(tm,tk)) end;
   1.259 +
   1.260 +
   1.261 +fun parsable((tab,_,_):Gram, root:string) =
   1.262 +        not(Symtab.lookup(tab,RootPref^root) = None);
   1.263 +
   1.264 +exception SYNTAX_ERR of Token list;
   1.265 +
   1.266 +fun unknown c = error("System Error - Trying to parse unknown category "^c);
   1.267 +
   1.268 +fun parse((tab,opLA,rchA):Gram, root:string, tl: Token list): ParseTree =
   1.269 +    let val tl' = tl@[end_token];
   1.270 +        val state_sets = mt_states(len tl' +1);
   1.271 +        val s0 = mt_stateS();
   1.272 +        val rooti = case Symtab.lookup(tab,RootPref^root) of
   1.273 +                Some(ri) => ri | None => unknown root;
   1.274 +
   1.275 +        fun lr (tl,isi,si,t) =
   1.276 +            if ismt_stateS(si) then raise SYNTAX_ERR(t::tl) else
   1.277 +            case tl of
   1.278 +              [] => () |
   1.279 +              t::tl =>
   1.280 +                let val si1 = mt_stateS();
   1.281 +                    fun process(St(nti,pi,ip,from,ptl)) =
   1.282 +                          let val opA = #1(sub(opLA,nti))
   1.283 +                              val Op(syA,opn,p) = sub(opA,pi) in
   1.284 +                        if ip = length(syA)
   1.285 +                        then complete(nti,syA,opn,p,ptl,
   1.286 +                                        sub(state_sets,from),si,opLA,rchA)
   1.287 +                        else case sub(syA,ip) of
   1.288 +                          NT(ntj,p) =>
   1.289 +                                seq (predict(t,isi,si,p,opLA)) (sub(rchA,ntj))
   1.290 +                        | T(t') =>
   1.291 +                            if matching_tokens(t,t')
   1.292 +                            then add_state(nti,pi,ip+1,from,
   1.293 +                                           if valued_token(t)
   1.294 +                                           then ptl@[Tip(t)] else ptl,
   1.295 +                                           si1)
   1.296 +                            else () end;
   1.297 +
   1.298 +            in apply_all_states(process,si);
   1.299 +               update(state_sets,isi+1,si1);
   1.300 +               lr(tl,isi+1,si1,t) end
   1.301 +
   1.302 +    in update(state_sets,0,s0);
   1.303 +       add_state(rooti,0,0,0,[],s0);
   1.304 +       lr(tl',0,s0,end_token(*dummy*));
   1.305 +       (*print_stat state_sets;*)
   1.306 +       let val St(_,_,_,_,[pt]) = fst_state(sub(state_sets,len tl'))
   1.307 +       in pt end
   1.308 +    end;
   1.309 +
   1.310 +fun print_gram ((st,opAA,rchA):Gram,lex) =
   1.311 +    let val tts = Lexicon.name_of_token;
   1.312 +        val al = map (fn (x,y)=>(y,x)) (Symtab.alist_of st);
   1.313 +        fun nt i = let val Some(s) = assoc(al,i) in s end;
   1.314 +        fun print_sy(T(end_token)) = prs". " |
   1.315 +            print_sy(T(tk)) = (prs(tts tk); prs" ") |
   1.316 +            print_sy(NT(i,p)) = (prs((nt i)^"[");print_int p;prs"] ");
   1.317 +        fun print_opA(i) =
   1.318 +                let val lhs = nt i;
   1.319 +                    val (opA,_)=sub(opAA,i);
   1.320 +                    fun print_op(j) =
   1.321 +                        let val Op(sy,n,p) = sub(opA,j)
   1.322 +                        in prs(lhs^" = "); forA(fn i=>print_sy(sub(sy,i)),sy);
   1.323 +                           if n="" then () else prs(" => \""^n^"\"");
   1.324 +                           prs" (";print_int p;prs")\n"
   1.325 +                        end;
   1.326 +                in forA(print_op,opA) end;
   1.327 +        fun print_rch(i) = (print_int i; prs" -> ";
   1.328 +                            print_list("[","]\n",print_int) (sub(rchA,i)))
   1.329 +    in forA(print_opA,opAA) (*; forA(print_rch,rchA) *) end;
   1.330 +
   1.331 +end;
   1.332 +
   1.333 +end;