src/Pure/Syntax/earley0A.ML
changeset 18 c9ec452ff08f
parent 0 a5a9c433f639
child 237 a7d3e712767a
equal deleted inserted replaced
17:b35851cafd3e 18:c9ec452ff08f
     1 (*  Title:      Pure/Syntax/earley0A
     1 (*  Title:      Pure/Syntax/earley0A.ML
     2     ID:         $Id$
     2     ID:         $Id$
     3     Author:     Tobias Nipkow
     3     Author:     Tobias Nipkow
     4 
     4 
     5 Changes:
     5 WARNING: This file is about to disappear.
     6   PARSER: renamed Prod to prod
       
     7   renamed mk_early_gram to mk_earley_gram
       
     8 *)
     6 *)
     9 
     7 
    10 signature PARSER =
     8 signature PARSER =
    11 sig
     9 sig
    12   structure XGram: XGRAM
    10   structure XGram: XGRAM
    13   structure ParseTree: PARSE_TREE
    11   structure ParseTree: PARSE_TREE
    14   local open XGram ParseTree ParseTree.Lexicon in
    12   local open XGram ParseTree ParseTree.Lexicon in
    15   type Gram
    13     type gram
    16   val compile_xgram: string list * Token prod list -> Gram
    14     val mk_gram: string list -> string prod list -> gram
    17   val parse: Gram * string * Token list -> ParseTree
    15     val parse: gram -> string -> token list -> parsetree
    18   val parsable: Gram * string -> bool
       
    19   exception SYNTAX_ERR of Token list
       
    20   val print_gram: Gram * Lexicon -> unit
       
    21   end
    16   end
       
    17 
       
    18 (*  exception SYNTAX_ERROR of token list *)
       
    19 (*  val compile_xgram: string list * token prod list -> Gram *)
       
    20 (*  val parsable: Gram * string -> bool *)
       
    21 (*  val print_gram: Gram * lexicon -> unit *)
    22 end;
    22 end;
    23 
    23 
    24 functor EarleyFun(structure XGram:XGRAM and ParseTree:PARSE_TREE): PARSER =
    24 functor EarleyFun(structure Symtab: SYMTAB and XGram: XGRAM
       
    25   and ParseTree: PARSE_TREE): PARSER =
    25 struct
    26 struct
    26 
    27 
       
    28 structure ParseTree = ParseTree;
       
    29 structure Lexicon = ParseTree.Lexicon;
    27 structure XGram = XGram;
    30 structure XGram = XGram;
    28 structure ParseTree = ParseTree;
    31 open ParseTree Lexicon;
       
    32 
       
    33 
       
    34 (** mk_pt (from parse_tree.ML) **)
       
    35 
       
    36 fun mk_pt ("", [pt]) = pt
       
    37   | mk_pt ("", _) = error "mk_pt: funny copy op in parse tree"
       
    38   | mk_pt (s, ptl) = Node (s, ptl);
       
    39 
       
    40 
       
    41 
       
    42 (** token maps (from lexicon.ML) **)
       
    43 
       
    44 type 'b TokenMap = (token * 'b list) list * 'b list;
       
    45 val first_token = 0;
       
    46 
       
    47 fun int_of_token(Token(tk)) = first_token |
       
    48     int_of_token(IdentSy _) = first_token - 1 |
       
    49     int_of_token(VarSy _) = first_token - 2 |
       
    50     int_of_token(TFreeSy _) = first_token - 3 |
       
    51     int_of_token(TVarSy _) = first_token - 4 |
       
    52     int_of_token(EndToken) = first_token - 5;
       
    53 
       
    54 fun lesstk(s, t) = int_of_token(s) < int_of_token(t) orelse
       
    55                   (case (s, t) of (Token(a), Token(b)) => a<b | _ => false);
       
    56 
       
    57 fun mkTokenMap(atll) =
       
    58     let val aill = atll;
       
    59         val dom = sort lesstk (distinct(flat(map snd aill)));
       
    60         val mt = map fst (filter (null o snd) atll);
       
    61         fun mktm(i) =
       
    62             let fun add(l, (a, il)) = if i mem il then a::l else l
       
    63             in (i, foldl add ([], aill)) end;
       
    64     in (map mktm dom, mt) end;
       
    65 
       
    66 fun find_al (i) =
       
    67     let fun find((j, al)::l) = if lesstk(i, j) then [] else
       
    68                               if matching_tokens(i, j) then al else find l |
       
    69             find [] = [];
       
    70     in find end;
       
    71 fun applyTokenMap((l, mt), tk:token) = mt@(find_al tk l);
       
    72 
       
    73 
    29 
    74 
    30 (* Linked lists: *)
    75 (* Linked lists: *)
    31 infix 5 &;
    76 infix 5 &;
    32 datatype 'a LList = nilL | op & of 'a * ('a LListR)
    77 datatype 'a LList = nilL | op & of 'a * ('a LListR)
    33 withtype 'a LListR = 'a LList ref;
    78 withtype 'a LListR = 'a LList ref;
    55     let val ub = length b
   100     let val ub = length b
    56         fun acc(a,i) = if i=ub then a else acc(f(a,sub(b,i)),i+1)
   101         fun acc(a,i) = if i=ub then a else acc(f(a,sub(b,i)),i+1)
    57     in acc(a0,0) end;
   102     in acc(a0,0) end;
    58 
   103 
    59 (*
   104 (*
    60 Gram: name of nt -> number, nt number -> productions array,
   105 gram: name of nt -> number, nt number -> productions array,
    61       nt number -> list of nt's reachable via copy ops
   106       nt number -> list of nt's reachable via copy ops
    62 *)
   107 *)
    63 
   108 
    64 datatype Symbol = T of Token | NT of int * int
   109 datatype Symbol = T of token | NT of int * int
    65      and Op = Op of OpSyn * string * int
   110      and Op = Op of OpSyn * string * int
    66 withtype OpSyn = Symbol array
   111 withtype OpSyn = Symbol array
    67      and OpListA = Op array * int TokenMap
   112      and OpListA = Op array * int TokenMap
    68      and FastAcc = int TokenMap;
   113      and FastAcc = int TokenMap;
    69 
   114 
    70 type Gram = int Symtab.table * OpListA array * int list array;
   115 type gram = int Symtab.table * OpListA array * int list array;
    71 
   116 
    72 fun non_term(Nonterminal(s,_)) = if predef_term(s)=end_token then [s] else []
   117 fun non_term(Nonterminal(s,_)) = if predef_term(s)=EndToken then [s] else []
    73   | non_term(_) = [];
   118   | non_term(_) = [];
    74 
   119 
    75 fun non_terms(Prod(_,symbs,_,_)) = flat(map non_term symbs);
   120 fun non_terms(Prod(_,symbs,_,_)) = flat(map non_term symbs);
    76 
   121 
    77 (* mk_pre_grammar converts a list of productions in external format into an
   122 (* mk_pre_grammar converts a list of productions in external format into an
    78 internal Gram object. *)
   123 internal gram object. *)
    79 val dummyTM:FastAcc = mkTokenMap[];
   124 val dummyTM:FastAcc = mkTokenMap[];
    80 
   125 
    81 fun mk_pre_grammar(prods): Gram =
   126 fun mk_pre_grammar(prods): gram =
    82 let fun same_res(Prod(t1,_,_,_), Prod(t2,_,_,_)) = t1=t2;
   127 let fun same_res(Prod(t1,_,_,_), Prod(t2,_,_,_)) = t1=t2;
    83     val partitioned0 = partition_eq same_res prods;
   128     val partitioned0 = partition_eq same_res prods;
    84     val nts0 = map (fn Prod(ty,_,_,_)::_ => ty) partitioned0;
   129     val nts0 = map (fn Prod(ty,_,_,_)::_ => ty) partitioned0;
    85     val nts' = distinct(flat(map non_terms prods)) \\ nts0;
   130     val nts' = distinct(flat(map non_terms prods)) \\ nts0;
    86     val nts = nts' @ nts0;
   131     val nts = nts' @ nts0;
    88     val ntis = nts ~~ (0 upto (len(nts)-1));
   133     val ntis = nts ~~ (0 upto (len(nts)-1));
    89     val tab = foldr Symtab.update (ntis,Symtab.null);
   134     val tab = foldr Symtab.update (ntis,Symtab.null);
    90 
   135 
    91     fun nt_or_vt(s,p) =
   136     fun nt_or_vt(s,p) =
    92         (case predef_term(s) of
   137         (case predef_term(s) of
    93            end_token => let val Some(i) = Symtab.lookup(tab,s) in NT(i,p) end
   138            EndToken => let val Some(i) = Symtab.lookup(tab,s) in NT(i,p) end
    94          | tk => T(tk));
   139          | tk => T(tk));
    95 
   140 
    96     fun mksyn(Terminal(t)) = [T(t)]
   141     fun mksyn(Terminal(t)) = [T(t)]
    97       | mksyn(Nonterminal(t)) = [nt_or_vt t]
   142       | mksyn(Nonterminal(t)) = [nt_or_vt t]
    98       | mksyn(_) = [];
   143       | mksyn(_) = [];
   104 
   149 
   105     val opLA = arrayoflist(map mkops partitioned);
   150     val opLA = arrayoflist(map mkops partitioned);
   106 
   151 
   107     val subs = array(length opLA,[]) : int list array;
   152     val subs = array(length opLA,[]) : int list array;
   108     fun newcp v (a,Op(syA,_,p)) =
   153     fun newcp v (a,Op(syA,_,p)) =
   109         if p=copy_pri
   154         if p=chain_pri
   110         then let val NT(k,_) = sub(syA,0)
   155         then let val NT(k,_) = sub(syA,0)
   111              in if k mem v then a else k ins a end
   156              in if k mem v then a else k ins a end
   112         else a;
   157         else a;
   113     fun reach v i =
   158     fun reach v i =
   114         let val new = itA ([],#1(sub(opLA,i))) (newcp v)
   159         let val new = itA ([],#1(sub(opLA,i))) (newcp v)
   121 val RootPref = "__";
   166 val RootPref = "__";
   122 
   167 
   123 (* Lookahead tables for speeding up parsing. Lkhd is a mapping from
   168 (* Lookahead tables for speeding up parsing. Lkhd is a mapping from
   124 nonterminals (in the form of OpList) to sets (lists) of token strings *)
   169 nonterminals (in the form of OpList) to sets (lists) of token strings *)
   125 
   170 
   126 type Lkhd = Token list list list;
   171 type Lkhd = token list list list;
   127 
   172 
   128 (* subst_syn(s,k) syn = [ pref k ts | ts is a token string derivable from sy
   173 (* subst_syn(s,k) syn = [ pref k ts | ts is a token string derivable from sy
   129                                       under substitution s ] *)
   174                                       under substitution s ] *)
   130 (* This is the general version.
   175 (* This is the general version.
   131 fun cross l1 l2 = flat(map (fn e2 => (map (fn e1 => e1@e2) l1)) l2);
   176 fun cross l1 l2 = flat(map (fn e2 => (map (fn e1 => e1@e2) l1)) l2);
   134 fun pref 0 l = []
   179 fun pref 0 l = []
   135   | pref _ [] = []
   180   | pref _ [] = []
   136   | pref k (e::l) = e::(pref (k-1) l);
   181   | pref k (e::l) = e::(pref (k-1) l);
   137 
   182 
   138 fun subst_syn(s:Lkhd,k) =
   183 fun subst_syn(s:Lkhd,k) =
   139     let fun subst(ref(symb & syn)):Token list list =
   184     let fun subst(ref(symb & syn)):token list list =
   140               let val l1 = case symb of T t => [[t]] |
   185               let val l1 = case symb of T t => [[t]] |
   141                          NT(oplr,_) => let val Some l = assoc(s,!oplr) in l end
   186                          NT(oplr,_) => let val Some l = assoc(s,!oplr) in l end
   142               in distinct(map (pref k) (cross l1 (subst syn))) end |
   187               in distinct(map (pref k) (cross l1 (subst syn))) end |
   143             subst _ = [[]]
   188             subst _ = [[]]
   144     in subst end;
   189     in subst end;
   166                  else iterate s' end
   211                  else iterate s' end
   167     in writeln"Computing lookahead tables ...";
   212     in writeln"Computing lookahead tables ...";
   168        iterate (replicate (length opLA) []) end;
   213        iterate (replicate (length opLA) []) end;
   169 
   214 
   170 (* create look ahead tables *)
   215 (* create look ahead tables *)
   171 fun mk_earley_gram(g as (tab,opLA,_):Gram):Gram =
   216 fun mk_earley_gram(g as (tab,opLA,_):gram):gram =
   172     let val lkhd = mk_lkhd(opLA,1);
   217     let val lkhd = mk_lkhd(opLA,1);
   173         fun mk_fa(i):FastAcc =
   218         fun mk_fa(i):FastAcc =
   174             let val opA = #1(sub(opLA,i));
   219             let val opA = #1(sub(opLA,i));
   175                 fun start(j) = let val Op(sy,_,_) = sub(opA,j);
   220                 fun start(j) = let val Op(sy,_,_) = sub(opA,j);
   176                                    val pre = subst_syn(lkhd,1) sy
   221                                    val pre = subst_syn(lkhd,1) sy
   180 
   225 
   181     in forA(updt,opLA); g end;
   226     in forA(updt,opLA); g end;
   182 
   227 
   183 fun compile_xgram(roots,prods) =
   228 fun compile_xgram(roots,prods) =
   184       let fun mk_root nt = Prod(RootPref^nt,
   229       let fun mk_root nt = Prod(RootPref^nt,
   185                 [Nonterminal(nt,0),Terminal(end_token)],"",0);
   230                 [Nonterminal(nt,0),Terminal(EndToken)],"",0);
   186           val prods' = (map mk_root roots) @ prods
   231           val prods' = (map mk_root roots) @ prods
   187       in mk_earley_gram(mk_pre_grammar(prods')) end;
   232       in mk_earley_gram(mk_pre_grammar(prods')) end;
       
   233 
       
   234 
       
   235 (* translate (from xgram.ML) *)
       
   236 
       
   237 fun translate trfn =
       
   238   map
       
   239     (fn Terminal t => Terminal (trfn t)
       
   240       | Nonterminal s => Nonterminal s
       
   241       | Space s => Space s
       
   242       | Bg i => Bg i
       
   243       | Brk i => Brk i
       
   244       | En => En);
       
   245 
       
   246 
       
   247 (* mk_gram (from syntax.ML) *)
       
   248 
       
   249 fun str_to_tok (opl: string prod list): token prod list =
       
   250   map
       
   251     (fn Prod (t, syn, s, pa) => Prod (t, translate Token syn, s, pa))
       
   252     opl;
       
   253 
       
   254 fun mk_gram roots prods = compile_xgram (roots, str_to_tok prods);
       
   255 
       
   256 
   188 
   257 
   189 (* State: nonterminal#, production#, index in production,
   258 (* State: nonterminal#, production#, index in production,
   190           index of originating state set,
   259           index of originating state set,
   191           parse trees generated so far,
   260           parse trees generated so far,
   192 *)
   261 *)
   193 
   262 
   194 datatype State = St of int * int * int * int * ParseTree list
   263 datatype State = St of int * int * int * int * parsetree list
   195 withtype StateSet  = State LListR * (State -> unit) LListR;
   264 withtype StateSet  = State LListR * (State -> unit) LListR;
   196 type Compl = State -> unit;
   265 type Compl = State -> unit;
   197 type StateSetList = StateSet array;
   266 type StateSetList = StateSet array;
   198 (* Debugging:
   267 (* Debugging:
   199 val print_SL = seqll(fn St(nti,pi,ip,fs,ptl)=>
   268 val print_SL = seqll(fn St(nti,pi,ip,fs,ptl)=>
   253         fun add(pi) = let val opr as Op(syA,_,p) = sub(opA,pi)
   322         fun add(pi) = let val opr as Op(syA,_,p) = sub(opA,pi)
   254                 in if p < p' then () else add_state(nti,pi,0,isi,[],si) end
   323                 in if p < p' then () else add_state(nti,pi,0,isi,[],si) end
   255     in seq add (applyTokenMap(tm,tk)) end;
   324     in seq add (applyTokenMap(tm,tk)) end;
   256 
   325 
   257 
   326 
   258 fun parsable((tab,_,_):Gram, root:string) =
   327 (*(* FIXME *)
       
   328 fun parsable((tab,_,_):gram, root:string) =
   259         not(Symtab.lookup(tab,RootPref^root) = None);
   329         not(Symtab.lookup(tab,RootPref^root) = None);
   260 
   330 *)
   261 exception SYNTAX_ERR of Token list;
   331 
   262 
   332 (* exception SYNTAX_ERROR of token list; *) (* FIXME *)
   263 fun unknown c = error("System Error - Trying to parse unknown category "^c);
   333 
   264 
   334 fun unknown c = error ("Unparsable category: " ^ c);
   265 fun parse((tab,opLA,rchA):Gram, root:string, tl: Token list): ParseTree =
   335 
   266     let val tl' = tl@[end_token];
   336 fun syn_err toks = error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
       
   337 
       
   338 fun parse ((tab,opLA,rchA):gram) (root:string) (tl: token list): parsetree =
       
   339     let val tl' = tl;   (* FIXME was ..@[EndToken] *)
   267         val state_sets = mt_states(len tl' +1);
   340         val state_sets = mt_states(len tl' +1);
   268         val s0 = mt_stateS();
   341         val s0 = mt_stateS();
   269         val rooti = case Symtab.lookup(tab,RootPref^root) of
   342         val rooti = case Symtab.lookup(tab,RootPref^root) of
   270                 Some(ri) => ri | None => unknown root;
   343                 Some(ri) => ri | None => unknown root;
   271 
   344 
   272         fun lr (tl,isi,si,t) =
   345         fun lr (tl,isi,si,t) =
   273             if ismt_stateS(si) then raise SYNTAX_ERR(t::tl) else
   346 (*            if ismt_stateS(si) then raise SYNTAX_ERROR(t::tl) else  *)  (* FIXME *)
       
   347             if ismt_stateS(si) then syn_err (t::tl) else
   274             case tl of
   348             case tl of
   275               [] => () |
   349               [] => () |
   276               t::tl =>
   350               t::tl =>
   277                 let val si1 = mt_stateS();
   351                 let val si1 = mt_stateS();
   278                     fun process(St(nti,pi,ip,from,ptl)) =
   352                     fun process(St(nti,pi,ip,from,ptl)) =
   296                update(state_sets,isi+1,si1);
   370                update(state_sets,isi+1,si1);
   297                lr(tl,isi+1,si1,t) end
   371                lr(tl,isi+1,si1,t) end
   298 
   372 
   299     in update(state_sets,0,s0);
   373     in update(state_sets,0,s0);
   300        add_state(rooti,0,0,0,[],s0);
   374        add_state(rooti,0,0,0,[],s0);
   301        lr(tl',0,s0,end_token(*dummy*));
   375        lr(tl',0,s0,EndToken(*dummy*));
   302        (*print_stat state_sets;*)
   376        (*print_stat state_sets;*)
   303        let val St(_,_,_,_,[pt]) = fst_state(sub(state_sets,len tl'))
   377        let val St(_,_,_,_,[pt]) = fst_state(sub(state_sets,len tl'))
   304        in pt end
   378        in pt end
   305     end;
   379     end;
   306 
   380 
   307 fun print_gram ((st,opAA,rchA):Gram,lex) =
   381 
       
   382 (*(* FIXME *)
       
   383 fun print_gram ((st,opAA,rchA):gram,lex) =
   308     let val tts = Lexicon.name_of_token;
   384     let val tts = Lexicon.name_of_token;
   309         val al = map (fn (x,y)=>(y,x)) (Symtab.alist_of st);
   385         val al = map (fn (x,y)=>(y,x)) (Symtab.alist_of st);
   310         fun nt i = let val Some(s) = assoc(al,i) in s end;
   386         fun nt i = let val Some(s) = assoc(al,i) in s end;
   311         fun print_sy(T(end_token)) = prs". " |
   387         fun print_sy(T(EndToken)) = prs". " |
   312             print_sy(T(tk)) = (prs(tts tk); prs" ") |
   388             print_sy(T(tk)) = (prs(tts tk); prs" ") |
   313             print_sy(NT(i,p)) = (prs((nt i)^"[");print_int p;prs"] ");
   389             print_sy(NT(i,p)) = (prs((nt i)^"[");print_int p;prs"] ");
   314         fun print_opA(i) =
   390         fun print_opA(i) =
   315                 let val lhs = nt i;
   391                 let val lhs = nt i;
   316                     val (opA,_)=sub(opAA,i);
   392                     val (opA,_)=sub(opAA,i);
   322                         end;
   398                         end;
   323                 in forA(print_op,opA) end;
   399                 in forA(print_op,opA) end;
   324         fun print_rch(i) = (print_int i; prs" -> ";
   400         fun print_rch(i) = (print_int i; prs" -> ";
   325                             print_list("[","]\n",print_int) (sub(rchA,i)))
   401                             print_list("[","]\n",print_int) (sub(rchA,i)))
   326     in forA(print_opA,opAA) (*; forA(print_rch,rchA) *) end;
   402     in forA(print_opA,opAA) (*; forA(print_rch,rchA) *) end;
       
   403 *)
   327 
   404 
   328 end;
   405 end;
   329 
   406 
   330 end;
   407 end;
       
   408