src/Pure/Syntax/parser.ML
changeset 237 a7d3e712767a
parent 46 f0f4978af183
child 258 e540b7d4ecb1
equal deleted inserted replaced
236:d33cd0011e48 237:a7d3e712767a
     3     Author:     Sonia Mahjoub and Markus Wenzel, TU Muenchen
     3     Author:     Sonia Mahjoub and Markus Wenzel, TU Muenchen
     4 
     4 
     5 Isabelle's main parser (used for terms and typs).
     5 Isabelle's main parser (used for terms and typs).
     6 
     6 
     7 TODO:
     7 TODO:
       
     8   fix ambiguous grammar problem
     8   ~1 --> chain_pri
     9   ~1 --> chain_pri
     9   rename T, NT
       
    10   improve syntax error
    10   improve syntax error
    11   fix ambiguous grammar problem
    11   extend_gram: remove 'roots' arg
    12 *)
    12 *)
    13 
    13 
    14 signature PARSER =
    14 signature PARSER =
    15 sig
    15 sig
    16   structure XGram: XGRAM
    16   structure Lexicon: LEXICON
    17   structure ParseTree: PARSE_TREE
    17   structure SynExt: SYN_EXT
    18   local open XGram ParseTree ParseTree.Lexicon in
    18   local open Lexicon SynExt SynExt.Ast in
    19     type gram
    19     type gram
    20     val empty_gram: gram
    20     val empty_gram: gram
    21     val extend_gram: gram -> string list -> string prod list -> gram
    21     val extend_gram: gram -> string list -> xprod list -> gram
    22     val mk_gram: string list -> string prod list -> gram
    22     val merge_grams: gram -> gram -> gram
       
    23     val pretty_gram: gram -> Pretty.T list
       
    24     datatype parsetree =
       
    25       Node of string * parsetree list |
       
    26       Tip of token
    23     val parse: gram -> string -> token list -> parsetree
    27     val parse: gram -> string -> token list -> parsetree
    24   end
    28   end
    25 end;
    29 end;
    26 
    30 
    27 functor ParserFun(structure Symtab: SYMTAB and XGram: XGRAM
    31 functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON
    28   and ParseTree: PARSE_TREE)(*: PARSER *) =  (* FIXME *)
    32   and SynExt: SYN_EXT)(*: PARSER *) =  (* FIXME *)
    29 struct
    33 struct
    30 
    34 
    31 structure XGram = XGram;
    35 structure Pretty = SynExt.Ast.Pretty;
    32 structure ParseTree = ParseTree;
    36 structure Lexicon = Lexicon;
    33 structure Lexicon = ParseTree.Lexicon;
    37 structure SynExt = SynExt;
    34 open XGram ParseTree Lexicon;
    38 open Lexicon SynExt;
    35 
    39 
    36 
    40 
    37 (** datatype gram **)
    41 (** datatype gram **)
    38 
    42 
    39 datatype symb = T of token | NT of string * int;
    43 datatype symb =
       
    44   Terminal of token |
       
    45   Nonterminal of string * int;
    40 
    46 
    41 datatype gram =
    47 datatype gram =
    42   Gram of string list * (symb list * string * int) list Symtab.table;
    48   Gram of (string * (symb list * string * int)) list
    43 
    49     * (symb list * string * int) list Symtab.table;
    44 
    50 
    45 (* empty_gram *)
    51 fun mk_gram prods = Gram (prods, Symtab.make_multi prods);
    46 
    52 
    47 val empty_gram = Gram ([], Symtab.null);
    53 
    48 
    54 (* empty, extend, merge grams *)
    49 
    55 
    50 (* extend_gram *)
    56 val empty_gram = mk_gram [];
    51 
    57 
    52 (*prods are stored in reverse order*)
    58 fun extend_gram (gram1 as Gram (prods1, _)) _ xprods2 =
    53 
       
    54 fun extend_gram (Gram (roots, tab)) new_roots xprods =
       
    55   let
    59   let
    56     fun symb_of (Terminal s) = Some (T (Token s))
    60     fun symb_of (Delim s) = Some (Terminal (Token s))
    57       | symb_of (Nonterminal (s, p)) =
    61       | symb_of (Argument (s, p)) =
    58           (case predef_term s of
    62           (case predef_term s of
    59             EndToken => Some (NT (s, p))
    63             None => Some (Nonterminal (s, p))
    60           | tk => Some (T tk))
    64           | Some tk => Some (Terminal tk))
    61       | symb_of _ = None;
    65       | symb_of _ = None;
    62 
    66 
    63     fun prod_of (Prod (s, xsyms, c, p)) = (s, mapfilter symb_of xsyms, c, p);
    67     fun prod_of (XProd (lhs, xsymbs, const, pri)) =
    64 
    68       (lhs, (mapfilter symb_of xsymbs, const, pri));
    65     fun add_prod (tab, (s, syms, c, p)) =
    69 
    66       (case Symtab.lookup (tab, s) of
    70     val prods2 = distinct (map prod_of xprods2);
    67         None => Symtab.update ((s, [(syms, c, p)]), tab)
       
    68       | Some prods => Symtab.update ((s, (syms, c, p) :: prods), tab));
       
    69   in
    71   in
    70     Gram (new_roots union roots,
    72     if prods2 subset prods1 then gram1
    71       Symtab.balance (foldl add_prod (tab, map prod_of xprods)))
    73     else mk_gram (extend_list prods1 prods2)
    72   end;
    74   end;
    73 
    75 
    74 
    76 fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) =
    75 (* mk_gram *)
    77   if prods2 subset prods1 then gram1
    76 
    78   else if prods1 subset prods2 then gram2
    77 val mk_gram = extend_gram empty_gram;
    79   else mk_gram (merge_lists prods1 prods2);
    78 
    80 
    79 
    81 
    80 (* get_prods *)
    82 (* pretty_gram *)
    81 
    83 
    82 fun get_prods tab s pred =
    84 fun pretty_gram (Gram (prods, _)) =
    83   let
    85   let
    84     fun rfilter [] ys = ys
    86     fun pretty_name name = [Pretty.str (name ^ " =")];
    85       | rfilter (x :: xs) ys = rfilter xs (if pred x then x :: ys else ys);
    87 
       
    88     fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s)
       
    89       | pretty_symb (Terminal tok) = Pretty.str (str_of_token tok)
       
    90       | pretty_symb (Nonterminal (s, p)) =
       
    91           Pretty.str (s ^ "[" ^ string_of_int p ^ "]");
       
    92 
       
    93     fun pretty_const "" = []
       
    94       | pretty_const c = [Pretty.str ("=> " ^ quote c)];
       
    95 
       
    96     fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")];
       
    97 
       
    98     fun pretty_prod (name, (symbs, const, pri)) =
       
    99       Pretty.block (Pretty.breaks (pretty_name name @
       
   100         map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
    86   in
   101   in
    87     (case Symtab.lookup (tab, s) of
   102     map pretty_prod prods
    88       Some prods => rfilter prods []
       
    89     | None => [])
       
    90   end;
   103   end;
    91 
   104 
    92 
   105 
    93 
   106 
    94 (** parse **)
   107 (** parse **)
       
   108 
       
   109 datatype parsetree =
       
   110   Node of string * parsetree list |
       
   111   Tip of token;
    95 
   112 
    96 type state =
   113 type state =
    97   string * int
   114   string * int
    98     * parsetree list    (*before point*)
   115     * parsetree list    (*before point*)
    99     * symb list         (*after point*)
   116     * symb list         (*after point*)
   101 
   118 
   102 type earleystate = state list Array.array;
   119 type earleystate = state list Array.array;
   103 
   120 
   104 
   121 
   105 
   122 
       
   123 fun get_prods tab lhs pred =
       
   124   filter pred (Symtab.lookup_multi (tab, lhs));
       
   125 
   106 fun getProductions tab A i =
   126 fun getProductions tab A i =
   107   get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1);
   127   get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1);
   108 
   128 
   109 fun getProductions' tab A i k =
   129 fun getProductions' tab A i k =
   110   get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1);
   130   get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1);
   111 
   131 
   112 
   132 
   113 
   133 
   114 fun mkState i jj A ([NT(B,~1)],id,~1) =
   134 fun mkState i jj A ([Nonterminal (B, ~1)], id, ~1) =
   115                      (A,max_pri,[],[NT (B,jj)],id,i)
   135       (A, max_pri, [], [Nonterminal (B, jj)], id, i)
   116   | mkState i jj A (ss,id,j) = (A,j,[],ss,id,i) ;
   136   | mkState i jj A (ss, id, j) = (A, j, [], ss, id, i);
   117 
   137 
   118 
   138 
   119 
   139 
   120 fun conc (t,(k:int)) [] = (None, [(t,k)])
   140 fun conc (t, k:int) [] = (None, [(t, k)])
   121   | conc (t,k) ((t',k')::ts) =
   141   | conc (t, k) ((t', k') :: ts) =
   122       if (t = t')
   142       if t = t' then
   123       then (Some k', ( if k' >= k
   143         (Some k', if k' >= k then (t', k') :: ts else (t, k) :: ts)
   124                        then (t',k')::ts
       
   125                        else (t,k)::ts )
       
   126            )
       
   127       else let val (n, ts') = conc (t,k) ts
       
   128            in (n, (t',k')::ts')
       
   129            end;
       
   130 
       
   131 
       
   132 fun update_tree ((B,(i,ts))::used) (A,t) =
       
   133       if   (A = B)
       
   134       then
       
   135           let val (n,ts') = conc t ts
       
   136           in  ((A,(i,ts'))::used, n)
       
   137           end
       
   138       else
   144       else
   139           let val (used', n) = update_tree used (A,t)
   145         let val (n, ts') = conc (t, k) ts
   140           in  ((B,(i,ts)) :: used', n)
   146         in (n, (t', k') :: ts') end;
   141           end;
   147 
   142 
   148 
   143 
   149 fun update_tree ((B, (i, ts)) :: used) (A, t) =
   144 
   150   if A = B then
   145 fun update_index ((B,(i,ts))::used) (A,j) =
   151     let val (n, ts') = conc t ts
   146       if   (A = B)
   152     in ((A, (i, ts')) :: used, n) end
   147       then (A,(j,ts)) :: used
   153   else
   148       else (B,(i,ts)) :: (update_index used (A,j));
   154     let val (used', n) = update_tree used (A, t)
       
   155     in ((B, (i, ts)) :: used', n) end;
       
   156 
       
   157 
       
   158 fun update_index ((B, (i, ts)) :: used) (A, j) =
       
   159   if A = B then (A, (j, ts)) :: used
       
   160   else (B, (i, ts)) :: (update_index used (A, j));
   149 
   161 
   150 
   162 
   151 
   163 
   152 fun getS A i Si =
   164 fun getS A i Si =
   153        filter (fn (_,_,_,(NT(B,j))::_,_,_)
   165   filter
   154                                     => (A=B andalso j<=i)
   166     (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= i
   155                |      _             => false
   167       | _ => false) Si;
   156               ) Si;
       
   157 
       
   158 
       
   159 
   168 
   160 fun getS' A k n Si =
   169 fun getS' A k n Si =
   161        filter (fn (_,_,_,(NT(B,j))::_,_,_) => (A=B andalso
   170   filter
   162                                                j<=k andalso
   171     (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k andalso j > n
   163                                                j>n )
   172       | _ => false) Si;
   164                |      _                    => false
       
   165               ) Si;
       
   166 
       
   167 
       
   168 
   173 
   169 fun getStates Estate i ii A k =
   174 fun getStates Estate i ii A k =
   170        filter (fn (_,_,_,(NT(B,j))::_,_,_)
   175   filter
   171                                     => (A=B andalso j<=k)
   176     (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k
   172                |      _             => false
   177       | _ => false)
   173               )
   178     (Array.sub (Estate, ii));
   174               (Array.sub (Estate, ii))
       
   175 ;
       
   176 
       
   177 
   179 
   178 
   180 
   179 (* MMW *)(* FIXME *)
   181 (* MMW *)(* FIXME *)
   180 fun movedot_term (A,j,ts,T a::sa,id,i) c =
   182 fun movedot_term (A, j, ts, Terminal a :: sa, id, i) c =
   181                    if (valued_token c)
   183   if valued_token c then
   182                    then (A,j,(ts@[Tip c]),sa,id,i)
   184     (A, j, (ts @ [Tip c]), sa, id, i)
   183                    else (A,j,ts,sa,id,i) ;
   185   else (A, j, ts, sa, id, i);
   184 
   186 
   185 
   187 fun movedot_nonterm ts (A, j, tss, Nonterminal (B, k) :: sa, id, i) =
   186 
   188   (A, j, tss @ ts, sa, id, i);
   187 fun movedot_nonterm ts  (A,j,tss,NT(B,k) ::sa,id,i) =
   189 
   188                          (A,j,tss@ts,sa,id,i) ;
   190 fun movedot_lambda _ [] = []
   189 
   191   | movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ((t, ki) :: ts) =
   190 
   192       if k <= ki then
   191 
   193         (B, j, tss @ t, sa, id, i) ::
   192 fun movedot_lambda  _  [] = []
   194           movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts
   193   | movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ((t,ki)::ts) =
   195       else movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts;
   194     if k <= ki
       
   195     then (B,j,tss@t,sa,id,i) ::
       
   196          (movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts)
       
   197     else movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts
       
   198   ;
       
   199 
       
   200 
   196 
   201 
   197 
   202 
   198 
   203 fun PROCESSS Estate grammar i c states =
   199 fun PROCESSS Estate grammar i c states =
   204 
       
   205 let
   200 let
   206 fun processS used [] (Si,Sii) = (Si,Sii)
   201 fun processS used [] (Si, Sii) = (Si, Sii)
   207  |  processS used (S::States) (Si,Sii) =
   202   | processS used (S :: States) (Si, Sii) =
   208      ( case S of
   203       (case S of
   209 
   204         (_, _, _, Nonterminal (A, j) :: _, _, _) =>
   210         (_,_,_,(NT (A,j))::_,_,_)  =>
       
   211           let
   205           let
   212             val (used_neu, States_neu) =
   206             val (used_neu, States_neu) =
   213               ( case assoc (used, A) of
   207               (case assoc (used, A) of
   214                  Some (k,l) =>         (* A gehoert zu used *)
   208                 Some (k, l) =>      (*A gehoert zu used*)
   215 
   209                   if k <= j         (*Prioritaet wurde behandelt*)
   216                    if (k <= j)          (* Prioritaet wurde
   210                   then (used, movedot_lambda S l)
   217                                            behandelt  *)
   211                   else              (*Prioritaet wurde noch nicht behandelt*)
   218                    then
   212                     let
   219                         (used, movedot_lambda S l)
   213                       val L = getProductions' grammar A j k;
   220 
   214                       val States' = map (mkState i j A) L;
   221                    else         (* Prioritaet wurde noch nicht
   215                     in
   222                                         behandelt  *)
   216                       (update_index used (A, j), States' @ movedot_lambda S l)
   223                         let val L =
   217                     end
   224                                 getProductions' grammar A j k
   218 
   225                             val States' = map (mkState i j A) L
   219               | None =>             (*A gehoert nicht zu used*)
   226                         in
   220                   let
   227                             (update_index used (A,j),
   221                     val L = getProductions grammar A j;
   228                              States'@ movedot_lambda S l
   222                     val States' = map (mkState i j A) L;
   229                             )
   223                   in
   230                         end
   224                     ((A, (j, [])) :: used, States')
   231 
   225                   end)
   232                 | None =>       (* A gehoert nicht zu used *)
   226           in
   233                       let val L = getProductions grammar A j
   227             processS used_neu (States @ States_neu) (S :: Si, Sii)
   234                           val States' = map (mkState i j A) L
   228           end
       
   229       | (_, _, _, Terminal a :: _, _, _) =>
       
   230           processS used States
       
   231             (S :: Si,
       
   232               if matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
       
   233                                           (* MMW *)(* FIXME *)
       
   234 
       
   235       | (A, k, ts, [], id, j) =>
       
   236           let
       
   237             val tt = if id = "" then ts else [Node (id, ts)]
       
   238           in
       
   239             if j = i then
       
   240               let
       
   241                 val (used', O) = update_tree used (A, (tt, k));
       
   242               in
       
   243                 (case O of
       
   244                   None =>
       
   245                     let
       
   246                       val Slist = getS A k Si;
       
   247                       val States' = map (movedot_nonterm tt) Slist;
       
   248                     in
       
   249                       processS used' (States @ States') (S :: Si, Sii)
       
   250                     end
       
   251                 | Some n =>
       
   252                     if n >= k then
       
   253                       processS used' States (S :: Si, Sii)
       
   254                     else
       
   255                       let
       
   256                         val Slist = getS' A k n Si;
       
   257                         val States' = map (movedot_nonterm tt) Slist;
   235                       in
   258                       in
   236                           ((A,(j,[]))::used, States')
   259                         processS used' (States @ States') (S :: Si, Sii)
   237                       end
   260                       end)
   238               )
   261               end
   239           in
   262             else
   240              processS used_neu (States @ States_neu) (S::Si,Sii)
   263               processS used
   241           end
   264                 (States @ map (movedot_nonterm tt) (getStates Estate i j A k))
   242 
   265                 (S :: Si, Sii)
   243      |  (_,_,_,(T a)::_,_,_)  =>
   266           end)
   244                 processS used States
       
   245                     (S::Si, if (matching_tokens (a, c))
       
   246                             then  (movedot_term S c)::Sii   (* MMW *)(* FIXME *)
       
   247                             else Sii
       
   248                     )
       
   249 
       
   250 
       
   251      |  (A,k,ts,[],id,j) =>
       
   252           let val tt = if id = ""
       
   253                        then ts
       
   254                        else [Node(id,ts)]
       
   255           in
       
   256              if (j = i)
       
   257              then
       
   258                 let val (used',O) = update_tree used (A,(tt,k))
       
   259                 in ( case O of
       
   260                       None =>
       
   261                          let val Slist = getS A k Si
       
   262                              val States' =
       
   263                               map (movedot_nonterm tt ) Slist
       
   264                          in  processS used'
       
   265                              (States @ States') (S::Si,Sii)
       
   266                          end
       
   267                    |  Some n =>
       
   268                          if (n >= k)
       
   269                          then
       
   270                             processS used' States (S::Si,Sii)
       
   271                          else
       
   272                             let val Slist = getS' A k n Si
       
   273                                 val States' =
       
   274                               map (movedot_nonterm tt ) Slist
       
   275                             in
       
   276                               processS used'
       
   277                               (States @ States') (S::Si,Sii)
       
   278                             end
       
   279                    )
       
   280                 end
       
   281 
       
   282               else
       
   283                 processS used (States @
       
   284                                map (movedot_nonterm tt)
       
   285                                (getStates Estate i j A k))
       
   286                 (S::Si,Sii)
       
   287            end
       
   288      )
       
   289 in
   267 in
   290    processS [] states ([],[])
   268   processS [] states ([], [])
   291 end;
   269 end;
   292 
   270 
   293 
   271 
   294 
   272 
   295 fun syntax_error toks =
   273 fun syntax_error toks =
   296   error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
   274   error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
   297 
   275 
   298 fun Produce grammar stateset i indata =
   276 fun produce grammar stateset i indata =
   299       case (Array.sub (stateset,i)) of
   277   (case Array.sub (stateset, i) of
   300          [] => syntax_error indata (* MMW *)(* FIXME *)
   278     [] => syntax_error indata (* MMW *)(* FIXME *)
   301       |  s  =>
   279   | s =>
   302            (case indata of
   280     (case indata of
   303                 []  => Array.sub (stateset,i)
   281       [] => Array.sub (stateset, i)
   304              |  (c::cs) =>
   282     | c :: cs =>
   305                        let val (si,sii) =
   283       let
   306                            PROCESSS stateset grammar i c s
   284         val (si, sii) = PROCESSS stateset grammar i c s;
   307                        in
   285       in
   308                            Array.update (stateset,i,si);
   286         Array.update (stateset, i, si);
   309                            Array.update (stateset,i+1,sii);
   287         Array.update (stateset, i + 1, sii);
   310                            Produce grammar stateset (i+1) cs
   288         produce grammar stateset (i + 1) cs
   311                        end
   289       end));
   312            );
   290 
   313 
   291 
       
   292 (*(* FIXME new *)
       
   293 val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None);
       
   294 *)
   314 
   295 
   315 fun get_trees [] = []
   296 fun get_trees [] = []
   316   | get_trees ((_,_,pt_lst,_,_,_) :: stateset) =
   297   | get_trees ((_, _, pt_lst, _, _, _) :: stateset) =
   317        let val l = get_trees stateset
   298       let val l = get_trees stateset
   318        in case pt_lst of
   299       in case pt_lst of
   319             [ptree] => ptree :: l
   300            [ptree] => ptree :: l
   320           |   _     => l
   301          |   _     => l
   321        end;
   302       end;
   322 
       
   323 
       
   324 
   303 
   325 fun earley grammar startsymbol indata =
   304 fun earley grammar startsymbol indata =
   326    let  val S0 =
   305   let
   327        [("S'",0,[],[NT (startsymbol,0), T EndToken],"",0)]
   306     val S0 = [("S'", 0, [], [Nonterminal (startsymbol, 0), Terminal EndToken], "", 0)];
   328         val s = length indata + 1     (* MMW *)(* FIXME was .. + 2 *)
   307     val s = length indata + 1;     (* MMW *)(* FIXME was .. + 2 *)
   329         val Estate = Array.array (s, [])
   308     val Estate = Array.array (s, []);
   330    in   Array.update (Estate,0,S0);
   309   in
   331         let val l = Produce grammar Estate 0 indata    (* MMW *)(* FIXME was .. @ [EndToken] *)
   310     Array.update (Estate, 0, S0);
   332             val p_trees = get_trees l
   311     let
   333         in p_trees
   312       val l = produce grammar Estate 0 indata;    (* MMW *)(* FIXME was .. @ [EndToken] *)
   334         end
   313       val p_trees = get_trees l;
   335    end;
   314     in p_trees end
       
   315   end;
   336 
   316 
   337 
   317 
   338 (* FIXME demo *)
   318 (* FIXME demo *)
   339 fun parse (Gram (roots, prod_tab)) root toks =
   319 fun parse (Gram (_, prod_tab)) start toks =
   340   if root mem roots then
   320   (case earley prod_tab start toks of
   341     (case earley prod_tab root toks of
   321     [] => sys_error "parse: no parse trees"
   342       [] => error "parse: no parse trees"
   322   | pt :: _ => pt);
   343     | pt :: _ => pt)
       
   344   else error ("Unparsable category: " ^ root);
       
   345 
   323 
   346 
   324 
   347 end;
   325 end;
   348 
   326