src/Pure/Syntax/parser.ML
author clasohm
Thu May 19 13:13:27 1994 +0200 (1994-05-19 ago)
changeset 377 ab8917806779
parent 373 68400ea32f7b
child 395 712dceb1ecc7
permissions -rw-r--r--
lookaheads are now computed faster (during the grammar is built)
     1 (*  Title:      Pure/Syntax/parser.ML
     2     ID:         $Id$
     3     Author:     Sonia Mahjoub, Markus Wenzel and Carsten Clasohm, TU Muenchen
     4 
     5 Isabelle's main parser (used for terms and typs).
     6 
     7 TODO:
     8   extend_gram: remove 'roots' arg
     9 *)
    10 
    11 signature PARSER =
    12 sig
    13   structure Lexicon: LEXICON
    14   structure SynExt: SYN_EXT
    15   local open Lexicon SynExt SynExt.Ast in
    16     type gram
    17     val empty_gram: gram
    18     val extend_gram: gram -> string list -> xprod list -> gram
    19     val merge_grams: gram -> gram -> gram
    20     val pretty_gram: gram -> Pretty.T list
    21     datatype parsetree =
    22       Node of string * parsetree list |
    23       Tip of token
    24     val parse: gram -> string -> token list -> parsetree list
    25   end
    26 end;
    27 
    28 functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON
    29   and SynExt: SYN_EXT) =
    30 struct
    31 
    32 structure Pretty = SynExt.Ast.Pretty;
    33 structure Lexicon = Lexicon;
    34 structure SynExt = SynExt;
    35 open Lexicon SynExt;
    36 
    37 
    38 (** datatype gram **)
    39 
    40 datatype symb =
    41   Terminal of token |
    42   Nonterminal of string * int;
    43 
    44 datatype refsymb = Term of token | Nonterm of rhss_ref * int
    45                                 (*reference to production list instead of name*)
    46 and gram = Gram of (string * (symb list * string * int)) list *
    47                    (string * rhss_ref) list
    48 withtype rhss_ref = (token option * (refsymb list * string * int) list) list ref
    49                                       (*lookahead table: token and productions*)
    50 
    51 (* convert productions to reference grammar with lookaheads and eliminate
    52    chain productions *)
    53 fun mk_gram prods =
    54   let val _ = writeln "Building new grammar...";
    55 
    56       (*get reference on list of all possible rhss for nonterminal lhs
    57         (if it doesn't exist a new one is created and added to the nonterminal
    58          list)*)
    59       fun get_rhss ref_prods lhs =
    60         case assoc (ref_prods, lhs) of
    61             None =>
    62               let val l = ref [(None, [])]
    63               in (l, (lhs, l) :: ref_prods) end
    64           | Some l => (l, ref_prods);
    65 
    66       (*convert symb list to refsymb list*)
    67       fun mk_refsymbs ref_prods [] rs = (rs, ref_prods)
    68         | mk_refsymbs ref_prods (Terminal tk :: symbs) rs =
    69             mk_refsymbs ref_prods symbs (rs @ [Term tk])
    70         | mk_refsymbs ref_prods (Nonterminal (name, prec) :: symbs) rs =
    71             let val (rhss, ref_prods') = get_rhss ref_prods name
    72             in mk_refsymbs ref_prods' symbs (rs @ [Nonterm (rhss, prec)])
    73             end;
    74 
    75       (*convert prod list to (string * rhss_ref) list
    76         without computing lookaheads*)
    77       fun mk_ref_gram [] ref_prods = ref_prods
    78         | mk_ref_gram ((lhs, (rhs, name, prec)) :: ps) ref_prods =
    79             let val (rhs', ref_prods') = get_rhss ref_prods lhs;
    80                 val (dummy, rhss) = hd (!rhs');
    81                 val (ref_symbs, ref_prods'') = mk_refsymbs ref_prods' rhs [];
    82             in rhs' := [(dummy, (ref_symbs, name, prec) :: rhss)];
    83                mk_ref_gram ps ref_prods''
    84             end;
    85 
    86       (*eliminate chain productions*)
    87       fun elim_chain ref_gram =
    88         let (*make a list of pairs representing chain productions and delete
    89               these productions*)
    90             fun list_chain [] = []
    91               | list_chain ((_, rhss_ref) :: ps) =
    92                   let fun lists [] new_rhss chains = (new_rhss, chains)
    93                         | lists (([Nonterm (id2, ~1)], _, ~1) :: rs) 
    94                                 new_rhss chains =
    95                             lists rs new_rhss ((rhss_ref, id2) :: chains)
    96                         | lists (rhs :: rs) new_rhss chains = 
    97                             lists rs (rhs :: new_rhss) chains;
    98 
    99                       val (dummy, rhss) = hd (!rhss_ref);
   100 
   101                       val (new_rhss, chains) = lists rhss [] [];
   102                   in rhss_ref := [(dummy, new_rhss)];
   103                      chains @ (list_chain ps)
   104                   end;
   105 
   106             (*convert a list of pairs to an association list
   107               by using the first element as the key*)
   108             fun mk_assoc pairs =
   109               let fun doit [] result = result
   110                     | doit ((id1, id2) :: ps) result =
   111                         doit ps 
   112                         (overwrite (result, (id1, id2 :: (assocs result id1))));
   113               in doit pairs [] end;
   114 
   115             (*replace reference by list of rhss in chain pairs*)
   116             fun deref (id1, ids) =
   117               let fun deref1 [] = []
   118                     | deref1 (id :: ids) =
   119                         let val (_, rhss) = hd (!id);
   120                         in rhss @ (deref1 ids) end;
   121               in (id1, deref1 ids) end;
   122 
   123             val chain_pairs = 
   124                map deref (transitive_closure (mk_assoc (list_chain ref_gram)));
   125 
   126             (*add new rhss to productions*)
   127             fun elim (rhss_ref, rhss) =
   128               let val (dummy, old_rhss) = hd (!rhss_ref);
   129               in rhss_ref := [(dummy, old_rhss @ rhss)] end;
   130         in map elim chain_pairs;
   131            ref_gram
   132         end;
   133 
   134       val ref_gram = elim_chain (mk_ref_gram prods []);
   135 
   136       (*make a list of all lambda NTs 
   137         (i.e. nonterminals that can produce lambda*)
   138       val lambdas =
   139         let fun lambda [] result = result
   140               | lambda ((_, rhss_ref) :: nts) result =
   141                   if rhss_ref mem result then
   142                     lambda nts result
   143                   else
   144                     let (*list all NTs that can be produced by a rhs
   145                           containing only lambda NTs*)
   146                         fun only_lambdas [] result = result
   147                           | only_lambdas ((_, rhss_ref) :: ps) result =
   148                               let fun only (symbs, _, _) =
   149                                    forall (fn (Nonterm (id, _)) => id mem result
   150                                             | (Term _)          => false) symbs;
   151                           
   152                                   val (_, rhss) = hd (!rhss_ref);
   153                               in if not (rhss_ref mem result) andalso
   154                                     exists only rhss then
   155                                    only_lambdas ref_gram (rhss_ref :: result)
   156                                  else
   157                                    only_lambdas ps result
   158                               end;
   159 
   160                         val (_, rhss) = hd (!rhss_ref);
   161                     in if exists (fn (symbs, _, _) => null symbs) rhss
   162                        then lambda nts
   163                               (only_lambdas ref_gram (rhss_ref :: result))
   164                        else lambda nts result
   165                     end;
   166          in lambda ref_gram [] end;
   167 
   168       (*list all nonterminals on which the lookahead depends (due to lambda 
   169         NTs this can be more than one)
   170         and report if there is a terminal at the 'start'*)
   171       fun rhss_start [] skipped = (None, skipped)
   172         | rhss_start (Term tk :: _) skipped = (Some tk, skipped)
   173         | rhss_start (Nonterm (rhss_ref, _) :: rest) skipped =
   174             if rhss_ref mem lambdas then 
   175               rhss_start rest (rhss_ref ins skipped)
   176             else
   177               (None, rhss_ref ins skipped);
   178 
   179       (*list all terminals that can start the given rhss*)
   180       fun look_rhss starts rhss_ref =
   181         let fun look [] _ result = result
   182               | look ((symbs, _, _) :: todos) done result =
   183                   let val (start_token, skipped) = rhss_start symbs [];
   184 
   185                       (*process all nonterminals on which the lookahead
   186                         depends and build the new todo and done lists for
   187                         the look function*)
   188                       fun look2 [] todos result = 
   189                             look todos (done union skipped) result
   190                         | look2 (rhss_ref :: ls) todos result =
   191                             if rhss_ref mem done then look2 ls todos result
   192                             else case assoc (starts, rhss_ref) of
   193                                 Some tks => look2 ls todos (tks union result)
   194                               | None => 
   195                                   let val (_, rhss) = hd (!rhss_ref);
   196                                   in look2 ls (rhss @ todos) result end;
   197                   in case start_token of
   198                          Some tk => look2 skipped todos (start_token ins result)
   199                        | None => look2 skipped todos result
   200                   end;
   201  
   202             val (_, rhss) = hd (!rhss_ref);
   203         in look rhss [rhss_ref] [] end;                       
   204 
   205       (*make a table that contains all possible starting terminals
   206         for each nonterminal*)
   207       fun mk_starts [] starts = starts
   208         | mk_starts ((_, rhss_ref) :: ps) starts =
   209             mk_starts ps ((rhss_ref, look_rhss starts rhss_ref) :: starts);
   210 
   211       val starts = mk_starts ref_gram [];
   212 
   213       (*add list of allowed starting tokens to productions*)
   214       fun mk_lookahead (_, rhss_ref) =
   215         let (*compares two values of type token option 
   216               (used for speed reasons)*)
   217             fun matching_opt_tks (Some tk1, Some tk2) =
   218                   matching_tokens (tk1, tk2)
   219               | matching_opt_tks _ = false;
   220 
   221             (*add item to lookahead list (a list containing pairs of token and 
   222               rhss that can be started with it)*)
   223             fun add_start new_rhs tokens table =
   224                   let fun add [] [] = []
   225                         | add (tk :: tks) [] =
   226                             (tk, [new_rhs]) :: (add tks [])
   227                         | add tokens ((tk, rhss) :: ss) =
   228                             if gen_mem matching_opt_tks (tk, tokens) then 
   229                               (tk, new_rhs :: rhss) :: (add (tokens \ tk) ss)
   230                             else
   231                               (tk, rhss) :: (add tokens ss);
   232                   in add tokens table end;
   233 
   234             (*combine all lookaheads of a list of nonterminals*)
   235             fun combine_starts rhss_refs =
   236               foldr (gen_union matching_opt_tks)
   237               ((map (fn rhss_ref => let val Some tks = assoc (starts, rhss_ref)
   238                                     in tks end) rhss_refs), []);
   239 
   240             (*get lookahead for a rhs and update lhs' lookahead list*)
   241             fun look_rhss [] table = table
   242               | look_rhss ((rhs as (symbs, id, prec)) :: rs) table =
   243                   let val (start_token, skipped) = rhss_start symbs [];
   244                       val starts = case start_token of
   245                                      Some tk => gen_ins matching_opt_tks 
   246                                              (Some tk, combine_starts skipped)
   247                                    | None => if skipped subset lambdas then
   248                                                [None]
   249                                              else
   250                                                combine_starts skipped;
   251                   in look_rhss rs (add_start rhs starts table) end;
   252 
   253              val (_, rhss) = hd (!rhss_ref);
   254         in rhss_ref := look_rhss rhss [] end;
   255 
   256   in map mk_lookahead ref_gram;
   257      Gram (prods, ref_gram)
   258   end;
   259 
   260 
   261 (* empty, extend, merge grams *)
   262 
   263 val empty_gram = mk_gram [];
   264 
   265 fun extend_gram (gram1 as Gram (prods1, _)) _ xprods2 =
   266   let
   267     fun symb_of (Delim s) = Some (Terminal (Token s))
   268       | symb_of (Argument (s, p)) =
   269           (case predef_term s of
   270             None => Some (Nonterminal (s, p))
   271           | Some tk => Some (Terminal tk))
   272       | symb_of _ = None;
   273 
   274     fun prod_of (XProd (lhs, xsymbs, const, pri)) =
   275       (lhs, (mapfilter symb_of xsymbs, const, pri));
   276 
   277     val prods2 = distinct (map prod_of xprods2);
   278   in
   279     if prods2 subset prods1 then gram1
   280     else mk_gram (extend_list prods1 prods2)
   281   end;
   282 
   283 fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) =
   284   if prods2 subset prods1 then gram1
   285   else if prods1 subset prods2 then gram2
   286   else mk_gram (merge_lists prods1 prods2);
   287 
   288 
   289 (* pretty_gram *)
   290 
   291 fun pretty_gram (Gram (prods, _)) =
   292   let
   293     fun pretty_name name = [Pretty.str (name ^ " =")];
   294 
   295     fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s)
   296       | pretty_symb (Terminal tok) = Pretty.str (str_of_token tok)
   297       | pretty_symb (Nonterminal (s, p)) =
   298           Pretty.str (s ^ "[" ^ string_of_int p ^ "]");
   299 
   300     fun pretty_const "" = []
   301       | pretty_const c = [Pretty.str ("=> " ^ quote c)];
   302 
   303     fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")];
   304 
   305     fun pretty_prod (name, (symbs, const, pri)) =
   306       Pretty.block (Pretty.breaks (pretty_name name @
   307         map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
   308   in
   309     map pretty_prod prods
   310   end;
   311 
   312 
   313 
   314 (** parse **)
   315 
   316 datatype parsetree =
   317   Node of string * parsetree list |
   318   Tip of token;
   319 
   320 type state =
   321   rhss_ref * int      (*lhs: identification and production precedence*)
   322   * parsetree list    (*already parsed nonterminals on rhs*)
   323   * refsymb list      (*rest of rhs*)
   324   * string            (*name of production*)
   325   * int;              (*index for previous state list*)
   326 
   327 type earleystate = state list Array.array;
   328 
   329 
   330 (*Get all rhss with precedence >= minPrec*)
   331 fun getRHS minPrec = filter (fn (_, _, prec:int) => prec >= minPrec);
   332 
   333 (*Get all rhss with precedence >= minPrec and < maxPrec*)
   334 fun getRHS' minPrec maxPrec =
   335   filter (fn (_, _, prec:int) => prec >= minPrec andalso prec < maxPrec);
   336 
   337 (*Make states using a list of rhss*)
   338 fun mkStates i minPrec lhsID rhss =
   339   let fun mkState (rhs, id, prodPrec) = (lhsID, prodPrec, [], rhs, id, i);
   340   in map mkState rhss end;
   341 	
   342 (*Add parse tree to list and eliminate duplicates 
   343   saving the maximum precedence*)
   344 fun conc (t, prec:int) [] = (None, [(t, prec)])
   345   | conc (t, prec) ((t', prec') :: ts) =
   346       if t = t' then
   347         (Some prec', if prec' >= prec then (t', prec') :: ts 
   348                      else (t, prec) :: ts)
   349       else
   350         let val (n, ts') = conc (t, prec) ts
   351         in (n, (t', prec') :: ts') end;
   352 
   353 (*Update entry in used*)
   354 fun update_tree ((B, (i, ts)) :: used) (A, t) =
   355   if A = B then
   356     let val (n, ts') = conc t ts
   357     in ((A, (i, ts')) :: used, n) end
   358   else
   359     let val (used', n) = update_tree used (A, t)
   360     in ((B, (i, ts)) :: used', n) end;
   361 
   362 (*Replace entry in used*)
   363 fun update_index (A, prec) used =
   364   let fun update((hd as (B, (_, ts))) :: used, used') =
   365         if A = B
   366         then used' @ ((A, (prec, ts)) :: used)
   367         else update (used, hd :: used')
   368   in update (used, []) end;
   369 
   370 fun getS A maxPrec Si =
   371   filter
   372     (fn (_, _, _, Nonterm (B, prec) :: _, _, _)
   373           => A = B andalso prec <= maxPrec
   374       | _ => false) Si;
   375 
   376 fun getS' A maxPrec minPrec Si =
   377   filter
   378     (fn (_, _, _, Nonterm (B, prec) :: _, _, _)
   379           => A = B andalso prec > minPrec andalso prec <= maxPrec
   380       | _ => false) Si;
   381 
   382 fun getStates Estate i ii A maxPrec =
   383   filter
   384     (fn (_, _, _, Nonterm (B, prec) :: _, _, _)
   385           => A = B andalso prec <= maxPrec
   386       | _ => false)
   387     (Array.sub (Estate, ii));
   388 
   389 
   390 fun movedot_term (A, j, ts, Term a :: sa, id, i) c =
   391   if valued_token c then
   392     (A, j, (ts @ [Tip c]), sa, id, i)
   393   else (A, j, ts, sa, id, i);
   394 
   395 fun movedot_nonterm ts (A, j, tss, Nonterm _ :: sa, id, i) =
   396   (A, j, tss @ ts, sa, id, i);
   397 
   398 fun movedot_lambda _ [] = []
   399   | movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ((t, ki) :: ts) =
   400       if k <= ki then
   401         (B, j, tss @ t, sa, id, i) ::
   402           movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ts
   403       else movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ts;
   404 
   405 
   406 
   407 fun PROCESSS Estate i c states =
   408 let
   409 fun get_lookahead rhss_ref = token_assoc (!rhss_ref, c);
   410 
   411 fun processS used [] (Si, Sii) = (Si, Sii)
   412   | processS used (S :: States) (Si, Sii) =
   413       (case S of
   414         (_, _, _, Nonterm (rhss_ref, minPrec) :: _, _, _) =>
   415           let                                       (*predictor operation*)
   416             val (used_new, States_new) =
   417               (case assoc (used, rhss_ref) of
   418                 Some (usedPrec, l) =>       (*nonterminal has been processed*)
   419                   if usedPrec <= minPrec then
   420                                       (*wanted precedence has been processed*)
   421                     (used, movedot_lambda S l)
   422                   else            (*wanted precedence hasn't been parsed yet*)
   423                     let val rhss = get_lookahead rhss_ref;
   424                       val States' = mkStates i minPrec rhss_ref
   425                                       (getRHS' minPrec usedPrec rhss);
   426                     in (update_index (rhss_ref, minPrec) used, 
   427                         movedot_lambda S l @ States')
   428                     end
   429 
   430               | None =>           (*nonterminal is parsed for the first time*)
   431                   let val rhss = get_lookahead rhss_ref;
   432                       val States' = mkStates i minPrec rhss_ref
   433                                       (getRHS minPrec rhss);
   434                   in ((rhss_ref, (minPrec, [])) :: used, States') end)
   435           in
   436             processS used_new (States_new @ States) (S :: Si, Sii)
   437           end
   438       | (_, _, _, Term a :: _, _, _) =>               (*scanner operation*)
   439           processS used States
   440             (S :: Si,
   441               if matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
   442 
   443       | (A, prec, ts, [], id, j) =>                   (*completer operation*)
   444           let
   445             val tt = if id = "" then ts else [Node (id, ts)]
   446           in
   447             if j = i then                             (*lambda production?*)
   448               let
   449                 val (used', O) = update_tree used (A, (tt, prec));
   450               in
   451                 (case O of
   452                   None =>
   453                     let
   454                       val Slist = getS A prec Si;
   455                       val States' = map (movedot_nonterm tt) Slist;
   456                     in
   457                       processS used' (States' @ States) (S :: Si, Sii)
   458                     end
   459                 | Some n =>
   460                     if n >= prec then
   461                       processS used' States (S :: Si, Sii)
   462                     else
   463                       let
   464                         val Slist = getS' A prec n Si;
   465                         val States' = map (movedot_nonterm tt) Slist;
   466                       in
   467                         processS used' (States' @ States) (S :: Si, Sii)
   468                       end)
   469               end 
   470             else
   471               processS used
   472                 (map (movedot_nonterm tt) (getStates Estate i j A prec) @ States)
   473                 (S :: Si, Sii)
   474           end)
   475 in
   476   processS [] states ([], [])
   477 end;
   478 
   479 
   480 
   481 fun syntax_error toks allowed =
   482   error 
   483   ((if toks = [] then
   484       "error: unexpected end of input\n"
   485     else
   486       "Syntax error at: " ^ quote (space_implode " " (map str_of_token 
   487         ((rev o tl o rev) toks)))
   488       ^ "\n")
   489    ^ "Expected tokens: " 
   490    ^ space_implode ", " (map (quote o str_of_token) allowed));
   491 
   492 fun produce stateset i indata prev_token =
   493                      (*the argument prev_token is only used for error messages*)
   494   (case Array.sub (stateset, i) of
   495     [] => let (*similar to token_assoc but does not automatically 
   496                 include 'None' key*)
   497               fun token_assoc2 (list, key) =
   498                 let fun assoc [] = []
   499                       | assoc ((keyi, xi) :: pairs) =
   500                           if is_some keyi andalso
   501                              matching_tokens (the keyi, key) then 
   502                             (assoc pairs) @ xi
   503                           else assoc pairs;
   504                           in assoc list end;
   505 
   506               (*test if tk is a lookahead for a given minimum precedence*)
   507               fun reduction minPrec tk _ (Term _ :: _, _, prec:int) =
   508                     if prec >= minPrec then true
   509                     else false
   510                 | reduction minPrec tk checked 
   511                             (Nonterm (rhss_ref, NTprec)::_,_, prec) =
   512                     if prec >= minPrec andalso not (rhss_ref mem checked) then
   513                       exists (reduction NTprec tk (rhss_ref :: checked)) 
   514                              (token_assoc2 (!rhss_ref, tk))
   515                     else false;
   516 
   517               (*compute a list of allowed starting tokens 
   518                 for a list of nonterminals considering precedence*)
   519               fun get_starts [] = []
   520                 | get_starts ((rhss_ref, minPrec:int) :: refs) =
   521                     let fun get [] = []
   522                           | get ((Some tk, prods) :: rhss) =
   523                               if exists (reduction minPrec tk [rhss_ref]) prods
   524                               then tk :: (get rhss)
   525                               else get rhss
   526                           | get ((None, _) :: rhss) =
   527                               get rhss;
   528                     in (get (!rhss_ref)) union (get_starts refs) end;
   529 
   530               val NTs = map (fn (_, _, _, Nonterm (a, prec) :: _, _, _) => 
   531                               (a, prec))
   532                             (filter (fn (_, _, _, Nonterm _ :: _, _, _) => true
   533                                      | _ => false) (Array.sub (stateset, i-1)));
   534               val allowed = distinct (get_starts NTs @
   535                             (map (fn (_, _, _, Term a :: _, _, _) => a)
   536                             (filter (fn (_, _, _, Term _ :: _, _, _) => true
   537                                    | _ => false) (Array.sub (stateset, i-1)))));
   538           in syntax_error (if prev_token = EndToken then indata
   539                            else prev_token :: indata) allowed
   540           end
   541   | s =>
   542     (case indata of
   543       [] => Array.sub (stateset, i)
   544     | c :: cs =>
   545       let
   546         val (si, sii) = PROCESSS stateset i c s;
   547       in
   548         Array.update (stateset, i, si);
   549         Array.update (stateset, i + 1, sii);
   550         produce stateset (i + 1) cs c
   551       end));
   552 
   553 
   554 val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None);
   555 
   556 
   557 fun earley grammar startsymbol indata =
   558   let
   559     val rhss_ref = case assoc (grammar, startsymbol) of
   560                        Some r => r
   561                      | None => error ("parse: Unknown startsymbol " ^ 
   562                                       quote startsymbol);
   563     val S0 = [(ref [], 0, [], [Nonterm (rhss_ref, 0), Term EndToken], "", 0)];
   564     val s = length indata + 1;
   565     val Estate = Array.array (s, []);
   566   in
   567     Array.update (Estate, 0, S0);
   568     let
   569       val l = produce Estate 0 indata EndToken(*dummy*);
   570       val p_trees = get_trees l;
   571     in p_trees end
   572   end;
   573 
   574 
   575 fun parse (Gram (_, prod_tab)) start toks =
   576   (case earley prod_tab start toks of
   577     [] => sys_error "parse: no parse trees"
   578   | pts => pts);
   579 
   580 end;
   581