src/Pure/Syntax/parser.ML
author clasohm
Wed Nov 09 13:50:59 1994 +0100 (1994-11-09 ago)
changeset 697 40f72ab196f8
parent 682 c36f49c76d22
child 764 b60e77395d1a
permissions -rw-r--r--
changed warning for extremely ambiguous expressions
wenzelm@18
     1
(*  Title:      Pure/Syntax/parser.ML
wenzelm@46
     2
    ID:         $Id$
clasohm@345
     3
    Author:     Sonia Mahjoub, Markus Wenzel and Carsten Clasohm, TU Muenchen
wenzelm@18
     4
wenzelm@552
     5
Isabelle's main parser (used for terms and types).
wenzelm@18
     6
*)
wenzelm@18
     7
wenzelm@18
     8
signature PARSER =
wenzelm@18
     9
sig
wenzelm@237
    10
  structure Lexicon: LEXICON
wenzelm@237
    11
  structure SynExt: SYN_EXT
wenzelm@237
    12
  local open Lexicon SynExt SynExt.Ast in
wenzelm@18
    13
    type gram
wenzelm@18
    14
    val empty_gram: gram
clasohm@624
    15
    val extend_gram: gram -> string list -> xprod list -> gram
wenzelm@237
    16
    val merge_grams: gram -> gram -> gram
wenzelm@237
    17
    val pretty_gram: gram -> Pretty.T list
wenzelm@237
    18
    datatype parsetree =
wenzelm@237
    19
      Node of string * parsetree list |
wenzelm@237
    20
      Tip of token
clasohm@330
    21
    val parse: gram -> string -> token list -> parsetree list
wenzelm@18
    22
  end
clasohm@682
    23
  val branching_level: int ref;
wenzelm@18
    24
end;
wenzelm@18
    25
wenzelm@237
    26
functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON
wenzelm@552
    27
  and SynExt: SYN_EXT): PARSER =
wenzelm@18
    28
struct
wenzelm@18
    29
wenzelm@237
    30
structure Pretty = SynExt.Ast.Pretty;
wenzelm@237
    31
structure Lexicon = Lexicon;
wenzelm@237
    32
structure SynExt = SynExt;
wenzelm@237
    33
open Lexicon SynExt;
wenzelm@18
    34
wenzelm@18
    35
wenzelm@18
    36
(** datatype gram **)
wenzelm@18
    37
wenzelm@237
    38
datatype symb =
wenzelm@237
    39
  Terminal of token |
wenzelm@237
    40
  Nonterminal of string * int;
wenzelm@18
    41
clasohm@330
    42
datatype refsymb = Term of token | Nonterm of rhss_ref * int
clasohm@330
    43
                                (*reference to production list instead of name*)
clasohm@330
    44
and gram = Gram of (string * (symb list * string * int)) list *
clasohm@330
    45
                   (string * rhss_ref) list
clasohm@330
    46
withtype rhss_ref = (token option * (refsymb list * string * int) list) list ref
clasohm@330
    47
                                      (*lookahead table: token and productions*)
clasohm@330
    48
clasohm@330
    49
(* convert productions to reference grammar with lookaheads and eliminate
clasohm@330
    50
   chain productions *)
clasohm@373
    51
fun mk_gram prods =
clasohm@395
    52
  let (*get reference on list of all possible rhss for nonterminal lhs
clasohm@330
    53
        (if it doesn't exist a new one is created and added to the nonterminal
clasohm@330
    54
         list)*)
clasohm@330
    55
      fun get_rhss ref_prods lhs =
clasohm@330
    56
        case assoc (ref_prods, lhs) of
clasohm@330
    57
            None =>
clasohm@330
    58
              let val l = ref [(None, [])]
clasohm@330
    59
              in (l, (lhs, l) :: ref_prods) end
clasohm@330
    60
          | Some l => (l, ref_prods);
clasohm@330
    61
clasohm@330
    62
      (*convert symb list to refsymb list*)
clasohm@330
    63
      fun mk_refsymbs ref_prods [] rs = (rs, ref_prods)
clasohm@330
    64
        | mk_refsymbs ref_prods (Terminal tk :: symbs) rs =
clasohm@330
    65
            mk_refsymbs ref_prods symbs (rs @ [Term tk])
clasohm@330
    66
        | mk_refsymbs ref_prods (Nonterminal (name, prec) :: symbs) rs =
clasohm@330
    67
            let val (rhss, ref_prods') = get_rhss ref_prods name
clasohm@330
    68
            in mk_refsymbs ref_prods' symbs (rs @ [Nonterm (rhss, prec)])
clasohm@330
    69
            end;
clasohm@330
    70
clasohm@330
    71
      (*convert prod list to (string * rhss_ref) list
clasohm@330
    72
        without computing lookaheads*)
clasohm@330
    73
      fun mk_ref_gram [] ref_prods = ref_prods
clasohm@330
    74
        | mk_ref_gram ((lhs, (rhs, name, prec)) :: ps) ref_prods =
clasohm@330
    75
            let val (rhs', ref_prods') = get_rhss ref_prods lhs;
clasohm@330
    76
                val (dummy, rhss) = hd (!rhs');
clasohm@330
    77
                val (ref_symbs, ref_prods'') = mk_refsymbs ref_prods' rhs [];
clasohm@330
    78
            in rhs' := [(dummy, (ref_symbs, name, prec) :: rhss)];
clasohm@330
    79
               mk_ref_gram ps ref_prods''
clasohm@330
    80
            end;
clasohm@330
    81
clasohm@330
    82
      (*eliminate chain productions*)
clasohm@330
    83
      fun elim_chain ref_gram =
clasohm@330
    84
        let (*make a list of pairs representing chain productions and delete
clasohm@330
    85
              these productions*)
clasohm@330
    86
            fun list_chain [] = []
clasohm@330
    87
              | list_chain ((_, rhss_ref) :: ps) =
clasohm@330
    88
                  let fun lists [] new_rhss chains = (new_rhss, chains)
clasohm@330
    89
                        | lists (([Nonterm (id2, ~1)], _, ~1) :: rs) 
clasohm@330
    90
                                new_rhss chains =
clasohm@330
    91
                            lists rs new_rhss ((rhss_ref, id2) :: chains)
clasohm@330
    92
                        | lists (rhs :: rs) new_rhss chains = 
clasohm@330
    93
                            lists rs (rhs :: new_rhss) chains;
clasohm@330
    94
clasohm@330
    95
                      val (dummy, rhss) = hd (!rhss_ref);
clasohm@330
    96
clasohm@330
    97
                      val (new_rhss, chains) = lists rhss [] [];
clasohm@330
    98
                  in rhss_ref := [(dummy, new_rhss)];
clasohm@330
    99
                     chains @ (list_chain ps)
clasohm@330
   100
                  end;
clasohm@330
   101
clasohm@345
   102
            (*convert a list of pairs to an association list
clasohm@345
   103
              by using the first element as the key*)
clasohm@345
   104
            fun mk_assoc pairs =
clasohm@330
   105
              let fun doit [] result = result
clasohm@330
   106
                    | doit ((id1, id2) :: ps) result =
clasohm@330
   107
                        doit ps 
clasohm@330
   108
                        (overwrite (result, (id1, id2 :: (assocs result id1))));
clasohm@345
   109
              in doit pairs [] end;
clasohm@330
   110
clasohm@330
   111
            (*replace reference by list of rhss in chain pairs*)
clasohm@345
   112
            fun deref (id1, ids) =
clasohm@345
   113
              let fun deref1 [] = []
clasohm@345
   114
                    | deref1 (id :: ids) =
clasohm@330
   115
                        let val (_, rhss) = hd (!id);
clasohm@345
   116
                        in rhss @ (deref1 ids) end;
clasohm@345
   117
              in (id1, deref1 ids) end;
clasohm@330
   118
clasohm@330
   119
            val chain_pairs = 
clasohm@345
   120
               map deref (transitive_closure (mk_assoc (list_chain ref_gram)));
clasohm@330
   121
clasohm@330
   122
            (*add new rhss to productions*)
clasohm@345
   123
            fun elim (rhss_ref, rhss) =
clasohm@330
   124
              let val (dummy, old_rhss) = hd (!rhss_ref);
clasohm@330
   125
              in rhss_ref := [(dummy, old_rhss @ rhss)] end;
clasohm@345
   126
        in map elim chain_pairs;
clasohm@330
   127
           ref_gram
clasohm@330
   128
        end;
clasohm@330
   129
clasohm@330
   130
      val ref_gram = elim_chain (mk_ref_gram prods []);
wenzelm@18
   131
clasohm@330
   132
      (*make a list of all lambda NTs 
clasohm@697
   133
        (i.e. nonterminals that can produce lambda)*)
clasohm@330
   134
      val lambdas =
clasohm@330
   135
        let fun lambda [] result = result
clasohm@330
   136
              | lambda ((_, rhss_ref) :: nts) result =
clasohm@330
   137
                  if rhss_ref mem result then
clasohm@330
   138
                    lambda nts result
clasohm@330
   139
                  else
clasohm@345
   140
                    let (*list all NTs that can be produced by a rhs
clasohm@345
   141
                          containing only lambda NTs*)
clasohm@345
   142
                        fun only_lambdas [] result = result
clasohm@345
   143
                          | only_lambdas ((_, rhss_ref) :: ps) result =
clasohm@330
   144
                              let fun only (symbs, _, _) =
clasohm@697
   145
                                  forall (fn (Nonterm (id, _)) => id mem result
clasohm@697
   146
                                           | (Term _)          => false) symbs;
clasohm@330
   147
                          
clasohm@330
   148
                                  val (_, rhss) = hd (!rhss_ref);
clasohm@330
   149
                              in if not (rhss_ref mem result) andalso
clasohm@330
   150
                                    exists only rhss then
clasohm@345
   151
                                   only_lambdas ref_gram (rhss_ref :: result)
clasohm@330
   152
                                 else
clasohm@345
   153
                                   only_lambdas ps result
clasohm@330
   154
                              end;
clasohm@330
   155
clasohm@330
   156
                        val (_, rhss) = hd (!rhss_ref);
clasohm@330
   157
                    in if exists (fn (symbs, _, _) => null symbs) rhss
clasohm@330
   158
                       then lambda nts
clasohm@345
   159
                              (only_lambdas ref_gram (rhss_ref :: result))
clasohm@330
   160
                       else lambda nts result
clasohm@330
   161
                    end;
clasohm@330
   162
         in lambda ref_gram [] end;
clasohm@330
   163
clasohm@330
   164
      (*list all nonterminals on which the lookahead depends (due to lambda 
clasohm@330
   165
        NTs this can be more than one)
clasohm@330
   166
        and report if there is a terminal at the 'start'*)
clasohm@330
   167
      fun rhss_start [] skipped = (None, skipped)
clasohm@330
   168
        | rhss_start (Term tk :: _) skipped = (Some tk, skipped)
clasohm@330
   169
        | rhss_start (Nonterm (rhss_ref, _) :: rest) skipped =
clasohm@330
   170
            if rhss_ref mem lambdas then 
clasohm@330
   171
              rhss_start rest (rhss_ref ins skipped)
clasohm@330
   172
            else
clasohm@330
   173
              (None, rhss_ref ins skipped);
clasohm@330
   174
clasohm@330
   175
      (*list all terminals that can start the given rhss*)
clasohm@330
   176
      fun look_rhss starts rhss_ref =
clasohm@377
   177
        let fun look [] _ result = result
clasohm@377
   178
              | look ((symbs, _, _) :: todos) done result =
clasohm@330
   179
                  let val (start_token, skipped) = rhss_start symbs [];
clasohm@330
   180
clasohm@330
   181
                      (*process all nonterminals on which the lookahead
clasohm@330
   182
                        depends and build the new todo and done lists for
clasohm@330
   183
                        the look function*)
clasohm@377
   184
                      fun look2 [] todos result = 
clasohm@377
   185
                            look todos (done union skipped) result
clasohm@377
   186
                        | look2 (rhss_ref :: ls) todos result =
clasohm@377
   187
                            if rhss_ref mem done then look2 ls todos result
clasohm@330
   188
                            else case assoc (starts, rhss_ref) of
clasohm@377
   189
                                Some tks => look2 ls todos (tks union result)
clasohm@377
   190
                              | None => 
clasohm@377
   191
                                  let val (_, rhss) = hd (!rhss_ref);
clasohm@377
   192
                                  in look2 ls (rhss @ todos) result end;
clasohm@330
   193
                  in case start_token of
clasohm@697
   194
                        Some tk => look2 skipped todos (start_token ins result)
clasohm@697
   195
                      | None => look2 skipped todos result
clasohm@330
   196
                  end;
clasohm@330
   197
 
clasohm@330
   198
            val (_, rhss) = hd (!rhss_ref);
clasohm@377
   199
        in look rhss [rhss_ref] [] end;                       
clasohm@330
   200
clasohm@330
   201
      (*make a table that contains all possible starting terminals
clasohm@330
   202
        for each nonterminal*)
clasohm@330
   203
      fun mk_starts [] starts = starts
clasohm@330
   204
        | mk_starts ((_, rhss_ref) :: ps) starts =
clasohm@330
   205
            mk_starts ps ((rhss_ref, look_rhss starts rhss_ref) :: starts);
clasohm@330
   206
clasohm@330
   207
      val starts = mk_starts ref_gram [];
clasohm@330
   208
clasohm@330
   209
      (*add list of allowed starting tokens to productions*)
clasohm@330
   210
      fun mk_lookahead (_, rhss_ref) =
clasohm@377
   211
        let (*compares two values of type token option 
clasohm@377
   212
              (used for speed reasons)*)
clasohm@377
   213
            fun matching_opt_tks (Some tk1, Some tk2) =
clasohm@377
   214
                  matching_tokens (tk1, tk2)
clasohm@377
   215
              | matching_opt_tks _ = false;
clasohm@377
   216
clasohm@377
   217
            (*add item to lookahead list (a list containing pairs of token and 
clasohm@377
   218
              rhss that can be started with it)*)
clasohm@345
   219
            fun add_start new_rhs tokens table =
clasohm@345
   220
                  let fun add [] [] = []
clasohm@345
   221
                        | add (tk :: tks) [] =
clasohm@345
   222
                            (tk, [new_rhs]) :: (add tks [])
clasohm@345
   223
                        | add tokens ((tk, rhss) :: ss) =
clasohm@377
   224
                            if gen_mem matching_opt_tks (tk, tokens) then 
clasohm@345
   225
                              (tk, new_rhs :: rhss) :: (add (tokens \ tk) ss)
clasohm@330
   226
                            else
clasohm@345
   227
                              (tk, rhss) :: (add tokens ss);
clasohm@345
   228
                  in add tokens table end;
clasohm@330
   229
clasohm@330
   230
            (*combine all lookaheads of a list of nonterminals*)
clasohm@345
   231
            fun combine_starts rhss_refs =
clasohm@377
   232
              foldr (gen_union matching_opt_tks)
clasohm@345
   233
              ((map (fn rhss_ref => let val Some tks = assoc (starts, rhss_ref)
clasohm@345
   234
                                    in tks end) rhss_refs), []);
clasohm@330
   235
clasohm@330
   236
            (*get lookahead for a rhs and update lhs' lookahead list*)
clasohm@330
   237
            fun look_rhss [] table = table
clasohm@330
   238
              | look_rhss ((rhs as (symbs, id, prec)) :: rs) table =
clasohm@330
   239
                  let val (start_token, skipped) = rhss_start symbs [];
clasohm@330
   240
                      val starts = case start_token of
clasohm@377
   241
                                     Some tk => gen_ins matching_opt_tks 
clasohm@377
   242
                                             (Some tk, combine_starts skipped)
clasohm@345
   243
                                   | None => if skipped subset lambdas then
clasohm@330
   244
                                               [None]
clasohm@330
   245
                                             else
clasohm@330
   246
                                               combine_starts skipped;
clasohm@330
   247
                  in look_rhss rs (add_start rhs starts table) end;
clasohm@330
   248
clasohm@330
   249
             val (_, rhss) = hd (!rhss_ref);
clasohm@330
   250
        in rhss_ref := look_rhss rhss [] end;
clasohm@330
   251
clasohm@330
   252
  in map mk_lookahead ref_gram;
clasohm@330
   253
     Gram (prods, ref_gram)
clasohm@330
   254
  end;
wenzelm@18
   255
wenzelm@18
   256
wenzelm@237
   257
(* empty, extend, merge grams *)
wenzelm@18
   258
wenzelm@237
   259
val empty_gram = mk_gram [];
wenzelm@18
   260
clasohm@624
   261
fun extend_gram (gram1 as Gram (prods1, _)) roots xprods2 =
wenzelm@18
   262
  let
clasohm@624
   263
    fun simplify preserve s = 
clasohm@624
   264
      if preserve then 
clasohm@624
   265
        (if s mem (roots \\ ["type", "prop", "any"]) then "logic" else s)
clasohm@624
   266
      else (if s = "prop" then "@prop_H" else
clasohm@624
   267
              (if s mem (roots \\ ["type", "prop", "any"]) then 
clasohm@624
   268
                 "@logic_H" 
clasohm@624
   269
               else s));
clasohm@624
   270
clasohm@624
   271
    fun not_delim (Delim _) = false
clasohm@624
   272
      | not_delim _ = true
clasohm@624
   273
clasohm@624
   274
    fun symb_of _ (Delim s) = Some (Terminal (Token s))
clasohm@624
   275
      | symb_of preserve (Argument (s, p)) =
wenzelm@18
   276
          (case predef_term s of
clasohm@624
   277
            None => Some (Nonterminal (simplify preserve s, p))
wenzelm@237
   278
          | Some tk => Some (Terminal tk))
clasohm@624
   279
      | symb_of _ _ = None;
wenzelm@18
   280
wenzelm@237
   281
    fun prod_of (XProd (lhs, xsymbs, const, pri)) =
clasohm@624
   282
      let val copy_prod = (lhs = "prop" andalso forall not_delim xsymbs andalso
clasohm@624
   283
                           const <> "");
clasohm@624
   284
clasohm@624
   285
          val preserve = copy_prod 
clasohm@624
   286
                         orelse pri = chain_pri andalso lhs = "logic"
clasohm@624
   287
                         orelse lhs mem ["@prop_H", "@logic_H", "any"];
clasohm@624
   288
clasohm@624
   289
          val lhs' = if copy_prod then "@prop_H" else
clasohm@624
   290
                     if lhs = "logic" andalso pri = chain_pri
clasohm@624
   291
                        then "@logic_H"
clasohm@624
   292
                     else if lhs mem ("logic1" :: (roots \\ ["type", "prop"])) 
clasohm@624
   293
                        then "logic"
clasohm@624
   294
                     else lhs;
clasohm@624
   295
      in (lhs', (mapfilter (symb_of preserve) xsymbs, const, pri))
clasohm@624
   296
      end;
wenzelm@18
   297
wenzelm@237
   298
    val prods2 = distinct (map prod_of xprods2);
wenzelm@18
   299
  in
wenzelm@237
   300
    if prods2 subset prods1 then gram1
clasohm@395
   301
    else (writeln "Building new grammar...";
clasohm@395
   302
          mk_gram (extend_list prods1 prods2))
wenzelm@18
   303
  end;
wenzelm@18
   304
wenzelm@237
   305
fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) =
wenzelm@237
   306
  if prods2 subset prods1 then gram1
wenzelm@237
   307
  else if prods1 subset prods2 then gram2
clasohm@395
   308
  else (writeln "Building new grammar...";
clasohm@395
   309
        mk_gram (merge_lists prods1 prods2));
wenzelm@237
   310
wenzelm@18
   311
wenzelm@237
   312
(* pretty_gram *)
wenzelm@18
   313
wenzelm@237
   314
fun pretty_gram (Gram (prods, _)) =
wenzelm@237
   315
  let
wenzelm@237
   316
    fun pretty_name name = [Pretty.str (name ^ " =")];
wenzelm@18
   317
wenzelm@237
   318
    fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s)
wenzelm@237
   319
      | pretty_symb (Terminal tok) = Pretty.str (str_of_token tok)
wenzelm@237
   320
      | pretty_symb (Nonterminal (s, p)) =
wenzelm@237
   321
          Pretty.str (s ^ "[" ^ string_of_int p ^ "]");
wenzelm@18
   322
wenzelm@237
   323
    fun pretty_const "" = []
wenzelm@237
   324
      | pretty_const c = [Pretty.str ("=> " ^ quote c)];
wenzelm@237
   325
wenzelm@237
   326
    fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")];
wenzelm@237
   327
wenzelm@237
   328
    fun pretty_prod (name, (symbs, const, pri)) =
wenzelm@237
   329
      Pretty.block (Pretty.breaks (pretty_name name @
wenzelm@237
   330
        map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
wenzelm@18
   331
  in
wenzelm@237
   332
    map pretty_prod prods
wenzelm@18
   333
  end;
wenzelm@18
   334
wenzelm@18
   335
wenzelm@18
   336
wenzelm@18
   337
(** parse **)
wenzelm@18
   338
wenzelm@237
   339
datatype parsetree =
wenzelm@237
   340
  Node of string * parsetree list |
wenzelm@237
   341
  Tip of token;
wenzelm@237
   342
wenzelm@18
   343
type state =
clasohm@330
   344
  rhss_ref * int      (*lhs: identification and production precedence*)
clasohm@330
   345
  * parsetree list    (*already parsed nonterminals on rhs*)
clasohm@330
   346
  * refsymb list      (*rest of rhs*)
clasohm@330
   347
  * string            (*name of production*)
clasohm@330
   348
  * int;              (*index for previous state list*)
wenzelm@18
   349
wenzelm@18
   350
type earleystate = state list Array.array;
wenzelm@18
   351
wenzelm@18
   352
clasohm@330
   353
(*Get all rhss with precedence >= minPrec*)
clasohm@330
   354
fun getRHS minPrec = filter (fn (_, _, prec:int) => prec >= minPrec);
wenzelm@237
   355
clasohm@330
   356
(*Get all rhss with precedence >= minPrec and < maxPrec*)
clasohm@330
   357
fun getRHS' minPrec maxPrec =
clasohm@330
   358
  filter (fn (_, _, prec:int) => prec >= minPrec andalso prec < maxPrec);
wenzelm@18
   359
clasohm@330
   360
(*Make states using a list of rhss*)
clasohm@330
   361
fun mkStates i minPrec lhsID rhss =
clasohm@330
   362
  let fun mkState (rhs, id, prodPrec) = (lhsID, prodPrec, [], rhs, id, i);
clasohm@330
   363
  in map mkState rhss end;
clasohm@697
   364
clasohm@330
   365
(*Add parse tree to list and eliminate duplicates 
clasohm@330
   366
  saving the maximum precedence*)
clasohm@330
   367
fun conc (t, prec:int) [] = (None, [(t, prec)])
clasohm@330
   368
  | conc (t, prec) ((t', prec') :: ts) =
clasohm@330
   369
      if t = t' then
clasohm@330
   370
        (Some prec', if prec' >= prec then (t', prec') :: ts 
clasohm@330
   371
                     else (t, prec) :: ts)
clasohm@330
   372
      else
clasohm@330
   373
        let val (n, ts') = conc (t, prec) ts
clasohm@330
   374
        in (n, (t', prec') :: ts') end;
wenzelm@18
   375
clasohm@330
   376
(*Update entry in used*)
clasohm@697
   377
fun update_trees ((B, (i, ts)) :: used) (A, t) =
wenzelm@237
   378
  if A = B then
wenzelm@237
   379
    let val (n, ts') = conc t ts
wenzelm@237
   380
    in ((A, (i, ts')) :: used, n) end
wenzelm@237
   381
  else
clasohm@697
   382
    let val (used', n) = update_trees used (A, t)
wenzelm@237
   383
    in ((B, (i, ts)) :: used', n) end;
wenzelm@18
   384
clasohm@330
   385
(*Replace entry in used*)
clasohm@697
   386
fun update_prec (A, prec) used =
clasohm@697
   387
  let fun update ((hd as (B, (_, ts))) :: used, used') =
clasohm@330
   388
        if A = B
clasohm@330
   389
        then used' @ ((A, (prec, ts)) :: used)
clasohm@330
   390
        else update (used, hd :: used')
clasohm@330
   391
  in update (used, []) end;
wenzelm@18
   392
clasohm@330
   393
fun getS A maxPrec Si =
wenzelm@237
   394
  filter
clasohm@330
   395
    (fn (_, _, _, Nonterm (B, prec) :: _, _, _)
clasohm@330
   396
          => A = B andalso prec <= maxPrec
wenzelm@237
   397
      | _ => false) Si;
wenzelm@18
   398
clasohm@330
   399
fun getS' A maxPrec minPrec Si =
wenzelm@237
   400
  filter
clasohm@330
   401
    (fn (_, _, _, Nonterm (B, prec) :: _, _, _)
clasohm@330
   402
          => A = B andalso prec > minPrec andalso prec <= maxPrec
wenzelm@237
   403
      | _ => false) Si;
wenzelm@18
   404
clasohm@330
   405
fun getStates Estate i ii A maxPrec =
wenzelm@237
   406
  filter
clasohm@330
   407
    (fn (_, _, _, Nonterm (B, prec) :: _, _, _)
clasohm@330
   408
          => A = B andalso prec <= maxPrec
wenzelm@237
   409
      | _ => false)
wenzelm@237
   410
    (Array.sub (Estate, ii));
wenzelm@18
   411
wenzelm@18
   412
clasohm@330
   413
fun movedot_term (A, j, ts, Term a :: sa, id, i) c =
wenzelm@237
   414
  if valued_token c then
clasohm@697
   415
    (A, j, ts @ [Tip c], sa, id, i)
wenzelm@237
   416
  else (A, j, ts, sa, id, i);
wenzelm@18
   417
clasohm@330
   418
fun movedot_nonterm ts (A, j, tss, Nonterm _ :: sa, id, i) =
wenzelm@237
   419
  (A, j, tss @ ts, sa, id, i);
wenzelm@18
   420
wenzelm@237
   421
fun movedot_lambda _ [] = []
clasohm@330
   422
  | movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ((t, ki) :: ts) =
wenzelm@237
   423
      if k <= ki then
wenzelm@237
   424
        (B, j, tss @ t, sa, id, i) ::
clasohm@330
   425
          movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ts
clasohm@330
   426
      else movedot_lambda (B, j, tss, Nonterm (A, k) :: sa, id, i) ts;
wenzelm@18
   427
wenzelm@18
   428
clasohm@682
   429
val warned = ref false;  (*flag for warning message*)
clasohm@697
   430
val branching_level = ref 200;  (*trigger value for warnings*)
wenzelm@18
   431
clasohm@330
   432
fun PROCESSS Estate i c states =
wenzelm@18
   433
let
clasohm@330
   434
fun get_lookahead rhss_ref = token_assoc (!rhss_ref, c);
clasohm@330
   435
wenzelm@237
   436
fun processS used [] (Si, Sii) = (Si, Sii)
wenzelm@237
   437
  | processS used (S :: States) (Si, Sii) =
wenzelm@237
   438
      (case S of
clasohm@330
   439
        (_, _, _, Nonterm (rhss_ref, minPrec) :: _, _, _) =>
clasohm@330
   440
          let                                       (*predictor operation*)
clasohm@697
   441
            val (used', new_states) =
clasohm@330
   442
              (case assoc (used, rhss_ref) of
clasohm@330
   443
                Some (usedPrec, l) =>       (*nonterminal has been processed*)
clasohm@330
   444
                  if usedPrec <= minPrec then
clasohm@330
   445
                                      (*wanted precedence has been processed*)
clasohm@330
   446
                    (used, movedot_lambda S l)
clasohm@330
   447
                  else            (*wanted precedence hasn't been parsed yet*)
clasohm@330
   448
                    let val rhss = get_lookahead rhss_ref;
clasohm@330
   449
                      val States' = mkStates i minPrec rhss_ref
clasohm@330
   450
                                      (getRHS' minPrec usedPrec rhss);
clasohm@697
   451
                    in (update_prec (rhss_ref, minPrec) used, 
clasohm@330
   452
                        movedot_lambda S l @ States')
wenzelm@237
   453
                    end
wenzelm@18
   454
clasohm@330
   455
              | None =>           (*nonterminal is parsed for the first time*)
clasohm@330
   456
                  let val rhss = get_lookahead rhss_ref;
clasohm@330
   457
                      val States' = mkStates i minPrec rhss_ref
clasohm@330
   458
                                      (getRHS minPrec rhss);
clasohm@697
   459
                  in ((rhss_ref, (minPrec, [])) :: used, States') end);
clasohm@697
   460
clasohm@697
   461
            val _ = if not (!warned) andalso
clasohm@697
   462
                       length (new_states @ States) > (!branching_level) then
clasohm@697
   463
                     (writeln "Warning: Currently parsed expression could be \
clasohm@697
   464
                              \extremely ambiguous.";
clasohm@697
   465
                      warned := true)
clasohm@697
   466
                    else ()
wenzelm@18
   467
          in
clasohm@697
   468
            processS used' (new_states @ States) (S :: Si, Sii)
wenzelm@18
   469
          end
clasohm@330
   470
      | (_, _, _, Term a :: _, _, _) =>               (*scanner operation*)
wenzelm@237
   471
          processS used States
wenzelm@237
   472
            (S :: Si,
wenzelm@237
   473
              if matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
clasohm@330
   474
      | (A, prec, ts, [], id, j) =>                   (*completer operation*)
clasohm@697
   475
          let val tt = if id = "" then ts else [Node (id, ts)] in
clasohm@330
   476
            if j = i then                             (*lambda production?*)
wenzelm@237
   477
              let
clasohm@697
   478
                val (used', O) = update_trees used (A, (tt, prec));
wenzelm@237
   479
              in
wenzelm@237
   480
                (case O of
wenzelm@237
   481
                  None =>
wenzelm@237
   482
                    let
clasohm@330
   483
                      val Slist = getS A prec Si;
wenzelm@237
   484
                      val States' = map (movedot_nonterm tt) Slist;
wenzelm@237
   485
                    in
clasohm@330
   486
                      processS used' (States' @ States) (S :: Si, Sii)
wenzelm@237
   487
                    end
wenzelm@237
   488
                | Some n =>
clasohm@330
   489
                    if n >= prec then
wenzelm@237
   490
                      processS used' States (S :: Si, Sii)
wenzelm@237
   491
                    else
wenzelm@237
   492
                      let
clasohm@330
   493
                        val Slist = getS' A prec n Si;
wenzelm@237
   494
                        val States' = map (movedot_nonterm tt) Slist;
wenzelm@237
   495
                      in
clasohm@682
   496
                       processS used' (States' @ States) (S :: Si, Sii)
wenzelm@237
   497
                      end)
clasohm@330
   498
              end 
wenzelm@237
   499
            else
clasohm@697
   500
              let val Slist = getStates Estate i j A prec in
clasohm@697
   501
                processS used (map (movedot_nonterm tt) Slist @ States)
clasohm@697
   502
                         (S :: Si, Sii)
clasohm@682
   503
              end
wenzelm@237
   504
          end)
clasohm@682
   505
in processS [] states ([], []) end;
wenzelm@18
   506
wenzelm@18
   507
clasohm@362
   508
fun syntax_error toks allowed =
clasohm@362
   509
  error 
clasohm@362
   510
  ((if toks = [] then
clasohm@362
   511
      "error: unexpected end of input\n"
clasohm@362
   512
    else
clasohm@367
   513
      "Syntax error at: " ^ quote (space_implode " " (map str_of_token 
clasohm@367
   514
        ((rev o tl o rev) toks)))
clasohm@362
   515
      ^ "\n")
clasohm@362
   516
   ^ "Expected tokens: " 
clasohm@362
   517
   ^ space_implode ", " (map (quote o str_of_token) allowed));
wenzelm@18
   518
clasohm@362
   519
fun produce stateset i indata prev_token =
clasohm@362
   520
                     (*the argument prev_token is only used for error messages*)
wenzelm@237
   521
  (case Array.sub (stateset, i) of
clasohm@373
   522
    [] => let (*similar to token_assoc but does not automatically 
clasohm@373
   523
                include 'None' key*)
clasohm@373
   524
              fun token_assoc2 (list, key) =
clasohm@373
   525
                let fun assoc [] = []
clasohm@373
   526
                      | assoc ((keyi, xi) :: pairs) =
clasohm@373
   527
                          if is_some keyi andalso
clasohm@373
   528
                             matching_tokens (the keyi, key) then 
clasohm@373
   529
                            (assoc pairs) @ xi
clasohm@373
   530
                          else assoc pairs;
clasohm@373
   531
                          in assoc list end;
clasohm@373
   532
clasohm@373
   533
              (*test if tk is a lookahead for a given minimum precedence*)
clasohm@373
   534
              fun reduction minPrec tk _ (Term _ :: _, _, prec:int) =
clasohm@373
   535
                    if prec >= minPrec then true
clasohm@373
   536
                    else false
clasohm@373
   537
                | reduction minPrec tk checked 
clasohm@373
   538
                            (Nonterm (rhss_ref, NTprec)::_,_, prec) =
clasohm@373
   539
                    if prec >= minPrec andalso not (rhss_ref mem checked) then
clasohm@373
   540
                      exists (reduction NTprec tk (rhss_ref :: checked)) 
clasohm@373
   541
                             (token_assoc2 (!rhss_ref, tk))
clasohm@373
   542
                    else false;
clasohm@373
   543
clasohm@373
   544
              (*compute a list of allowed starting tokens 
clasohm@373
   545
                for a list of nonterminals considering precedence*)
clasohm@362
   546
              fun get_starts [] = []
clasohm@372
   547
                | get_starts ((rhss_ref, minPrec:int) :: refs) =
clasohm@362
   548
                    let fun get [] = []
clasohm@373
   549
                          | get ((Some tk, prods) :: rhss) =
clasohm@373
   550
                              if exists (reduction minPrec tk [rhss_ref]) prods
clasohm@373
   551
                              then tk :: (get rhss)
clasohm@372
   552
                              else get rhss
clasohm@362
   553
                          | get ((None, _) :: rhss) =
clasohm@362
   554
                              get rhss;
clasohm@362
   555
                    in (get (!rhss_ref)) union (get_starts refs) end;
clasohm@362
   556
clasohm@372
   557
              val NTs = map (fn (_, _, _, Nonterm (a, prec) :: _, _, _) => 
clasohm@372
   558
                              (a, prec))
clasohm@362
   559
                            (filter (fn (_, _, _, Nonterm _ :: _, _, _) => true
clasohm@362
   560
                                     | _ => false) (Array.sub (stateset, i-1)));
clasohm@367
   561
              val allowed = distinct (get_starts NTs @
clasohm@362
   562
                            (map (fn (_, _, _, Term a :: _, _, _) => a)
clasohm@362
   563
                            (filter (fn (_, _, _, Term _ :: _, _, _) => true
clasohm@367
   564
                                   | _ => false) (Array.sub (stateset, i-1)))));
clasohm@362
   565
          in syntax_error (if prev_token = EndToken then indata
clasohm@362
   566
                           else prev_token :: indata) allowed
clasohm@362
   567
          end
wenzelm@237
   568
  | s =>
wenzelm@237
   569
    (case indata of
wenzelm@237
   570
      [] => Array.sub (stateset, i)
wenzelm@237
   571
    | c :: cs =>
wenzelm@237
   572
      let
clasohm@330
   573
        val (si, sii) = PROCESSS stateset i c s;
wenzelm@237
   574
      in
wenzelm@237
   575
        Array.update (stateset, i, si);
wenzelm@237
   576
        Array.update (stateset, i + 1, sii);
clasohm@362
   577
        produce stateset (i + 1) cs c
wenzelm@237
   578
      end));
wenzelm@18
   579
wenzelm@18
   580
wenzelm@237
   581
val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None);
wenzelm@237
   582
wenzelm@18
   583
wenzelm@18
   584
fun earley grammar startsymbol indata =
wenzelm@237
   585
  let
clasohm@624
   586
    val startsymbol' = case startsymbol of
clasohm@624
   587
                           "logic" => "@logic_H"
clasohm@624
   588
                         | "prop"  => "@prop_H"
clasohm@624
   589
                         | other   => other;
clasohm@330
   590
    val rhss_ref = case assoc (grammar, startsymbol) of
clasohm@330
   591
                       Some r => r
clasohm@624
   592
                     | None   => error ("parse: Unknown startsymbol " ^ 
clasohm@624
   593
                                        quote startsymbol);
clasohm@330
   594
    val S0 = [(ref [], 0, [], [Nonterm (rhss_ref, 0), Term EndToken], "", 0)];
clasohm@330
   595
    val s = length indata + 1;
wenzelm@237
   596
    val Estate = Array.array (s, []);
wenzelm@237
   597
  in
wenzelm@237
   598
    Array.update (Estate, 0, S0);
wenzelm@237
   599
    let
clasohm@682
   600
      val l = (warned := false; produce Estate 0 indata EndToken(*dummy*));
clasohm@624
   601
wenzelm@237
   602
      val p_trees = get_trees l;
wenzelm@237
   603
    in p_trees end
wenzelm@237
   604
  end;
wenzelm@18
   605
wenzelm@18
   606
wenzelm@237
   607
fun parse (Gram (_, prod_tab)) start toks =
clasohm@624
   608
let val r =
wenzelm@237
   609
  (case earley prod_tab start toks of
wenzelm@237
   610
    [] => sys_error "parse: no parse trees"
clasohm@330
   611
  | pts => pts);
clasohm@624
   612
in r end
wenzelm@18
   613
wenzelm@18
   614
end;