src/Pure/Syntax/parser.ML
author wenzelm
Mon Oct 04 15:30:49 1993 +0100 (1993-10-04 ago)
changeset 18 c9ec452ff08f
child 46 f0f4978af183
permissions -rw-r--r--
lots of internal cleaning and tuning;
removed {parse,print}_{pre,post}_proc;
new lexer: now human readable due to scanner combinators;
new parser installed, but still inactive (due to grammar ambiguities);
added Syntax.test_read;
typ_of_term: sorts now made distinct and sorted;
mixfix: added forced line breaks (//);
PROP now printed before subterm of type prop with non-const head;
wenzelm@18
     1
(*  Title:      Pure/Syntax/parser.ML
wenzelm@18
     2
    Author:     Sonia Mahjoub and Markus Wenzel, TU Muenchen
wenzelm@18
     3
wenzelm@18
     4
Isabelle's main parser (used for terms and typs).
wenzelm@18
     5
wenzelm@18
     6
TODO:
wenzelm@18
     7
  ~1 --> chain_pri
wenzelm@18
     8
  rename T, NT
wenzelm@18
     9
  improve syntax error
wenzelm@18
    10
  fix ambiguous grammar problem
wenzelm@18
    11
*)
wenzelm@18
    12
wenzelm@18
    13
signature PARSER =
wenzelm@18
    14
sig
wenzelm@18
    15
  structure XGram: XGRAM
wenzelm@18
    16
  structure ParseTree: PARSE_TREE
wenzelm@18
    17
  local open XGram ParseTree ParseTree.Lexicon in
wenzelm@18
    18
    type gram
wenzelm@18
    19
    val empty_gram: gram
wenzelm@18
    20
    val extend_gram: gram -> string list -> string prod list -> gram
wenzelm@18
    21
    val mk_gram: string list -> string prod list -> gram
wenzelm@18
    22
    val parse: gram -> string -> token list -> parsetree
wenzelm@18
    23
  end
wenzelm@18
    24
end;
wenzelm@18
    25
wenzelm@18
    26
functor ParserFun(structure Symtab: SYMTAB and XGram: XGRAM
wenzelm@18
    27
  and ParseTree: PARSE_TREE)(*: PARSER *) =  (* FIXME *)
wenzelm@18
    28
struct
wenzelm@18
    29
wenzelm@18
    30
structure XGram = XGram;
wenzelm@18
    31
structure ParseTree = ParseTree;
wenzelm@18
    32
structure Lexicon = ParseTree.Lexicon;
wenzelm@18
    33
open XGram ParseTree Lexicon;
wenzelm@18
    34
wenzelm@18
    35
wenzelm@18
    36
(** datatype gram **)
wenzelm@18
    37
wenzelm@18
    38
datatype symb = T of token | NT of string * int;
wenzelm@18
    39
wenzelm@18
    40
datatype gram =
wenzelm@18
    41
  Gram of string list * (symb list * string * int) list Symtab.table;
wenzelm@18
    42
wenzelm@18
    43
wenzelm@18
    44
(* empty_gram *)
wenzelm@18
    45
wenzelm@18
    46
val empty_gram = Gram ([], Symtab.null);
wenzelm@18
    47
wenzelm@18
    48
wenzelm@18
    49
(* extend_gram *)
wenzelm@18
    50
wenzelm@18
    51
(*prods are stored in reverse order*)
wenzelm@18
    52
wenzelm@18
    53
fun extend_gram (Gram (roots, tab)) new_roots xprods =
wenzelm@18
    54
  let
wenzelm@18
    55
    fun symb_of (Terminal s) = Some (T (Token s))
wenzelm@18
    56
      | symb_of (Nonterminal (s, p)) =
wenzelm@18
    57
          (case predef_term s of
wenzelm@18
    58
            EndToken => Some (NT (s, p))
wenzelm@18
    59
          | tk => Some (T tk))
wenzelm@18
    60
      | symb_of _ = None;
wenzelm@18
    61
wenzelm@18
    62
    fun prod_of (Prod (s, xsyms, c, p)) = (s, mapfilter symb_of xsyms, c, p);
wenzelm@18
    63
wenzelm@18
    64
    fun add_prod (tab, (s, syms, c, p)) =
wenzelm@18
    65
      (case Symtab.lookup (tab, s) of
wenzelm@18
    66
        None => Symtab.update ((s, [(syms, c, p)]), tab)
wenzelm@18
    67
      | Some prods => Symtab.update ((s, (syms, c, p) :: prods), tab));
wenzelm@18
    68
  in
wenzelm@18
    69
    Gram (new_roots union roots,
wenzelm@18
    70
      Symtab.balance (foldl add_prod (tab, map prod_of xprods)))
wenzelm@18
    71
  end;
wenzelm@18
    72
wenzelm@18
    73
wenzelm@18
    74
(* mk_gram *)
wenzelm@18
    75
wenzelm@18
    76
val mk_gram = extend_gram empty_gram;
wenzelm@18
    77
wenzelm@18
    78
wenzelm@18
    79
(* get_prods *)
wenzelm@18
    80
wenzelm@18
    81
fun get_prods tab s pred =
wenzelm@18
    82
  let
wenzelm@18
    83
    fun rfilter [] ys = ys
wenzelm@18
    84
      | rfilter (x :: xs) ys = rfilter xs (if pred x then x :: ys else ys);
wenzelm@18
    85
  in
wenzelm@18
    86
    (case Symtab.lookup (tab, s) of
wenzelm@18
    87
      Some prods => rfilter prods []
wenzelm@18
    88
    | None => [])
wenzelm@18
    89
  end;
wenzelm@18
    90
wenzelm@18
    91
wenzelm@18
    92
wenzelm@18
    93
(** parse **)
wenzelm@18
    94
wenzelm@18
    95
type state =
wenzelm@18
    96
  string * int
wenzelm@18
    97
    * parsetree list    (*before point*)
wenzelm@18
    98
    * symb list         (*after point*)
wenzelm@18
    99
    * string * int;
wenzelm@18
   100
wenzelm@18
   101
type earleystate = state list Array.array;
wenzelm@18
   102
wenzelm@18
   103
wenzelm@18
   104
wenzelm@18
   105
fun getProductions tab A i =
wenzelm@18
   106
  get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1);
wenzelm@18
   107
wenzelm@18
   108
fun getProductions' tab A i k =
wenzelm@18
   109
  get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1);
wenzelm@18
   110
wenzelm@18
   111
wenzelm@18
   112
wenzelm@18
   113
fun mkState i jj A ([NT(B,~1)],id,~1) =
wenzelm@18
   114
                     (A,max_pri,[],[NT (B,jj)],id,i)
wenzelm@18
   115
  | mkState i jj A (ss,id,j) = (A,j,[],ss,id,i) ;
wenzelm@18
   116
wenzelm@18
   117
wenzelm@18
   118
wenzelm@18
   119
fun conc (t,(k:int)) [] = (None, [(t,k)])
wenzelm@18
   120
  | conc (t,k) ((t',k')::ts) =
wenzelm@18
   121
      if (t = t')
wenzelm@18
   122
      then (Some k', ( if k' >= k
wenzelm@18
   123
                       then (t',k')::ts
wenzelm@18
   124
                       else (t,k)::ts )
wenzelm@18
   125
           )
wenzelm@18
   126
      else let val (n, ts') = conc (t,k) ts
wenzelm@18
   127
           in (n, (t',k')::ts')
wenzelm@18
   128
           end;
wenzelm@18
   129
wenzelm@18
   130
wenzelm@18
   131
fun update_tree ((B,(i,ts))::used) (A,t) =
wenzelm@18
   132
      if   (A = B)
wenzelm@18
   133
      then
wenzelm@18
   134
          let val (n,ts') = conc t ts
wenzelm@18
   135
          in  ((A,(i,ts'))::used, n)
wenzelm@18
   136
          end
wenzelm@18
   137
      else
wenzelm@18
   138
          let val (used', n) = update_tree used (A,t)
wenzelm@18
   139
          in  ((B,(i,ts)) :: used', n)
wenzelm@18
   140
          end;
wenzelm@18
   141
wenzelm@18
   142
wenzelm@18
   143
wenzelm@18
   144
fun update_index ((B,(i,ts))::used) (A,j) =
wenzelm@18
   145
      if   (A = B)
wenzelm@18
   146
      then (A,(j,ts)) :: used
wenzelm@18
   147
      else (B,(i,ts)) :: (update_index used (A,j));
wenzelm@18
   148
wenzelm@18
   149
wenzelm@18
   150
wenzelm@18
   151
fun getS A i Si =
wenzelm@18
   152
       filter (fn (_,_,_,(NT(B,j))::_,_,_)
wenzelm@18
   153
                                    => (A=B andalso j<=i)
wenzelm@18
   154
               |      _             => false
wenzelm@18
   155
              ) Si;
wenzelm@18
   156
wenzelm@18
   157
wenzelm@18
   158
wenzelm@18
   159
fun getS' A k n Si =
wenzelm@18
   160
       filter (fn (_,_,_,(NT(B,j))::_,_,_) => (A=B andalso
wenzelm@18
   161
                                               j<=k andalso
wenzelm@18
   162
                                               j>n )
wenzelm@18
   163
               |      _                    => false
wenzelm@18
   164
              ) Si;
wenzelm@18
   165
wenzelm@18
   166
wenzelm@18
   167
wenzelm@18
   168
fun getStates Estate i ii A k =
wenzelm@18
   169
       filter (fn (_,_,_,(NT(B,j))::_,_,_)
wenzelm@18
   170
                                    => (A=B andalso j<=k)
wenzelm@18
   171
               |      _             => false
wenzelm@18
   172
              )
wenzelm@18
   173
              (Array.sub (Estate, ii))
wenzelm@18
   174
;
wenzelm@18
   175
wenzelm@18
   176
wenzelm@18
   177
wenzelm@18
   178
(* MMW *)(* FIXME *)
wenzelm@18
   179
fun movedot_term (A,j,ts,T a::sa,id,i) c =
wenzelm@18
   180
                   if (valued_token c)
wenzelm@18
   181
                   then (A,j,(ts@[Tip c]),sa,id,i)
wenzelm@18
   182
                   else (A,j,ts,sa,id,i) ;
wenzelm@18
   183
wenzelm@18
   184
wenzelm@18
   185
wenzelm@18
   186
fun movedot_nonterm ts  (A,j,tss,NT(B,k) ::sa,id,i) =
wenzelm@18
   187
                         (A,j,tss@ts,sa,id,i) ;
wenzelm@18
   188
wenzelm@18
   189
wenzelm@18
   190
wenzelm@18
   191
fun movedot_lambda  _  [] = []
wenzelm@18
   192
  | movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ((t,ki)::ts) =
wenzelm@18
   193
    if k <= ki
wenzelm@18
   194
    then (B,j,tss@t,sa,id,i) ::
wenzelm@18
   195
         (movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts)
wenzelm@18
   196
    else movedot_lambda (B,j,tss,(NT(A,k))::sa,id,i) ts
wenzelm@18
   197
  ;
wenzelm@18
   198
wenzelm@18
   199
wenzelm@18
   200
wenzelm@18
   201
wenzelm@18
   202
fun PROCESSS Estate grammar i c states =
wenzelm@18
   203
wenzelm@18
   204
let
wenzelm@18
   205
fun processS used [] (Si,Sii) = (Si,Sii)
wenzelm@18
   206
 |  processS used (S::States) (Si,Sii) =
wenzelm@18
   207
     ( case S of
wenzelm@18
   208
wenzelm@18
   209
        (_,_,_,(NT (A,j))::_,_,_)  =>
wenzelm@18
   210
          let
wenzelm@18
   211
            val (used_neu, States_neu) =
wenzelm@18
   212
              ( case assoc (used, A) of
wenzelm@18
   213
                 Some (k,l) =>         (* A gehoert zu used *)
wenzelm@18
   214
wenzelm@18
   215
                   if (k <= j)          (* Prioritaet wurde
wenzelm@18
   216
                                           behandelt  *)
wenzelm@18
   217
                   then
wenzelm@18
   218
                        (used, movedot_lambda S l)
wenzelm@18
   219
wenzelm@18
   220
                   else         (* Prioritaet wurde noch nicht
wenzelm@18
   221
                                        behandelt  *)
wenzelm@18
   222
                        let val L =
wenzelm@18
   223
                                getProductions' grammar A j k
wenzelm@18
   224
                            val States' = map (mkState i j A) L
wenzelm@18
   225
                        in
wenzelm@18
   226
                            (update_index used (A,j),
wenzelm@18
   227
                             States'@ movedot_lambda S l
wenzelm@18
   228
                            )
wenzelm@18
   229
                        end
wenzelm@18
   230
wenzelm@18
   231
                | None =>       (* A gehoert nicht zu used *)
wenzelm@18
   232
                      let val L = getProductions grammar A j
wenzelm@18
   233
                          val States' = map (mkState i j A) L
wenzelm@18
   234
                      in
wenzelm@18
   235
                          ((A,(j,[]))::used, States')
wenzelm@18
   236
                      end
wenzelm@18
   237
              )
wenzelm@18
   238
          in
wenzelm@18
   239
             processS used_neu (States @ States_neu) (S::Si,Sii)
wenzelm@18
   240
          end
wenzelm@18
   241
wenzelm@18
   242
     |  (_,_,_,(T a)::_,_,_)  =>
wenzelm@18
   243
                processS used States
wenzelm@18
   244
                    (S::Si, if (matching_tokens (a, c))
wenzelm@18
   245
                            then  (movedot_term S c)::Sii   (* MMW *)(* FIXME *)
wenzelm@18
   246
                            else Sii
wenzelm@18
   247
                    )
wenzelm@18
   248
wenzelm@18
   249
wenzelm@18
   250
     |  (A,k,ts,[],id,j) =>
wenzelm@18
   251
          let val tt = if id = ""
wenzelm@18
   252
                       then ts
wenzelm@18
   253
                       else [Node(id,ts)]
wenzelm@18
   254
          in
wenzelm@18
   255
             if (j = i)
wenzelm@18
   256
             then
wenzelm@18
   257
                let val (used',O) = update_tree used (A,(tt,k))
wenzelm@18
   258
                in ( case O of
wenzelm@18
   259
                      None =>
wenzelm@18
   260
                         let val Slist = getS A k Si
wenzelm@18
   261
                             val States' =
wenzelm@18
   262
                              map (movedot_nonterm tt ) Slist
wenzelm@18
   263
                         in  processS used'
wenzelm@18
   264
                             (States @ States') (S::Si,Sii)
wenzelm@18
   265
                         end
wenzelm@18
   266
                   |  Some n =>
wenzelm@18
   267
                         if (n >= k)
wenzelm@18
   268
                         then
wenzelm@18
   269
                            processS used' States (S::Si,Sii)
wenzelm@18
   270
                         else
wenzelm@18
   271
                            let val Slist = getS' A k n Si
wenzelm@18
   272
                                val States' =
wenzelm@18
   273
                              map (movedot_nonterm tt ) Slist
wenzelm@18
   274
                            in
wenzelm@18
   275
                              processS used'
wenzelm@18
   276
                              (States @ States') (S::Si,Sii)
wenzelm@18
   277
                            end
wenzelm@18
   278
                   )
wenzelm@18
   279
                end
wenzelm@18
   280
wenzelm@18
   281
              else
wenzelm@18
   282
                processS used (States @
wenzelm@18
   283
                               map (movedot_nonterm tt)
wenzelm@18
   284
                               (getStates Estate i j A k))
wenzelm@18
   285
                (S::Si,Sii)
wenzelm@18
   286
           end
wenzelm@18
   287
     )
wenzelm@18
   288
in
wenzelm@18
   289
   processS [] states ([],[])
wenzelm@18
   290
end;
wenzelm@18
   291
wenzelm@18
   292
wenzelm@18
   293
wenzelm@18
   294
fun syntax_error toks =
wenzelm@18
   295
  error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
wenzelm@18
   296
wenzelm@18
   297
fun Produce grammar stateset i indata =
wenzelm@18
   298
      case (Array.sub (stateset,i)) of
wenzelm@18
   299
         [] => syntax_error indata (* MMW *)(* FIXME *)
wenzelm@18
   300
      |  s  =>
wenzelm@18
   301
           (case indata of
wenzelm@18
   302
                []  => Array.sub (stateset,i)
wenzelm@18
   303
             |  (c::cs) =>
wenzelm@18
   304
                       let val (si,sii) =
wenzelm@18
   305
                           PROCESSS stateset grammar i c s
wenzelm@18
   306
                       in
wenzelm@18
   307
                           Array.update (stateset,i,si);
wenzelm@18
   308
                           Array.update (stateset,i+1,sii);
wenzelm@18
   309
                           Produce grammar stateset (i+1) cs
wenzelm@18
   310
                       end
wenzelm@18
   311
           );
wenzelm@18
   312
wenzelm@18
   313
wenzelm@18
   314
fun get_trees [] = []
wenzelm@18
   315
  | get_trees ((_,_,pt_lst,_,_,_) :: stateset) =
wenzelm@18
   316
       let val l = get_trees stateset
wenzelm@18
   317
       in case pt_lst of
wenzelm@18
   318
            [ptree] => ptree :: l
wenzelm@18
   319
          |   _     => l
wenzelm@18
   320
       end;
wenzelm@18
   321
wenzelm@18
   322
wenzelm@18
   323
wenzelm@18
   324
fun earley grammar startsymbol indata =
wenzelm@18
   325
   let  val S0 =
wenzelm@18
   326
       [("S'",0,[],[NT (startsymbol,0), T EndToken],"",0)]
wenzelm@18
   327
        val s = length indata + 1     (* MMW *)(* FIXME was .. + 2 *)
wenzelm@18
   328
        val Estate = Array.array (s, [])
wenzelm@18
   329
   in   Array.update (Estate,0,S0);
wenzelm@18
   330
        let val l = Produce grammar Estate 0 indata    (* MMW *)(* FIXME was .. @ [EndToken] *)
wenzelm@18
   331
            val p_trees = get_trees l
wenzelm@18
   332
        in p_trees
wenzelm@18
   333
        end
wenzelm@18
   334
   end;
wenzelm@18
   335
wenzelm@18
   336
wenzelm@18
   337
(* FIXME demo *)
wenzelm@18
   338
fun parse (Gram (roots, prod_tab)) root toks =
wenzelm@18
   339
  if root mem roots then
wenzelm@18
   340
    (case earley prod_tab root toks of
wenzelm@18
   341
      [] => error "parse: no parse trees"
wenzelm@18
   342
    | pt :: _ => pt)
wenzelm@18
   343
  else error ("Unparsable category: " ^ root);
wenzelm@18
   344
wenzelm@18
   345
wenzelm@18
   346
end;
wenzelm@18
   347