src/Pure/Syntax/parser.ML
author wenzelm
Wed Jan 19 14:21:26 1994 +0100 (1994-01-19 ago)
changeset 237 a7d3e712767a
parent 46 f0f4978af183
child 258 e540b7d4ecb1
permissions -rw-r--r--
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
now much leaner (eliminated gramgraph, all data except tables of old
parser are shared); simplified the internal interfaces for syntax
extension;
wenzelm@18
     1
(*  Title:      Pure/Syntax/parser.ML
wenzelm@46
     2
    ID:         $Id$
wenzelm@18
     3
    Author:     Sonia Mahjoub and Markus Wenzel, TU Muenchen
wenzelm@18
     4
wenzelm@18
     5
Isabelle's main parser (used for terms and typs).
wenzelm@18
     6
wenzelm@18
     7
TODO:
wenzelm@237
     8
  fix ambiguous grammar problem
wenzelm@18
     9
  ~1 --> chain_pri
wenzelm@18
    10
  improve syntax error
wenzelm@237
    11
  extend_gram: remove 'roots' arg
wenzelm@18
    12
*)
wenzelm@18
    13
wenzelm@18
    14
signature PARSER =
wenzelm@18
    15
sig
wenzelm@237
    16
  structure Lexicon: LEXICON
wenzelm@237
    17
  structure SynExt: SYN_EXT
wenzelm@237
    18
  local open Lexicon SynExt SynExt.Ast in
wenzelm@18
    19
    type gram
wenzelm@18
    20
    val empty_gram: gram
wenzelm@237
    21
    val extend_gram: gram -> string list -> xprod list -> gram
wenzelm@237
    22
    val merge_grams: gram -> gram -> gram
wenzelm@237
    23
    val pretty_gram: gram -> Pretty.T list
wenzelm@237
    24
    datatype parsetree =
wenzelm@237
    25
      Node of string * parsetree list |
wenzelm@237
    26
      Tip of token
wenzelm@18
    27
    val parse: gram -> string -> token list -> parsetree
wenzelm@18
    28
  end
wenzelm@18
    29
end;
wenzelm@18
    30
wenzelm@237
    31
functor ParserFun(structure Symtab: SYMTAB and Lexicon: LEXICON
wenzelm@237
    32
  and SynExt: SYN_EXT)(*: PARSER *) =  (* FIXME *)
wenzelm@18
    33
struct
wenzelm@18
    34
wenzelm@237
    35
structure Pretty = SynExt.Ast.Pretty;
wenzelm@237
    36
structure Lexicon = Lexicon;
wenzelm@237
    37
structure SynExt = SynExt;
wenzelm@237
    38
open Lexicon SynExt;
wenzelm@18
    39
wenzelm@18
    40
wenzelm@18
    41
(** datatype gram **)
wenzelm@18
    42
wenzelm@237
    43
datatype symb =
wenzelm@237
    44
  Terminal of token |
wenzelm@237
    45
  Nonterminal of string * int;
wenzelm@18
    46
wenzelm@18
    47
datatype gram =
wenzelm@237
    48
  Gram of (string * (symb list * string * int)) list
wenzelm@237
    49
    * (symb list * string * int) list Symtab.table;
wenzelm@18
    50
wenzelm@237
    51
fun mk_gram prods = Gram (prods, Symtab.make_multi prods);
wenzelm@18
    52
wenzelm@18
    53
wenzelm@237
    54
(* empty, extend, merge grams *)
wenzelm@18
    55
wenzelm@237
    56
val empty_gram = mk_gram [];
wenzelm@18
    57
wenzelm@237
    58
fun extend_gram (gram1 as Gram (prods1, _)) _ xprods2 =
wenzelm@18
    59
  let
wenzelm@237
    60
    fun symb_of (Delim s) = Some (Terminal (Token s))
wenzelm@237
    61
      | symb_of (Argument (s, p)) =
wenzelm@18
    62
          (case predef_term s of
wenzelm@237
    63
            None => Some (Nonterminal (s, p))
wenzelm@237
    64
          | Some tk => Some (Terminal tk))
wenzelm@18
    65
      | symb_of _ = None;
wenzelm@18
    66
wenzelm@237
    67
    fun prod_of (XProd (lhs, xsymbs, const, pri)) =
wenzelm@237
    68
      (lhs, (mapfilter symb_of xsymbs, const, pri));
wenzelm@18
    69
wenzelm@237
    70
    val prods2 = distinct (map prod_of xprods2);
wenzelm@18
    71
  in
wenzelm@237
    72
    if prods2 subset prods1 then gram1
wenzelm@237
    73
    else mk_gram (extend_list prods1 prods2)
wenzelm@18
    74
  end;
wenzelm@18
    75
wenzelm@237
    76
fun merge_grams (gram1 as Gram (prods1, _)) (gram2 as Gram (prods2, _)) =
wenzelm@237
    77
  if prods2 subset prods1 then gram1
wenzelm@237
    78
  else if prods1 subset prods2 then gram2
wenzelm@237
    79
  else mk_gram (merge_lists prods1 prods2);
wenzelm@237
    80
wenzelm@18
    81
wenzelm@237
    82
(* pretty_gram *)
wenzelm@18
    83
wenzelm@237
    84
fun pretty_gram (Gram (prods, _)) =
wenzelm@237
    85
  let
wenzelm@237
    86
    fun pretty_name name = [Pretty.str (name ^ " =")];
wenzelm@18
    87
wenzelm@237
    88
    fun pretty_symb (Terminal (Token s)) = Pretty.str (quote s)
wenzelm@237
    89
      | pretty_symb (Terminal tok) = Pretty.str (str_of_token tok)
wenzelm@237
    90
      | pretty_symb (Nonterminal (s, p)) =
wenzelm@237
    91
          Pretty.str (s ^ "[" ^ string_of_int p ^ "]");
wenzelm@18
    92
wenzelm@237
    93
    fun pretty_const "" = []
wenzelm@237
    94
      | pretty_const c = [Pretty.str ("=> " ^ quote c)];
wenzelm@237
    95
wenzelm@237
    96
    fun pretty_pri p = [Pretty.str ("(" ^ string_of_int p ^ ")")];
wenzelm@237
    97
wenzelm@237
    98
    fun pretty_prod (name, (symbs, const, pri)) =
wenzelm@237
    99
      Pretty.block (Pretty.breaks (pretty_name name @
wenzelm@237
   100
        map pretty_symb symbs @ pretty_const const @ pretty_pri pri));
wenzelm@18
   101
  in
wenzelm@237
   102
    map pretty_prod prods
wenzelm@18
   103
  end;
wenzelm@18
   104
wenzelm@18
   105
wenzelm@18
   106
wenzelm@18
   107
(** parse **)
wenzelm@18
   108
wenzelm@237
   109
datatype parsetree =
wenzelm@237
   110
  Node of string * parsetree list |
wenzelm@237
   111
  Tip of token;
wenzelm@237
   112
wenzelm@18
   113
type state =
wenzelm@18
   114
  string * int
wenzelm@18
   115
    * parsetree list    (*before point*)
wenzelm@18
   116
    * symb list         (*after point*)
wenzelm@18
   117
    * string * int;
wenzelm@18
   118
wenzelm@18
   119
type earleystate = state list Array.array;
wenzelm@18
   120
wenzelm@18
   121
wenzelm@18
   122
wenzelm@237
   123
fun get_prods tab lhs pred =
wenzelm@237
   124
  filter pred (Symtab.lookup_multi (tab, lhs));
wenzelm@237
   125
wenzelm@18
   126
fun getProductions tab A i =
wenzelm@18
   127
  get_prods tab A (fn (_, _, j) => j >= i orelse j = ~1);
wenzelm@18
   128
wenzelm@18
   129
fun getProductions' tab A i k =
wenzelm@18
   130
  get_prods tab A (fn (_, _, j) => (j >= i andalso j < k) orelse j = ~1);
wenzelm@18
   131
wenzelm@18
   132
wenzelm@18
   133
wenzelm@237
   134
fun mkState i jj A ([Nonterminal (B, ~1)], id, ~1) =
wenzelm@237
   135
      (A, max_pri, [], [Nonterminal (B, jj)], id, i)
wenzelm@237
   136
  | mkState i jj A (ss, id, j) = (A, j, [], ss, id, i);
wenzelm@18
   137
wenzelm@18
   138
wenzelm@18
   139
wenzelm@237
   140
fun conc (t, k:int) [] = (None, [(t, k)])
wenzelm@237
   141
  | conc (t, k) ((t', k') :: ts) =
wenzelm@237
   142
      if t = t' then
wenzelm@237
   143
        (Some k', if k' >= k then (t', k') :: ts else (t, k) :: ts)
wenzelm@237
   144
      else
wenzelm@237
   145
        let val (n, ts') = conc (t, k) ts
wenzelm@237
   146
        in (n, (t', k') :: ts') end;
wenzelm@18
   147
wenzelm@18
   148
wenzelm@237
   149
fun update_tree ((B, (i, ts)) :: used) (A, t) =
wenzelm@237
   150
  if A = B then
wenzelm@237
   151
    let val (n, ts') = conc t ts
wenzelm@237
   152
    in ((A, (i, ts')) :: used, n) end
wenzelm@237
   153
  else
wenzelm@237
   154
    let val (used', n) = update_tree used (A, t)
wenzelm@237
   155
    in ((B, (i, ts)) :: used', n) end;
wenzelm@18
   156
wenzelm@18
   157
wenzelm@237
   158
fun update_index ((B, (i, ts)) :: used) (A, j) =
wenzelm@237
   159
  if A = B then (A, (j, ts)) :: used
wenzelm@237
   160
  else (B, (i, ts)) :: (update_index used (A, j));
wenzelm@18
   161
wenzelm@18
   162
wenzelm@18
   163
wenzelm@18
   164
fun getS A i Si =
wenzelm@237
   165
  filter
wenzelm@237
   166
    (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= i
wenzelm@237
   167
      | _ => false) Si;
wenzelm@18
   168
wenzelm@18
   169
fun getS' A k n Si =
wenzelm@237
   170
  filter
wenzelm@237
   171
    (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k andalso j > n
wenzelm@237
   172
      | _ => false) Si;
wenzelm@18
   173
wenzelm@18
   174
fun getStates Estate i ii A k =
wenzelm@237
   175
  filter
wenzelm@237
   176
    (fn (_, _, _, Nonterminal (B, j) :: _, _, _) => A = B andalso j <= k
wenzelm@237
   177
      | _ => false)
wenzelm@237
   178
    (Array.sub (Estate, ii));
wenzelm@18
   179
wenzelm@18
   180
wenzelm@18
   181
(* MMW *)(* FIXME *)
wenzelm@237
   182
fun movedot_term (A, j, ts, Terminal a :: sa, id, i) c =
wenzelm@237
   183
  if valued_token c then
wenzelm@237
   184
    (A, j, (ts @ [Tip c]), sa, id, i)
wenzelm@237
   185
  else (A, j, ts, sa, id, i);
wenzelm@18
   186
wenzelm@237
   187
fun movedot_nonterm ts (A, j, tss, Nonterminal (B, k) :: sa, id, i) =
wenzelm@237
   188
  (A, j, tss @ ts, sa, id, i);
wenzelm@18
   189
wenzelm@237
   190
fun movedot_lambda _ [] = []
wenzelm@237
   191
  | movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ((t, ki) :: ts) =
wenzelm@237
   192
      if k <= ki then
wenzelm@237
   193
        (B, j, tss @ t, sa, id, i) ::
wenzelm@237
   194
          movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts
wenzelm@237
   195
      else movedot_lambda (B, j, tss, Nonterminal (A, k) :: sa, id, i) ts;
wenzelm@18
   196
wenzelm@18
   197
wenzelm@18
   198
wenzelm@18
   199
fun PROCESSS Estate grammar i c states =
wenzelm@18
   200
let
wenzelm@237
   201
fun processS used [] (Si, Sii) = (Si, Sii)
wenzelm@237
   202
  | processS used (S :: States) (Si, Sii) =
wenzelm@237
   203
      (case S of
wenzelm@237
   204
        (_, _, _, Nonterminal (A, j) :: _, _, _) =>
wenzelm@18
   205
          let
wenzelm@18
   206
            val (used_neu, States_neu) =
wenzelm@237
   207
              (case assoc (used, A) of
wenzelm@237
   208
                Some (k, l) =>      (*A gehoert zu used*)
wenzelm@237
   209
                  if k <= j         (*Prioritaet wurde behandelt*)
wenzelm@237
   210
                  then (used, movedot_lambda S l)
wenzelm@237
   211
                  else              (*Prioritaet wurde noch nicht behandelt*)
wenzelm@237
   212
                    let
wenzelm@237
   213
                      val L = getProductions' grammar A j k;
wenzelm@237
   214
                      val States' = map (mkState i j A) L;
wenzelm@237
   215
                    in
wenzelm@237
   216
                      (update_index used (A, j), States' @ movedot_lambda S l)
wenzelm@237
   217
                    end
wenzelm@18
   218
wenzelm@237
   219
              | None =>             (*A gehoert nicht zu used*)
wenzelm@237
   220
                  let
wenzelm@237
   221
                    val L = getProductions grammar A j;
wenzelm@237
   222
                    val States' = map (mkState i j A) L;
wenzelm@237
   223
                  in
wenzelm@237
   224
                    ((A, (j, [])) :: used, States')
wenzelm@237
   225
                  end)
wenzelm@18
   226
          in
wenzelm@237
   227
            processS used_neu (States @ States_neu) (S :: Si, Sii)
wenzelm@18
   228
          end
wenzelm@237
   229
      | (_, _, _, Terminal a :: _, _, _) =>
wenzelm@237
   230
          processS used States
wenzelm@237
   231
            (S :: Si,
wenzelm@237
   232
              if matching_tokens (a, c) then movedot_term S c :: Sii else Sii)
wenzelm@237
   233
                                          (* MMW *)(* FIXME *)
wenzelm@18
   234
wenzelm@237
   235
      | (A, k, ts, [], id, j) =>
wenzelm@237
   236
          let
wenzelm@237
   237
            val tt = if id = "" then ts else [Node (id, ts)]
wenzelm@18
   238
          in
wenzelm@237
   239
            if j = i then
wenzelm@237
   240
              let
wenzelm@237
   241
                val (used', O) = update_tree used (A, (tt, k));
wenzelm@237
   242
              in
wenzelm@237
   243
                (case O of
wenzelm@237
   244
                  None =>
wenzelm@237
   245
                    let
wenzelm@237
   246
                      val Slist = getS A k Si;
wenzelm@237
   247
                      val States' = map (movedot_nonterm tt) Slist;
wenzelm@237
   248
                    in
wenzelm@237
   249
                      processS used' (States @ States') (S :: Si, Sii)
wenzelm@237
   250
                    end
wenzelm@237
   251
                | Some n =>
wenzelm@237
   252
                    if n >= k then
wenzelm@237
   253
                      processS used' States (S :: Si, Sii)
wenzelm@237
   254
                    else
wenzelm@237
   255
                      let
wenzelm@237
   256
                        val Slist = getS' A k n Si;
wenzelm@237
   257
                        val States' = map (movedot_nonterm tt) Slist;
wenzelm@237
   258
                      in
wenzelm@237
   259
                        processS used' (States @ States') (S :: Si, Sii)
wenzelm@237
   260
                      end)
wenzelm@237
   261
              end
wenzelm@237
   262
            else
wenzelm@237
   263
              processS used
wenzelm@237
   264
                (States @ map (movedot_nonterm tt) (getStates Estate i j A k))
wenzelm@237
   265
                (S :: Si, Sii)
wenzelm@237
   266
          end)
wenzelm@18
   267
in
wenzelm@237
   268
  processS [] states ([], [])
wenzelm@18
   269
end;
wenzelm@18
   270
wenzelm@18
   271
wenzelm@18
   272
wenzelm@18
   273
fun syntax_error toks =
wenzelm@18
   274
  error ("Syntax error at: " ^ space_implode " " (map str_of_token toks));
wenzelm@18
   275
wenzelm@237
   276
fun produce grammar stateset i indata =
wenzelm@237
   277
  (case Array.sub (stateset, i) of
wenzelm@237
   278
    [] => syntax_error indata (* MMW *)(* FIXME *)
wenzelm@237
   279
  | s =>
wenzelm@237
   280
    (case indata of
wenzelm@237
   281
      [] => Array.sub (stateset, i)
wenzelm@237
   282
    | c :: cs =>
wenzelm@237
   283
      let
wenzelm@237
   284
        val (si, sii) = PROCESSS stateset grammar i c s;
wenzelm@237
   285
      in
wenzelm@237
   286
        Array.update (stateset, i, si);
wenzelm@237
   287
        Array.update (stateset, i + 1, sii);
wenzelm@237
   288
        produce grammar stateset (i + 1) cs
wenzelm@237
   289
      end));
wenzelm@18
   290
wenzelm@18
   291
wenzelm@237
   292
(*(* FIXME new *)
wenzelm@237
   293
val get_trees = mapfilter (fn (_, _, [pt], _, _, _) => Some pt | _ => None);
wenzelm@237
   294
*)
wenzelm@237
   295
wenzelm@18
   296
fun get_trees [] = []
wenzelm@237
   297
  | get_trees ((_, _, pt_lst, _, _, _) :: stateset) =
wenzelm@237
   298
      let val l = get_trees stateset
wenzelm@237
   299
      in case pt_lst of
wenzelm@237
   300
           [ptree] => ptree :: l
wenzelm@237
   301
         |   _     => l
wenzelm@237
   302
      end;
wenzelm@18
   303
wenzelm@18
   304
fun earley grammar startsymbol indata =
wenzelm@237
   305
  let
wenzelm@237
   306
    val S0 = [("S'", 0, [], [Nonterminal (startsymbol, 0), Terminal EndToken], "", 0)];
wenzelm@237
   307
    val s = length indata + 1;     (* MMW *)(* FIXME was .. + 2 *)
wenzelm@237
   308
    val Estate = Array.array (s, []);
wenzelm@237
   309
  in
wenzelm@237
   310
    Array.update (Estate, 0, S0);
wenzelm@237
   311
    let
wenzelm@237
   312
      val l = produce grammar Estate 0 indata;    (* MMW *)(* FIXME was .. @ [EndToken] *)
wenzelm@237
   313
      val p_trees = get_trees l;
wenzelm@237
   314
    in p_trees end
wenzelm@237
   315
  end;
wenzelm@18
   316
wenzelm@18
   317
wenzelm@18
   318
(* FIXME demo *)
wenzelm@237
   319
fun parse (Gram (_, prod_tab)) start toks =
wenzelm@237
   320
  (case earley prod_tab start toks of
wenzelm@237
   321
    [] => sys_error "parse: no parse trees"
wenzelm@237
   322
  | pt :: _ => pt);
wenzelm@18
   323
wenzelm@18
   324
wenzelm@18
   325
end;
wenzelm@18
   326