| author | wenzelm | 
| Fri, 24 Sep 2021 22:23:26 +0200 | |
| changeset 74362 | 0135a0c77b64 | 
| parent 71468 | 53fcbede7bf7 | 
| child 77723 | b761c91c2447 | 
| permissions | -rw-r--r-- | 
| 18 | 1 | (* Title: Pure/Syntax/parser.ML | 
| 67545 | 2 | Author: Carsten Clasohm, Sonia Mahjoub | 
| 3 | Author: Makarius | |
| 18 | 4 | |
| 67545 | 5 | General context-free parser for the inner syntax of terms and types. | 
| 18 | 6 | *) | 
| 7 | ||
| 8 | signature PARSER = | |
| 15752 | 9 | sig | 
| 1507 | 10 | type gram | 
| 11 | val empty_gram: gram | |
| 42288 
2074b31650e6
discontinued special treatment of structure Syntax_Ext (formerly Syn_Ext);
 wenzelm parents: 
42282diff
changeset | 12 | val extend_gram: Syntax_Ext.xprod list -> gram -> gram | 
| 45632 
b23c42b9f78a
prefer Parser.make_gram over Parser.merge_gram, to approximate n-ary merges on theory import;
 wenzelm parents: 
44102diff
changeset | 13 | val make_gram: Syntax_Ext.xprod list -> gram | 
| 1507 | 14 | val pretty_gram: gram -> Pretty.T list | 
| 15 | datatype parsetree = | |
| 16 | Node of string * parsetree list | | |
| 17 | Tip of Lexicon.token | |
| 42374 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 18 | exception PARSETREE of parsetree | 
| 42205 
22f5cc06c419
direct pretty printing of parsetrees -- prevent diagnostic output from crashing due to undeclared entities;
 wenzelm parents: 
41378diff
changeset | 19 | val pretty_parsetree: parsetree -> Pretty.T | 
| 45641 | 20 | val parse: gram -> string -> Lexicon.token list -> parsetree list | 
| 41378 | 21 | val branching_level: int Config.T | 
| 15752 | 22 | end; | 
| 1507 | 23 | |
| 15752 | 24 | structure Parser: PARSER = | 
| 18 | 25 | struct | 
| 15752 | 26 | |
| 18 | 27 | (** datatype gram **) | 
| 28 | ||
| 67541 | 29 | (* nonterminals *) | 
| 30 | ||
| 67545 | 31 | (*production for the NTs are stored in a vector, indexed by the NT tag*) | 
| 67541 | 32 | type nt = int; | 
| 33 | ||
| 67545 | 34 | type tags = nt Symtab.table; | 
| 35 | val tags_empty: tags = Symtab.empty; | |
| 36 | fun tags_content (tags: tags) = sort_by #1 (Symtab.dest tags); | |
| 37 | fun tags_lookup (tags: tags) = Symtab.lookup tags; | |
| 38 | fun tags_insert tag (tags: tags) = Symtab.update_new tag tags; | |
| 39 | fun tags_name (tags: tags) = the o Inttab.lookup (Inttab.make (map swap (Symtab.dest tags))); | |
| 40 | ||
| 67541 | 41 | type nts = Inttab.set; | 
| 42 | val nts_empty: nts = Inttab.empty; | |
| 43 | val nts_merge: nts * nts -> nts = Inttab.merge (K true); | |
| 44 | fun nts_insert nt : nts -> nts = Inttab.insert_set nt; | |
| 45 | fun nts_member (nts: nts) = Inttab.defined nts; | |
| 46 | fun nts_fold f (nts: nts) = Inttab.fold (f o #1) nts; | |
| 47 | fun nts_subset (nts1: nts, nts2: nts) = Inttab.forall (nts_member nts2 o #1) nts1; | |
| 48 | fun nts_is_empty (nts: nts) = Inttab.is_empty nts; | |
| 49 | fun nts_is_unique (nts: nts) = nts_is_empty nts orelse Inttab.is_single nts; | |
| 50 | ||
| 51 | ||
| 67539 | 52 | (* tokens *) | 
| 53 | ||
| 67549 | 54 | structure Tokens = Table(type key = Lexicon.token val ord = Lexicon.tokens_match_ord); | 
| 67539 | 55 | |
| 56 | type tokens = Tokens.set; | |
| 57 | val tokens_empty: tokens = Tokens.empty; | |
| 58 | val tokens_merge: tokens * tokens -> tokens = Tokens.merge (K true); | |
| 59 | fun tokens_insert tk : tokens -> tokens = Tokens.insert_set tk; | |
| 60 | fun tokens_remove tk : tokens -> tokens = Tokens.remove_set tk; | |
| 61 | fun tokens_member (tokens: tokens) = Tokens.defined tokens; | |
| 62 | fun tokens_is_empty (tokens: tokens) = Tokens.is_empty tokens; | |
| 63 | fun tokens_fold f (tokens: tokens) = Tokens.fold (f o #1) tokens; | |
| 64 | val tokens_union = tokens_fold tokens_insert; | |
| 65 | val tokens_subtract = tokens_fold tokens_remove; | |
| 66 | fun tokens_find P (tokens: tokens) = | |
| 67 | Tokens.get_first (fn (tok, ()) => if P tok then SOME tok else NONE) tokens; | |
| 67541 | 68 | fun tokens_add (nt: nt, tokens) = if tokens_is_empty tokens then I else cons (nt, tokens); | 
| 67533 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 69 | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 70 | |
| 67545 | 71 | (* productions *) | 
| 72 | ||
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 73 | datatype symb = | 
| 67545 | 74 | Terminal of Lexicon.token | | 
| 75 | Nonterminal of nt * int; (*(tag, prio)*) | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 76 | |
| 67540 | 77 | type prods = (symb list * string * int) list Tokens.table; (*start_token ~> [(rhs, name, prio)]*) | 
| 78 | ||
| 79 | val prods_empty: prods = Tokens.empty; | |
| 80 | fun prods_lookup (prods: prods) = Tokens.lookup_list prods; | |
| 81 | fun prods_update entry : prods -> prods = Tokens.update entry; | |
| 82 | fun prods_content (prods: prods) = distinct (op =) (maps #2 (Tokens.dest prods)); | |
| 83 | ||
| 67545 | 84 | type nt_gram = (nts * tokens) * prods; (*dependent_nts, start_tokens, prods*) | 
| 85 | (*depent_nts is a set of all NTs whose lookahead depends on this NT's lookahead*) | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 86 | |
| 67540 | 87 | val nt_gram_empty: nt_gram = ((nts_empty, tokens_empty), prods_empty); | 
| 67533 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 88 | |
| 67531 | 89 | type chains = unit Int_Graph.T; | 
| 90 | fun chains_preds (chains: chains) = Int_Graph.immediate_preds chains; | |
| 91 | fun chains_all_preds (chains: chains) = Int_Graph.all_preds chains; | |
| 92 | fun chains_all_succs (chains: chains) = Int_Graph.all_succs chains; | |
| 93 | val chains_empty: chains = Int_Graph.empty; | |
| 94 | fun chains_member (chains: chains) = Int_Graph.is_edge chains; | |
| 95 | fun chains_declare nt : chains -> chains = Int_Graph.default_node (nt, ()); | |
| 96 | fun chains_insert (from, to) = | |
| 97 | chains_declare from #> chains_declare to #> Int_Graph.add_edge (from, to); | |
| 98 | ||
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 99 | datatype gram = | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 100 | Gram of | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 101 |    {nt_count: int,
 | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 102 | prod_count: int, | 
| 67531 | 103 | tags: tags, | 
| 104 | chains: chains, | |
| 67533 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 105 | lambdas: nts, | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 106 | prods: nt_gram Vector.vector}; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 107 | (*"tags" is used to map NT names (i.e. strings) to tags; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 108 | chain productions are not stored as normal productions | 
| 67517 
add9a9f6a290
explicit graph for chains, which contains all nts as nodes;
 wenzelm parents: 
67516diff
changeset | 109 | but instead as an entry in "chains": from -> to; | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 110 | lambda productions are stored as normal productions | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 111 | and also as an entry in "lambdas"*) | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 112 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 113 | (*productions for which no starting token is | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 114 | known yet are associated with this token*) | 
| 38713 | 115 | val unknown_start = Lexicon.eof; | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 116 | |
| 67539 | 117 | fun get_start tks = | 
| 118 | (case Tokens.min tks of | |
| 119 | SOME (tk, _) => tk | |
| 120 | | NONE => unknown_start); | |
| 67516 | 121 | |
| 67543 | 122 | fun add_production prods (lhs, new_prod as (rhs, _, pri)) (chains, lambdas) = | 
| 123 | let | |
| 124 | (*store chain if it does not already exist*) | |
| 125 | val (chain, new_chain, chains') = | |
| 126 | (case (pri, rhs) of | |
| 127 | (~1, [Nonterminal (from, ~1)]) => | |
| 128 | if chains_member chains (from, lhs) | |
| 129 | then (SOME from, false, chains) | |
| 130 | else (SOME from, true, chains_insert (from, lhs) chains) | |
| 71468 
53fcbede7bf7
more robust (amending add9a9f6a290): proper syntax error instead of exception for grammar with unreachable nonterminals, e.g. nonterminal f1 syntax "_F" :: "f1 ⇒ 'b"  ("F _" 10);
 wenzelm parents: 
69575diff
changeset | 131 | | _ => | 
| 
53fcbede7bf7
more robust (amending add9a9f6a290): proper syntax error instead of exception for grammar with unreachable nonterminals, e.g. nonterminal f1 syntax "_F" :: "f1 ⇒ 'b"  ("F _" 10);
 wenzelm parents: 
69575diff
changeset | 132 | let | 
| 
53fcbede7bf7
more robust (amending add9a9f6a290): proper syntax error instead of exception for grammar with unreachable nonterminals, e.g. nonterminal f1 syntax "_F" :: "f1 ⇒ 'b"  ("F _" 10);
 wenzelm parents: 
69575diff
changeset | 133 | val chains' = chains | 
| 
53fcbede7bf7
more robust (amending add9a9f6a290): proper syntax error instead of exception for grammar with unreachable nonterminals, e.g. nonterminal f1 syntax "_F" :: "f1 ⇒ 'b"  ("F _" 10);
 wenzelm parents: 
69575diff
changeset | 134 | |> chains_declare lhs | 
| 
53fcbede7bf7
more robust (amending add9a9f6a290): proper syntax error instead of exception for grammar with unreachable nonterminals, e.g. nonterminal f1 syntax "_F" :: "f1 ⇒ 'b"  ("F _" 10);
 wenzelm parents: 
69575diff
changeset | 135 | |> fold (fn Nonterminal (nt, _) => chains_declare nt | _ => I) rhs; | 
| 
53fcbede7bf7
more robust (amending add9a9f6a290): proper syntax error instead of exception for grammar with unreachable nonterminals, e.g. nonterminal f1 syntax "_F" :: "f1 ⇒ 'b"  ("F _" 10);
 wenzelm parents: 
69575diff
changeset | 136 | in (NONE, false, chains') end); | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 137 | |
| 67543 | 138 | (*propagate new chain in lookahead and lambdas; | 
| 139 | added_starts is used later to associate existing | |
| 140 | productions with new starting tokens*) | |
| 141 | val (added_starts, lambdas') = | |
| 142 | if not new_chain then ([], lambdas) | |
| 143 | else | |
| 144 | let (*lookahead of chain's source*) | |
| 145 | val ((_, from_tks), _) = Array.sub (prods, the chain); | |
| 1175 
1b15a4b1a4f7
added comments; fixed a bug; reduced memory usage slightly
 clasohm parents: 
1147diff
changeset | 146 | |
| 67543 | 147 | (*copy from's lookahead to chain's destinations*) | 
| 148 | fun copy_lookahead to = | |
| 149 | let | |
| 150 | val ((to_nts, to_tks), ps) = Array.sub (prods, to); | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 151 | |
| 67543 | 152 | val new_tks = tokens_subtract to_tks from_tks; (*added lookahead tokens*) | 
| 153 | val to_tks' = tokens_merge (to_tks, new_tks); | |
| 154 | val _ = Array.update (prods, to, ((to_nts, to_tks'), ps)); | |
| 155 | in tokens_add (to, new_tks) end; | |
| 1175 
1b15a4b1a4f7
added comments; fixed a bug; reduced memory usage slightly
 clasohm parents: 
1147diff
changeset | 156 | |
| 67543 | 157 | val tos = chains_all_succs chains' [lhs]; | 
| 158 | in | |
| 159 | (fold copy_lookahead tos [], | |
| 160 | lambdas |> nts_member lambdas lhs ? fold nts_insert tos) | |
| 161 | end; | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 162 | |
| 67543 | 163 | (*test if new production can produce lambda | 
| 164 | (rhs must either be empty or only consist of lambda NTs)*) | |
| 165 | val new_lambdas = | |
| 166 | if forall | |
| 167 | (fn Nonterminal (id, _) => nts_member lambdas' id | |
| 168 | | Terminal _ => false) rhs | |
| 169 | then SOME (filter_out (nts_member lambdas') (chains_all_succs chains' [lhs])) | |
| 170 | else NONE; | |
| 171 | val lambdas'' = fold nts_insert (these new_lambdas) lambdas'; | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 172 | |
| 67543 | 173 | (*list optional terminal and all nonterminals on which the lookahead | 
| 174 | of a production depends*) | |
| 175 | fun lookahead_dependency _ [] nts = (NONE, nts) | |
| 176 | | lookahead_dependency _ (Terminal tk :: _) nts = (SOME tk, nts) | |
| 177 | | lookahead_dependency lambdas (Nonterminal (nt, _) :: symbs) nts = | |
| 178 | if nts_member lambdas nt then | |
| 179 | lookahead_dependency lambdas symbs (nts_insert nt nts) | |
| 180 | else (NONE, nts_insert nt nts); | |
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 181 | |
| 67543 | 182 | (*get all known starting tokens for a nonterminal*) | 
| 183 | fun starts_for_nt nt = snd (fst (Array.sub (prods, nt))); | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 184 | |
| 67543 | 185 | (*update prods, lookaheads, and lambdas according to new lambda NTs*) | 
| 186 | val (added_starts', lambdas') = | |
| 187 | let | |
| 188 | (*propagate added lambda NT*) | |
| 189 | fun propagate_lambda [] added_starts lambdas = (added_starts, lambdas) | |
| 190 | | propagate_lambda (l :: ls) added_starts lambdas = | |
| 191 | let | |
| 192 | (*get lookahead for lambda NT*) | |
| 193 | val ((dependent, l_starts), _) = Array.sub (prods, l); | |
| 1175 
1b15a4b1a4f7
added comments; fixed a bug; reduced memory usage slightly
 clasohm parents: 
1147diff
changeset | 194 | |
| 67543 | 195 | (*check productions whose lookahead may depend on lambda NT*) | 
| 196 | fun examine_prods [] add_lambda nt_dependencies added_tks nt_prods = | |
| 197 | (add_lambda, nt_dependencies, added_tks, nt_prods) | |
| 198 | | examine_prods ((p as (rhs, _, _)) :: ps) add_lambda | |
| 199 | nt_dependencies added_tks nt_prods = | |
| 200 | let val (tk, nts) = lookahead_dependency lambdas rhs nts_empty in | |
| 201 | if nts_member nts l then (*update production's lookahead*) | |
| 202 | let | |
| 203 | val new_lambda = | |
| 204 | is_none tk andalso nts_subset (nts, lambdas); | |
| 18 | 205 | |
| 67543 | 206 | val new_tks = | 
| 207 | tokens_empty | |
| 208 | |> fold tokens_insert (the_list tk) | |
| 209 | |> nts_fold (tokens_union o starts_for_nt) nts | |
| 210 | |> tokens_subtract l_starts; | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 211 | |
| 67543 | 212 | val added_tks' = tokens_merge (added_tks, new_tks); | 
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 213 | |
| 67543 | 214 | val nt_dependencies' = nts_merge (nt_dependencies, nts); | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 215 | |
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 216 | (*associate production with new starting tokens*) | 
| 67539 | 217 | fun copy tk nt_prods = | 
| 67543 | 218 | prods_update (tk, p :: prods_lookup nt_prods tk) nt_prods; | 
| 219 | ||
| 220 | val nt_prods' = nt_prods | |
| 221 | |> tokens_fold copy new_tks | |
| 67550 | 222 | |> new_lambda ? copy Lexicon.dummy; | 
| 67543 | 223 | in | 
| 224 | examine_prods ps (add_lambda orelse new_lambda) | |
| 225 | nt_dependencies' added_tks' nt_prods' | |
| 226 | end | |
| 227 | else (*skip production*) | |
| 228 | examine_prods ps add_lambda nt_dependencies added_tks nt_prods | |
| 229 | end; | |
| 230 | ||
| 231 | (*check each NT whose lookahead depends on new lambda NT*) | |
| 232 | fun process_nts nt (added_lambdas, added_starts) = | |
| 233 | let | |
| 234 | val ((old_nts, old_tks), nt_prods) = Array.sub (prods, nt); | |
| 235 | ||
| 236 | (*existing productions whose lookahead may depend on l*) | |
| 237 | val tk_prods = prods_lookup nt_prods (get_start l_starts); | |
| 238 | ||
| 239 | (*add_lambda is true if an existing production of the nt | |
| 240 | produces lambda due to the new lambda NT l*) | |
| 241 | val (add_lambda, nt_dependencies, added_tks, nt_prods') = | |
| 242 | examine_prods tk_prods false nts_empty tokens_empty nt_prods; | |
| 243 | ||
| 244 | val new_nts = nts_merge (old_nts, nt_dependencies); | |
| 245 | val new_tks = tokens_merge (old_tks, added_tks); | |
| 246 | ||
| 247 | val added_lambdas' = added_lambdas |> add_lambda ? cons nt; | |
| 248 | val _ = Array.update (prods, nt, ((new_nts, new_tks), nt_prods')); | |
| 249 | (*N.B. that because the tks component | |
| 250 | is used to access existing | |
| 251 | productions we have to add new | |
| 252 | tokens at the _end_ of the list*) | |
| 253 | val added_starts' = tokens_add (nt, added_tks) added_starts; | |
| 254 | in (added_lambdas', added_starts') end; | |
| 377 
ab8917806779
lookaheads are now computed faster (during the grammar is built)
 clasohm parents: 
373diff
changeset | 255 | |
| 67543 | 256 | val (added_lambdas, added_starts') = | 
| 257 | nts_fold process_nts dependent ([], added_starts); | |
| 258 | val added_lambdas' = filter_out (nts_member lambdas) added_lambdas; | |
| 259 | in | |
| 260 | propagate_lambda (ls @ added_lambdas') added_starts' | |
| 261 | (fold nts_insert added_lambdas' lambdas) | |
| 262 | end; | |
| 263 | in | |
| 264 | propagate_lambda | |
| 265 | (nts_fold (fn l => not (nts_member lambdas l) ? cons l) lambdas'' []) | |
| 266 | added_starts lambdas'' | |
| 267 | end; | |
| 268 | ||
| 269 | (*insert production into grammar*) | |
| 270 | val added_starts' = | |
| 271 | if is_some chain then added_starts' (*don't store chain production*) | |
| 272 | else | |
| 273 | let | |
| 274 | (*lookahead tokens of new production and on which | |
| 275 | NTs lookahead depends*) | |
| 276 | val (start_tk, start_nts) = lookahead_dependency lambdas' rhs nts_empty; | |
| 277 | ||
| 278 | val start_tks = | |
| 279 | tokens_empty | |
| 280 | |> fold tokens_insert (the_list start_tk) | |
| 281 | |> nts_fold (tokens_union o starts_for_nt) start_nts; | |
| 282 | ||
| 283 | val start_tks' = | |
| 284 | start_tks | |
| 67550 | 285 | |> (if is_some new_lambdas then tokens_insert Lexicon.dummy | 
| 67543 | 286 | else if tokens_is_empty start_tks then tokens_insert unknown_start | 
| 287 | else I); | |
| 288 | ||
| 289 | (*add lhs NT to list of dependent NTs in lookahead*) | |
| 290 | fun add_nts nt initial = | |
| 291 | (if initial then | |
| 292 | let val ((old_nts, old_tks), ps) = Array.sub (prods, nt) in | |
| 293 | if nts_member old_nts lhs then () | |
| 294 | else Array.update (prods, nt, ((nts_insert lhs old_nts, old_tks), ps)) | |
| 295 | end | |
| 296 | else (); false); | |
| 297 | ||
| 298 | (*add new start tokens to chained NTs' lookahead list; | |
| 299 | also store new production for lhs NT*) | |
| 300 | fun add_tks [] added = added | |
| 301 | | add_tks (nt :: nts) added = | |
| 302 | let | |
| 303 | val ((old_nts, old_tks), nt_prods) = Array.sub (prods, nt); | |
| 304 | ||
| 305 | val new_tks = tokens_subtract old_tks start_tks; | |
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 306 | |
| 67543 | 307 | (*store new production*) | 
| 308 | fun store tk (prods, _) = | |
| 309 | let | |
| 310 | val tk_prods = prods_lookup prods tk; | |
| 311 | val tk_prods' = new_prod :: tk_prods; | |
| 312 | val prods' = prods_update (tk, tk_prods') prods; | |
| 313 | in (prods', true) end; | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 314 | |
| 67543 | 315 | val (nt_prods', changed) = (nt_prods, false) | 
| 316 | |> nt = lhs ? tokens_fold store start_tks'; | |
| 317 | val _ = | |
| 318 | if not changed andalso tokens_is_empty new_tks then () | |
| 319 | else | |
| 320 | Array.update | |
| 321 | (prods, nt, ((old_nts, tokens_merge (old_tks, new_tks)), nt_prods')); | |
| 322 | in add_tks nts (tokens_add (nt, new_tks) added) end; | |
| 323 | val _ = nts_fold add_nts start_nts true; | |
| 324 | in add_tks (chains_all_succs chains' [lhs]) [] end; | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 325 | |
| 67543 | 326 | (*associate productions with new lookaheads*) | 
| 327 | val _ = | |
| 328 | let | |
| 329 | (*propagate added start tokens*) | |
| 330 | fun add_starts [] = () | |
| 331 | | add_starts ((changed_nt, new_tks) :: starts) = | |
| 332 | let | |
| 333 | (*token under which old productions which | |
| 334 | depend on changed_nt could be stored*) | |
| 335 | val key = | |
| 336 | tokens_find (not o tokens_member new_tks) (starts_for_nt changed_nt) | |
| 337 | |> the_default unknown_start; | |
| 338 | ||
| 339 | (*copy productions whose lookahead depends on changed_nt; | |
| 340 | if key = SOME unknown_start then tk_prods is used to hold | |
| 341 | the productions not copied*) | |
| 342 | fun update_prods [] result = result | |
| 343 | | update_prods ((p as (rhs, _: string, _: nt)) :: ps) | |
| 344 | (tk_prods, nt_prods) = | |
| 345 | let | |
| 346 | (*lookahead dependency for production*) | |
| 347 | val (tk, depends) = lookahead_dependency lambdas' rhs nts_empty; | |
| 348 | ||
| 349 | (*test if this production has to be copied*) | |
| 350 | val update = nts_member depends changed_nt; | |
| 18 | 351 | |
| 67543 | 352 | (*test if production could already be associated with | 
| 353 | a member of new_tks*) | |
| 354 | val lambda = | |
| 355 | not (nts_is_unique depends) orelse | |
| 356 | not (nts_is_empty depends) andalso is_some tk | |
| 357 | andalso tokens_member new_tks (the tk); | |
| 358 | ||
| 359 | (*associate production with new starting tokens*) | |
| 360 | fun copy tk nt_prods = | |
| 361 | let | |
| 362 | val tk_prods = prods_lookup nt_prods tk; | |
| 363 | val tk_prods' = | |
| 364 | if not lambda then p :: tk_prods | |
| 365 | else insert (op =) p tk_prods; | |
| 366 | (*if production depends on lambda NT we | |
| 367 | have to look for duplicates*) | |
| 368 | in prods_update (tk, tk_prods') nt_prods end; | |
| 369 | val result = | |
| 370 | if update then (tk_prods, tokens_fold copy new_tks nt_prods) | |
| 371 | else if key = unknown_start then (p :: tk_prods, nt_prods) | |
| 372 | else (tk_prods, nt_prods); | |
| 373 | in update_prods ps result end; | |
| 18 | 374 | |
| 67543 | 375 | (*copy existing productions for new starting tokens*) | 
| 376 | fun process_nts nt = | |
| 377 | let | |
| 378 | val ((nts, tks), nt_prods) = Array.sub (prods, nt); | |
| 379 | ||
| 380 | val tk_prods = prods_lookup nt_prods key; | |
| 381 | ||
| 382 | (*associate productions with new lookahead tokens*) | |
| 383 | val (tk_prods', nt_prods') = update_prods tk_prods ([], nt_prods); | |
| 384 | ||
| 385 | val nt_prods'' = | |
| 386 | if key = unknown_start then | |
| 387 | prods_update (key, tk_prods') nt_prods' | |
| 388 | else nt_prods'; | |
| 389 | ||
| 390 | val added_tks = tokens_subtract tks new_tks; | |
| 391 | val tks' = tokens_merge (tks, added_tks); | |
| 392 | val _ = Array.update (prods, nt, ((nts, tks'), nt_prods'')); | |
| 393 | in tokens_add (nt, added_tks) end; | |
| 394 | ||
| 395 | val ((dependent, _), _) = Array.sub (prods, changed_nt); | |
| 396 | in add_starts (starts @ nts_fold process_nts dependent []) end; | |
| 397 | in add_starts added_starts' end; | |
| 398 | in (chains', lambdas') end; | |
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 399 | |
| 18 | 400 | |
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 401 | (* pretty_gram *) | 
| 18 | 402 | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 403 | fun pretty_gram (Gram {tags, prods, chains, ...}) =
 | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 404 | let | 
| 67531 | 405 | val print_nt = tags_name tags; | 
| 67518 | 406 |     fun print_pri p = if p < 0 then "" else Symbol.make_sup ("(" ^ string_of_int p ^ ")");
 | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 407 | |
| 67552 | 408 | fun pretty_symb (Terminal tok) = | 
| 409 | if Lexicon.is_literal tok | |
| 410 | then Pretty.quote (Pretty.keyword1 (Lexicon.str_of_token tok)) | |
| 411 | else Pretty.str (Lexicon.str_of_token tok) | |
| 67513 | 412 | | pretty_symb (Nonterminal (tag, p)) = Pretty.str (print_nt tag ^ print_pri p); | 
| 18 | 413 | |
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 414 | fun pretty_const "" = [] | 
| 67513 | 415 |       | pretty_const c = [Pretty.str ("\<^bold>\<Rightarrow> " ^ quote c)];
 | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 416 | |
| 67513 | 417 | fun prod_of_chain from = ([Nonterminal (from, ~1)], "", ~1); | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 418 | |
| 67513 | 419 | fun pretty_prod (name, tag) = | 
| 67540 | 420 | (prods_content (#2 (Vector.sub (prods, tag))) @ map prod_of_chain (chains_preds chains tag)) | 
| 67513 | 421 | |> map (fn (symbs, const, p) => | 
| 422 | Pretty.block (Pretty.breaks | |
| 423 | (Pretty.str (name ^ print_pri p ^ " =") :: map pretty_symb symbs @ pretty_const const))); | |
| 67531 | 424 | in maps pretty_prod (tags_content tags) end; | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 425 | |
| 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 426 | |
| 42217 | 427 | |
| 67545 | 428 | (** operations on grammars **) | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 429 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 430 | val empty_gram = | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 431 | Gram | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 432 |    {nt_count = 0,
 | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 433 | prod_count = 0, | 
| 67531 | 434 | tags = tags_empty, | 
| 435 | chains = chains_empty, | |
| 67533 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 436 | lambdas = nts_empty, | 
| 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 437 | prods = Vector.fromList [nt_gram_empty]}; | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 438 | |
| 1438 | 439 | |
| 440 | (*Add productions to a grammar*) | |
| 37684 | 441 | fun extend_gram [] gram = gram | 
| 442 |   | extend_gram xprods (Gram {nt_count, prod_count, tags, chains, lambdas, prods}) =
 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 443 | let | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 444 | (*Get tag for existing nonterminal or create a new one*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 445 | fun get_tag nt_count tags nt = | 
| 67531 | 446 | (case tags_lookup tags nt of | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 447 | SOME tag => (nt_count, tags, tag) | 
| 67531 | 448 | | NONE => (nt_count + 1, tags_insert (nt, nt_count) tags, nt_count)); | 
| 1438 | 449 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 450 | (*Convert symbols to the form used by the parser; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 451 | delimiters and predefined terms are stored as terminals, | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 452 | nonterminals are converted to integer tags*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 453 | fun symb_of [] nt_count tags result = (nt_count, tags, rev result) | 
| 42288 
2074b31650e6
discontinued special treatment of structure Syntax_Ext (formerly Syn_Ext);
 wenzelm parents: 
42282diff
changeset | 454 | | symb_of (Syntax_Ext.Delim s :: ss) nt_count tags result = | 
| 67552 | 455 | symb_of ss nt_count tags (Terminal (Lexicon.literal s) :: result) | 
| 42288 
2074b31650e6
discontinued special treatment of structure Syntax_Ext (formerly Syn_Ext);
 wenzelm parents: 
42282diff
changeset | 456 | | symb_of (Syntax_Ext.Argument (s, p) :: ss) nt_count tags result = | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 457 | let | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 458 | val (nt_count', tags', new_symb) = | 
| 67556 | 459 | (case Lexicon.get_terminal s of | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 460 | NONE => | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 461 | let val (nt_count', tags', s_tag) = get_tag nt_count tags s; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 462 | in (nt_count', tags', Nonterminal (s_tag, p)) end | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 463 | | SOME tk => (nt_count, tags, Terminal tk)); | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 464 | in symb_of ss nt_count' tags' (new_symb :: result) end | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 465 | | symb_of (_ :: ss) nt_count tags result = symb_of ss nt_count tags result; | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 466 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 467 | (*Convert list of productions by invoking symb_of for each of them*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 468 | fun prod_of [] nt_count prod_count tags result = | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 469 | (nt_count, prod_count, tags, result) | 
| 42288 
2074b31650e6
discontinued special treatment of structure Syntax_Ext (formerly Syn_Ext);
 wenzelm parents: 
42282diff
changeset | 470 | | prod_of (Syntax_Ext.XProd (lhs, xsymbs, const, pri) :: ps) | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 471 | nt_count prod_count tags result = | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 472 | let | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 473 | val (nt_count', tags', lhs_tag) = get_tag nt_count tags lhs; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 474 | val (nt_count'', tags'', prods) = symb_of xsymbs nt_count' tags' []; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 475 | in | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 476 | prod_of ps nt_count'' (prod_count + 1) tags'' | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 477 | ((lhs_tag, (prods, const, pri)) :: result) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 478 | end; | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 479 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 480 | val (nt_count', prod_count', tags', xprods') = | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 481 | prod_of xprods nt_count prod_count tags []; | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 482 | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 483 | (*Copy array containing productions of old grammar; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 484 | this has to be done to preserve the old grammar while being able | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 485 | to change the array's content*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 486 | val prods' = | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 487 | let | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 488 | fun get_prod i = | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 489 | if i < nt_count then Vector.sub (prods, i) | 
| 67533 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 490 | else nt_gram_empty; | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 491 | in Array.tabulate (nt_count', get_prod) end; | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 492 | |
| 67543 | 493 | val (chains', lambdas') = | 
| 494 | (chains, lambdas) |> fold (add_production prods') xprods'; | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 495 | in | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 496 | Gram | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 497 |          {nt_count = nt_count',
 | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 498 | prod_count = prod_count', | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 499 | tags = tags', | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 500 | chains = chains', | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 501 | lambdas = lambdas', | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 502 | prods = Array.vector prods'} | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 503 | end; | 
| 18 | 504 | |
| 45632 
b23c42b9f78a
prefer Parser.make_gram over Parser.merge_gram, to approximate n-ary merges on theory import;
 wenzelm parents: 
44102diff
changeset | 505 | fun make_gram xprods = extend_gram xprods empty_gram; | 
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 506 | |
| 18 | 507 | |
| 42217 | 508 | |
| 42374 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 509 | (** parser **) | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 510 | |
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 511 | (* parsetree *) | 
| 18 | 512 | |
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 513 | datatype parsetree = | 
| 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 514 | Node of string * parsetree list | | 
| 37683 | 515 | Tip of Lexicon.token; | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 516 | |
| 42374 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 517 | exception PARSETREE of parsetree; | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 518 | |
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 519 | fun pretty_parsetree parsetree = | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 520 | let | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 521 | fun pretty (Node (c, pts)) = | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 522 |           [Pretty.enclose "(" ")" (Pretty.breaks (Pretty.quote (Pretty.str c) :: maps pretty pts))]
 | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 523 | | pretty (Tip tok) = | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 524 | if Lexicon.valued_token tok then [Pretty.str (Lexicon.str_of_token tok)] else []; | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 525 | in (case pretty parsetree of [prt] => prt | _ => raise PARSETREE parsetree) end; | 
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 526 | |
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 527 | |
| 
b9a6b465da25
clarified pretty_parsetree: suppress literal tokens;
 wenzelm parents: 
42288diff
changeset | 528 | (* parser state *) | 
| 42205 
22f5cc06c419
direct pretty printing of parsetrees -- prevent diagnostic output from crashing due to undeclared entities;
 wenzelm parents: 
41378diff
changeset | 529 | |
| 18 | 530 | type state = | 
| 67533 
f253e5eaf995
clarified types and operations: potentially more efficient add_prods;
 wenzelm parents: 
67532diff
changeset | 531 | nt * int * (*identification and production precedence*) | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 532 | parsetree list * (*already parsed nonterminals on rhs*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 533 | symb list * (*rest of rhs*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 534 | string * (*name of production*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 535 | int; (*index for previous state list*) | 
| 18 | 536 | |
| 537 | ||
| 38713 | 538 | (*Get all rhss with precedence >= min_prec*) | 
| 42217 | 539 | fun get_RHS min_prec = filter (fn (_, _, prec: int) => prec >= min_prec); | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 540 | |
| 38713 | 541 | (*Get all rhss with precedence >= min_prec and < max_prec*) | 
| 542 | fun get_RHS' min_prec max_prec = | |
| 42217 | 543 | filter (fn (_, _, prec: int) => prec >= min_prec andalso prec < max_prec); | 
| 18 | 544 | |
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 545 | (*Make states using a list of rhss*) | 
| 62669 | 546 | fun mk_states i lhs_ID rhss = | 
| 38713 | 547 | let fun mk_state (rhs, id, prod_prec) = (lhs_ID, prod_prec, [], rhs, id, i); | 
| 548 | in map mk_state rhss end; | |
| 697 
40f72ab196f8
changed warning for extremely ambiguous expressions
 clasohm parents: 
682diff
changeset | 549 | |
| 15752 | 550 | (*Add parse tree to list and eliminate duplicates | 
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 551 | saving the maximum precedence*) | 
| 42217 | 552 | fun conc (t: parsetree list, prec: int) [] = (NONE, [(t, prec)]) | 
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 553 | | conc (t, prec) ((t', prec') :: ts) = | 
| 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 554 | if t = t' then | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 555 | (SOME prec', | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 556 | if prec' >= prec then (t', prec') :: ts | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 557 | else (t, prec) :: ts) | 
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 558 | else | 
| 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 559 | let val (n, ts') = conc (t, prec) ts | 
| 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 560 | in (n, (t', prec') :: ts') end; | 
| 18 | 561 | |
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 562 | (*Update entry in used*) | 
| 42226 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 563 | fun update_trees (A, t) used = | 
| 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 564 | let | 
| 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 565 | val (i, ts) = the (Inttab.lookup used A); | 
| 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 566 | val (n, ts') = conc t ts; | 
| 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 567 | in (n, Inttab.update (A, (i, ts')) used) end; | 
| 18 | 568 | |
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 569 | (*Replace entry in used*) | 
| 42226 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 570 | fun update_prec (A, prec) = | 
| 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 571 | Inttab.map_entry A (fn (_, ts) => (prec, ts)); | 
| 18 | 572 | |
| 42220 | 573 | fun getS A max_prec NONE Si = | 
| 574 | filter | |
| 575 | (fn (_, _, _, Nonterminal (B, prec) :: _, _, _) => A = B andalso prec <= max_prec | |
| 576 | | _ => false) Si | |
| 577 | | getS A max_prec (SOME min_prec) Si = | |
| 578 | filter | |
| 579 | (fn (_, _, _, Nonterminal (B, prec) :: _, _, _) => | |
| 580 | A = B andalso prec > min_prec andalso prec <= max_prec | |
| 581 | | _ => false) Si; | |
| 18 | 582 | |
| 62669 | 583 | fun get_states Estate j A max_prec = | 
| 33317 | 584 | filter | 
| 38713 | 585 | (fn (_, _, _, Nonterminal (B, prec) :: _, _, _) => A = B andalso prec <= max_prec | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 586 | | _ => false) | 
| 62669 | 587 | (Array.sub (Estate, j)); | 
| 18 | 588 | |
| 589 | ||
| 42219 | 590 | fun movedot_term c (A, j, ts, Terminal a :: sa, id, i) = | 
| 42282 
4fa41e068ff0
report literal tokens according to parsetree head;
 wenzelm parents: 
42226diff
changeset | 591 | if Lexicon.valued_token c orelse id <> "" | 
| 
4fa41e068ff0
report literal tokens according to parsetree head;
 wenzelm parents: 
42226diff
changeset | 592 | then (A, j, Tip c :: ts, sa, id, i) | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 593 | else (A, j, ts, sa, id, i); | 
| 18 | 594 | |
| 42219 | 595 | fun movedot_nonterm tt (A, j, ts, Nonterminal _ :: sa, id, i) = | 
| 42282 
4fa41e068ff0
report literal tokens according to parsetree head;
 wenzelm parents: 
42226diff
changeset | 596 | (A, j, tt @ ts, sa, id, i); | 
| 18 | 597 | |
| 42219 | 598 | fun movedot_lambda [] _ = [] | 
| 42221 | 599 | | movedot_lambda ((t, ki) :: ts) (state as (B, j, tss, Nonterminal (A, k) :: sa, id, i)) = | 
| 42282 
4fa41e068ff0
report literal tokens according to parsetree head;
 wenzelm parents: 
42226diff
changeset | 600 | if k <= ki then (B, j, t @ tss, sa, id, i) :: movedot_lambda ts state | 
| 42221 | 601 | else movedot_lambda ts state; | 
| 18 | 602 | |
| 603 | ||
| 41378 | 604 | (*trigger value for warnings*) | 
| 69575 | 605 | val branching_level = Config.declare_int ("syntax_branching_level", \<^here>) (K 600);
 | 
| 18 | 606 | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 607 | (*get all productions of a NT and NTs chained to it which can | 
| 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 608 | be started by specified token*) | 
| 67540 | 609 | fun prods_for (Gram {prods, chains, ...}) tk nt =
 | 
| 37683 | 610 | let | 
| 67540 | 611 | fun token_prods prods = | 
| 612 | fold cons (prods_lookup prods tk) #> | |
| 67550 | 613 | fold cons (prods_lookup prods Lexicon.dummy); | 
| 67540 | 614 | fun nt_prods nt = #2 (Vector.sub (prods, nt)); | 
| 615 | in fold (token_prods o nt_prods) (chains_all_preds chains [nt]) [] end; | |
| 1147 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 616 | |
| 
57b5f55bf879
removed 'raw' productions from gram datatype; replaced mk_gram by add_prods;
 clasohm parents: 
890diff
changeset | 617 | |
| 67515 | 618 | fun PROCESSS gram Estate i c states = | 
| 37683 | 619 | let | 
| 62669 | 620 | fun processS _ [] (Si, Sii) = (Si, Sii) | 
| 37683 | 621 | | processS used (S :: States) (Si, Sii) = | 
| 622 | (case S of | |
| 38713 | 623 | (_, _, _, Nonterminal (nt, min_prec) :: _, _, _) => | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 624 | let (*predictor operation*) | 
| 37683 | 625 | val (used', new_states) = | 
| 42226 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 626 | (case Inttab.lookup used nt of | 
| 38713 | 627 | SOME (used_prec, l) => (*nonterminal has been processed*) | 
| 628 | if used_prec <= min_prec then | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 629 | (*wanted precedence has been processed*) | 
| 42219 | 630 | (used, movedot_lambda l S) | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 631 | else (*wanted precedence hasn't been parsed yet*) | 
| 37683 | 632 | let | 
| 67540 | 633 | val tk_prods = prods_for gram c nt; | 
| 42217 | 634 | val States' = | 
| 62669 | 635 | mk_states i nt (get_RHS' min_prec used_prec tk_prods); | 
| 42219 | 636 | in (update_prec (nt, min_prec) used, movedot_lambda l S @ States') end | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 637 | | NONE => (*nonterminal is parsed for the first time*) | 
| 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 638 | let | 
| 67540 | 639 | val tk_prods = prods_for gram c nt; | 
| 62669 | 640 | val States' = mk_states i nt (get_RHS min_prec tk_prods); | 
| 42226 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 641 | in (Inttab.update (nt, (min_prec, [])) used, States') end); | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 642 | in | 
| 37683 | 643 | processS used' (new_states @ States) (S :: Si, Sii) | 
| 15752 | 644 | end | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 645 | | (_, _, _, Terminal a :: _, _, _) => (*scanner operation*) | 
| 37683 | 646 | processS used States | 
| 67549 | 647 | (S :: Si, | 
| 648 | if Lexicon.tokens_match_ord (a, c) = EQUAL then movedot_term c S :: Sii else Sii) | |
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 649 | | (A, prec, ts, [], id, j) => (*completer operation*) | 
| 42222 
ff50c436b199
accumulate parsetrees in canonical reverse order;
 wenzelm parents: 
42221diff
changeset | 650 | let val tt = if id = "" then ts else [Node (id, rev ts)] in | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 651 | if j = i then (*lambda production?*) | 
| 37683 | 652 | let | 
| 42226 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 653 | val (prec', used') = update_trees (A, (tt, prec)) used; | 
| 42220 | 654 | val Slist = getS A prec prec' Si; | 
| 655 | val States' = map (movedot_nonterm tt) Slist; | |
| 656 | in processS used' (States' @ States) (S :: Si, Sii) end | |
| 37683 | 657 | else | 
| 62669 | 658 | let val Slist = get_states Estate j A prec | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 659 | in processS used (map (movedot_nonterm tt) Slist @ States) (S :: Si, Sii) end | 
| 37683 | 660 | end) | 
| 42226 
cb650789f2f0
use standard tables with standard argument order;
 wenzelm parents: 
42222diff
changeset | 661 | in processS Inttab.empty states ([], []) end; | 
| 18 | 662 | |
| 663 | ||
| 67515 | 664 | fun produce gram stateset i indata prev_token = | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 665 | (case Array.sub (stateset, i) of | 
| 25986 
26f1e4c172c3
syntax error: reduced spam -- print expected nonterminals instead of terminals;
 wenzelm parents: 
24245diff
changeset | 666 | [] => | 
| 
26f1e4c172c3
syntax error: reduced spam -- print expected nonterminals instead of terminals;
 wenzelm parents: 
24245diff
changeset | 667 | let | 
| 37683 | 668 | val toks = if Lexicon.is_eof prev_token then indata else prev_token :: indata; | 
| 48992 | 669 | val pos = Position.here (Lexicon.pos_of_token prev_token); | 
| 27801 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 670 | in | 
| 55624 | 671 | if null toks then | 
| 672 |           error ("Inner syntax error: unexpected end of input" ^ pos)
 | |
| 673 | else | |
| 674 |           error ("Inner syntax error" ^ pos ^
 | |
| 675 | Markup.markup Markup.no_report | |
| 676 |               ("\n" ^ Pretty.string_of
 | |
| 677 | (Pretty.block [ | |
| 678 | Pretty.str "at", Pretty.brk 1, | |
| 679 | Pretty.block | |
| 680 | (Pretty.str "\"" :: | |
| 681 | Pretty.breaks (map (Pretty.str o Lexicon.str_of_token) (#1 (split_last toks))) @ | |
| 682 | [Pretty.str "\""])]))) | |
| 27801 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 683 | end | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 684 | | s => | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 685 | (case indata of | 
| 42218 
8e0a4d500e5b
streamlined token list operations, assuming that the order of union does not matter;
 wenzelm parents: 
42217diff
changeset | 686 | [] => s | 
| 42217 | 687 | | c :: cs => | 
| 688 | let | |
| 67515 | 689 | val (si, sii) = PROCESSS gram stateset i c s; | 
| 42217 | 690 | val _ = Array.update (stateset, i, si); | 
| 691 | val _ = Array.update (stateset, i + 1, sii); | |
| 67515 | 692 | in produce gram stateset (i + 1) cs c end)); | 
| 18 | 693 | |
| 694 | ||
| 42217 | 695 | fun get_trees states = map_filter (fn (_, _, [pt], _, _, _) => SOME pt | _ => NONE) states; | 
| 18 | 696 | |
| 67515 | 697 | fun earley (gram as Gram {tags, ...}) startsymbol indata =
 | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 698 | let | 
| 37683 | 699 | val start_tag = | 
| 67531 | 700 | (case tags_lookup tags startsymbol of | 
| 37683 | 701 | SOME tag => tag | 
| 40959 | 702 |       | NONE => error ("Inner syntax: bad grammar root symbol " ^ quote startsymbol));
 | 
| 37683 | 703 | val S0 = [(~1, 0, [], [Nonterminal (start_tag, 0), Terminal Lexicon.eof], "", 0)]; | 
| 330 
2fda15dd1e0f
changed the way a grammar is generated to allow the new parser to work;
 clasohm parents: 
258diff
changeset | 704 | val s = length indata + 1; | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 705 | val Estate = Array.array (s, []); | 
| 38712 
f7688fd819a8
some attempts to recover Isabelle/ML style from the depths of time;
 wenzelm parents: 
38711diff
changeset | 706 | val _ = Array.update (Estate, 0, S0); | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 707 | in | 
| 67515 | 708 | get_trees (produce gram Estate 0 indata Lexicon.eof) | 
| 237 
a7d3e712767a
MAJOR INTERNAL CHANGE: extend and merge operations of syntax tables
 wenzelm parents: 
46diff
changeset | 709 | end; | 
| 18 | 710 | |
| 711 | ||
| 67515 | 712 | fun parse gram start toks = | 
| 27801 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 713 | let | 
| 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 714 | val end_pos = | 
| 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 715 | (case try List.last toks of | 
| 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 716 | NONE => Position.none | 
| 67551 | 717 | | SOME tok => Lexicon.end_pos_of_token tok); | 
| 27801 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 718 | val r = | 
| 67515 | 719 | (case earley gram start (toks @ [Lexicon.mk_eof end_pos]) of | 
| 37852 
a902f158b4fc
eliminated old-style sys_error/SYS_ERROR in favour of exception Fail -- after careful checking that there is no overlap with existing handling of that;
 wenzelm parents: 
37684diff
changeset | 720 | [] => raise Fail "Inner syntax: no parse trees" | 
| 27801 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 721 | | pts => pts); | 
| 
0d0adaf0228d
datatype token: maintain range, tuned representation;
 wenzelm parents: 
26678diff
changeset | 722 | in r end; | 
| 26678 | 723 | |
| 18 | 724 | end; |