diff -r b35851cafd3e -r c9ec452ff08f src/Pure/Syntax/lexicon.ML
--- a/src/Pure/Syntax/lexicon.ML	Fri Oct 01 13:26:22 1993 +0100
+++ b/src/Pure/Syntax/lexicon.ML	Mon Oct 04 15:30:49 1993 +0100
@@ -1,251 +1,388 @@
-(*  Title:      Lexicon
+(*  Title:      Pure/Syntax/lexicon.ML
     ID:         $Id$
-    Author:     Tobias Nipkow, TU Muenchen
-    Copyright   1993  TU Muenchen
+    Author:     Tobias Nipkow and Markus Wenzel, TU Muenchen
 
-The Isabelle Lexer
+Scanner combinators and Isabelle's main lexer (used for terms and typs).
+*)
 
-Changes:
-TODO:
-  Lexicon -> lexicon, Token -> token
-  end_token -> EndToken ?
-*)
+infix 5 -- ^^;
+infix 3 >>;
+infix 0 ||;
 
 signature LEXICON0 =
 sig
   val is_identifier: string -> bool
+  val string_of_vname: indexname -> string
   val scan_varname: string list -> indexname * string list
-  val string_of_vname: indexname -> string
+  val scan_var: string -> term
 end;
 
 signature LEXICON =
 sig
-  type Lexicon
-  datatype Token = Token of string
-                 | IdentSy of string
-                 | VarSy of string * int
-                 | TFreeSy of string
-                 | TVarSy of string * int
-                 | end_token;
-  val empty: Lexicon
-  val extend: Lexicon * string list -> Lexicon
-  val matching_tokens: Token * Token -> bool
-  val mk_lexicon: string list -> Lexicon
-  val name_of_token: Token -> string
-  val predef_term: string -> Token
+  include LEXICON0
+  val is_xid: string -> bool
+  val is_tfree: string -> bool
+  type lexicon
+  datatype token =
+    Token of string |
+    IdentSy of string |
+    VarSy of string |
+    TFreeSy of string |
+    TVarSy of string |
+    EndToken;
+  val id: string
+  val var: string
+  val tfree: string
+  val tvar: string
+  val str_of_token: token -> string
+  val display_token: token -> string
+  val matching_tokens: token * token -> bool
+  val valued_token: token -> bool
+  val predef_term: string -> token
   val is_terminal: string -> bool
-  val tokenize: Lexicon -> string -> Token list
-  val token_to_string: Token -> string
-  val valued_token: Token -> bool
-  type 'b TokenMap
-  val mkTokenMap: ('b * Token list) list -> 'b TokenMap
-  val applyTokenMap: 'b TokenMap * Token -> 'b list
-  include LEXICON0
+  val empty_lexicon: lexicon
+  val extend_lexicon: lexicon -> string list -> lexicon
+  val mk_lexicon: string list -> lexicon
+  val tokenize: lexicon -> bool -> string -> token list
+
+  exception LEXICAL_ERROR
+  val >> : ('a -> 'b * 'c) * ('b -> 'd) -> 'a -> 'd * 'c
+  val || : ('a -> 'b) * ('a -> 'b) -> 'a -> 'b
+  val -- : ('a -> 'b * 'c) * ('c -> 'd * 'e) -> 'a -> ('b * 'd) * 'e
+  val ^^ : ('a -> string * 'b) * ('b -> string * 'c) -> 'a -> string * 'c
+  val $$ : ''a -> ''a list -> ''a * ''a list
+  val scan_empty: 'a list -> 'b list * 'a list
+  val scan_one: ('a -> bool) -> 'a list -> 'a * 'a list
+  val scan_any: ('a -> bool) -> 'a list -> 'a list * 'a list
+  val scan_any1: ('a -> bool) -> 'a list -> 'a list * 'a list
+  val scan_end: 'a list -> 'b list * 'a list
+  val optional: ('a -> 'b * 'a) -> 'a -> 'b option * 'a
+  val repeat: ('a -> 'b * 'a) -> 'a -> 'b list * 'a
+  val repeat1: ('a -> 'b * 'a) -> 'a -> 'b list * 'a
 end;
 
-functor LexiconFun(Extension: EXTENSION): LEXICON =
+functor LexiconFun(): LEXICON =
 struct
-open Extension;
+
+(** is_identifier etc. **)
+
+fun is_ident [] = false
+  | is_ident (c :: cs) = is_letter c andalso forall is_letdig cs;
+
+val is_identifier = is_ident o explode;
+
+fun is_xid s =
+  (case explode s of
+    "_" :: cs => is_ident cs
+  | cs => is_ident cs);
+
+fun is_tfree s =
+  (case explode s of
+    "'" :: cs => is_ident cs
+  | _ => false);
+
 
-datatype Token = Token of string
-               | IdentSy of string
-               | VarSy of string * int
-               | TFreeSy of string
-               | TVarSy of string * int
-               | end_token;
-val no_token = "";
-
-datatype dfa = Tip | Branch of string * string * dfa * dfa * dfa;
-
-type Lexicon = dfa;
-
-fun is_id(c::cs) = is_letter(c) andalso forall is_letdig cs
-  | is_id([]) = false;
-
-fun is_identifier s = is_id(explode s);
+(** string_of_vname **)
 
-val empty = Tip;
+fun string_of_vname (x, i) =
+  let
+    val si = string_of_int i;
+  in
+    if is_digit (last_elem (explode x)) then "?" ^ x ^ "." ^ si
+    else if i = 0 then "?" ^ x
+    else "?" ^ x ^ si
+  end;
 
-fun extend1(dfa, s) =
-  let fun add(Branch(c, a, less, eq, gr), cs as (d::ds)) =
-        if c>d then Branch(c, a, add(less, cs), eq, gr) else
-        if cd then next_dfa(gr, c) else Some(a, eq);
-
-exception ID of string * string list;
-
-val eof_id = "End of input - identifier expected.\n";
+datatype token =
+  Token of string |
+  IdentSy of string |
+  VarSy of string |
+  TFreeSy of string |
+  TVarSy of string |
+  EndToken;
 
-(*A string of letters, digits, or ' _ *)
-fun xscan_ident exn =
-let fun scan [] = raise exn(eof_id, [])
-      | scan(c::cs) =
-          if is_letter c
-          then let val (ds, tail) = take_prefix is_letdig cs
-               in (implode(c::ds), tail) end
-          else raise exn("Identifier expected: ", c::cs)
-in scan end;
 
-(*Scan the offset of a Var, if present; otherwise ~1 *)
-fun scan_offset cs = case cs of
-    ("."::[]) => (~1, cs)
-  | ("."::(ds as c::cs')) => if is_digit c then scan_int ds else (~1, cs)
-  | _ => (~1, cs);
-
-fun split_varname s =
-  let val (rpost, rpref) = take_prefix is_digit (rev(explode s))
-      val (i, _) = scan_int(rev rpost)
-  in (implode(rev rpref), i) end;
+(* names of valued tokens *)
 
-fun xscan_varname exn cs : (string*int) * string list =
-let val (a, ds) = xscan_ident exn cs;
-    val (i, es) = scan_offset ds
-in if i = ~1 then (split_varname a, es) else ((a, i), es) end;
+val id = "id";
+val var = "var";
+val tfree = "tfree";
+val tvar = "tvar";
 
-fun scan_varname s = xscan_varname ID s
-                     handle ID(err, cs) => error(err^(implode cs));
 
-fun tokenize (dfa) (s:string) : Token list =
-let exception LEX_ERR;
-    exception FAIL of string * string list;
-    val lexerr = "Lexical error: ";
+(* str_of_token *)
 
-    fun tokenize1 (_:dfa, []:string list) : Token * string list =
-          raise LEX_ERR
-      | tokenize1(dfa, c::cs) =
-          case next_dfa(dfa, c) of
-            None => raise LEX_ERR
-          | Some(a, dfa') =>
-              if a=no_token then tokenize1(dfa', cs)
-              else (tokenize1(dfa', cs) handle LEX_ERR =>
-                      if is_identifier a andalso not(null cs) andalso
-                         is_letdig(hd cs)
-                      then raise LEX_ERR else (Token(a), cs));
+fun str_of_token (Token s) = s
+  | str_of_token (IdentSy s) = s
+  | str_of_token (VarSy s) = s
+  | str_of_token (TFreeSy s) = s
+  | str_of_token (TVarSy s) = s
+  | str_of_token EndToken = "";
 
-    fun token(cs) = tokenize1(dfa, cs) handle LEX_ERR => raise FAIL(lexerr, cs);
+
+(* display_token *)
 
-    fun id([]) = raise FAIL(eof_id, [])
-      | id(cs as c::cs') =
-          if is_letter(c)
-          then let val (id, cs'') = xscan_ident FAIL cs
-               in (IdentSy(id), cs'') end
-          else
-          if c = "?"
-          then case cs' of
-              "'"::xs => let val ((a, i), ys) = xscan_varname FAIL xs
-                         in (TVarSy("'"^a, i), ys) end
-            | _ => let val ((a, i), ys) = xscan_varname FAIL cs'
-                   in (VarSy(a, i), ys) end
-          else
-          if c = "'"
-          then let val (a, cs'') = xscan_ident FAIL cs'
-               in (TFreeSy("'" ^ a), cs'') end
-          else raise FAIL(lexerr, cs);
+fun display_token (Token s) = quote s
+  | display_token (IdentSy s) = "id(" ^ s ^ ")"
+  | display_token (VarSy s) = "var(" ^ s ^ ")"
+  | display_token (TFreeSy s) = "tfree(" ^ s ^ ")"
+  | display_token (TVarSy s) = "tvar(" ^ s ^ ")"
+  | display_token EndToken = "";
 
-    fun tknize([], ts) = rev(ts)
-      | tknize(cs as c::cs', ts) =
-          if is_blank(c) then tknize(cs', ts) else
-          let val (t, cs'') =
-                if c="?" then id(cs) handle FAIL _ => token(cs)
-                else (token(cs) handle FAIL _ => id(cs))
-          in tknize(cs'', t::ts) end
+
+(* matching_tokens *)
 
-in tknize(explode s, []) handle FAIL(s, cs) => error(s^(implode cs)) end;
+fun matching_tokens (Token x, Token y) = (x = y)
+  | matching_tokens (IdentSy _, IdentSy _) = true
+  | matching_tokens (VarSy _, VarSy _) = true
+  | matching_tokens (TFreeSy _, TFreeSy _) = true
+  | matching_tokens (TVarSy _, TVarSy _) = true
+  | matching_tokens (EndToken, EndToken) = true
+  | matching_tokens _ = false;
 
-(*Variables have the form ?nameidx, or ?name.idx if "name" ends with a digit*)
-fun string_of_vname (a, idx) = case rev(explode a) of
-    [] => "?NULL_VARIABLE_NAME"
-  | c::_ => "?" ^
-      (if is_digit c then a ^ "." ^ string_of_int idx
-       else if idx = 0 then a
-       else a ^ string_of_int idx);
 
-fun token_to_string (Token(s)) = s
-  | token_to_string (IdentSy(s)) = s
-  | token_to_string (VarSy v) = string_of_vname v
-  | token_to_string (TFreeSy a) = a
-  | token_to_string (TVarSy v) = string_of_vname v
-  | token_to_string end_token = "\n";
+(* valued_token *)
 
-(* MMW *)
-fun name_of_token (Token s) = quote s
-  | name_of_token (IdentSy _) = id
-  | name_of_token (VarSy _) = var
-  | name_of_token (TFreeSy _) = tfree
-  | name_of_token (TVarSy _) = tvar;
+fun valued_token (Token _) = false
+  | valued_token (IdentSy _) = true
+  | valued_token (VarSy _) = true
+  | valued_token (TFreeSy _) = true
+  | valued_token (TVarSy _) = true
+  | valued_token EndToken = false;
 
-fun matching_tokens(Token(i), Token(j)) = (i=j) |
-    matching_tokens(IdentSy(_), IdentSy(_)) = true |
-    matching_tokens(VarSy _, VarSy _) = true |
-    matching_tokens(TFreeSy _, TFreeSy _) = true |
-    matching_tokens(TVarSy _, TVarSy _) = true |
-    matching_tokens(end_token, end_token) = true |
-    matching_tokens(_, _) = false;
 
-fun valued_token(end_token) = false
-  | valued_token(Token _) = false
-  | valued_token(IdentSy _) = true
-  | valued_token(VarSy _) = true
-  | valued_token(TFreeSy _) = true
-  | valued_token(TVarSy _) = true;
+(* predef_term *)
 
-(* MMW *)
 fun predef_term name =
   if name = id then IdentSy name
-  else if name = var then VarSy (name, 0)
+  else if name = var then VarSy name
   else if name = tfree then TFreeSy name
-  else if name = tvar then TVarSy (name, 0)
-  else end_token;
+  else if name = tvar then TVarSy name
+  else EndToken;
 
-(* MMW *)
+
+(* is_terminal **)
+
 fun is_terminal s = s mem [id, var, tfree, tvar];
 
-type 'b TokenMap = (Token * 'b list) list * 'b list;
-val first_token = 0;
+
+(** datatype lexicon **)
+
+datatype lexicon =
+  Empty |
+  Branch of string * string * lexicon * lexicon * lexicon;
+
+val no_token = "";
+
+
+(* empty_lexicon *)
+
+val empty_lexicon = Empty;
+
+
+(* extend_lexicon *)
+
+fun extend_lexicon lexicon strs =
+  let
+    fun ext (lex, s) =
+      let
+        fun add (Branch (d, a, lt, eq, gt)) (chs as c :: cs) =
+              if c < d then Branch (d, a, add lt chs, eq, gt)
+              else if c > d then Branch (d, a, lt, eq, add gt chs)
+              else Branch (d, if null cs then s else a, lt, add eq cs, gt)
+          | add Empty [c] =
+              Branch (c, s, Empty, Empty, Empty)
+          | add Empty (c :: cs) =
+              Branch (c, no_token, Empty, add Empty cs, Empty)
+          | add lex [] = lex;
+
+        val cs = explode s;
+      in
+        if exists is_blank cs then
+          error ("extend_lexicon: blank in literal " ^ quote s)
+        else add lex cs
+      end;
+  in
+    foldl ext (lexicon, strs)
+  end;
+
+
+(* mk_lexicon *)
+
+val mk_lexicon = extend_lexicon empty_lexicon;
+
+
+
+(** scanners **)
+
+exception LEXICAL_ERROR;
+
 
-fun int_of_token(Token(tk)) = first_token |
-    int_of_token(IdentSy _) = first_token - 1 |
-    int_of_token(VarSy _) = first_token - 2 |
-    int_of_token(TFreeSy _) = first_token - 3 |
-    int_of_token(TVarSy _) = first_token - 4 |
-    int_of_token(end_token) = first_token - 5;
+(* scanner combinators *)
+
+fun (scan >> f) cs = apfst f (scan cs);
+
+fun (scan1 || scan2) cs = scan1 cs handle LEXICAL_ERROR => scan2 cs;
+
+fun (scan1 -- scan2) cs =
+  let
+    val (x, cs') = scan1 cs;
+    val (y, cs'') = scan2 cs';
+  in
+    ((x, y), cs'')
+  end;
+
+fun (scan1 ^^ scan2) = scan1 -- scan2 >> op ^;
+
+
+(* generic scanners *)
+
+fun $$ _ [] = raise LEXICAL_ERROR
+  | $$ a (c :: cs) =
+      if a = c then (c, cs) else raise LEXICAL_ERROR;
+
+fun scan_empty cs = ([], cs);
 
-fun lesstk(s, t) = int_of_token(s) < int_of_token(t) orelse
-      (case (s, t) of (Token(a), Token(b)) => a<b | _ => false);
+fun scan_one _ [] = raise LEXICAL_ERROR
+  | scan_one pred (c :: cs) =
+      if pred c then (c, cs) else raise LEXICAL_ERROR;
+
+val scan_any = take_prefix;
+
+fun scan_any1 pred = scan_one pred -- scan_any pred >> op ::;
+
+fun scan_rest cs = (cs, []);
+
+fun scan_end [] = ([], [])
+  | scan_end _ = raise LEXICAL_ERROR;
+
+fun optional scan = scan >> Some || scan_empty >> K None;
+
+fun repeat scan cs = (scan -- repeat scan >> op :: || scan_empty) cs;
+
+fun repeat1 scan = scan -- repeat scan >> op ::;
+
+
+(* other scanners *)
+
+val scan_letter_letdigs = scan_one is_letter -- scan_any is_letdig >> op ::;
+
+val scan_digits1 = scan_any1 is_digit;
+
+val scan_id = scan_letter_letdigs >> implode;
 
-fun mkTokenMap(atll) =
-  let val aill = atll;
-      val dom = sort lesstk (distinct(flat(map snd aill)));
-      val mt = map fst (filter (null o snd) atll);
-      fun mktm(i) =
-        let fun add(l, (a, il)) = if i mem il then a::l else l
-        in (i, foldl add ([], aill)) end;
-  in (map mktm dom, mt) end;
+val scan_id_nat =
+  scan_id ^^ ($$ "." ^^ (scan_digits1 >> implode) || scan_empty >> K "");
+
+
+(* scan_literal *)
+
+fun scan_literal lex chrs =
+  let
+    fun scan_lit _ s_cs [] = s_cs
+      | scan_lit Empty s_cs _ = s_cs
+      | scan_lit (Branch (d, a, lt, eq, gt)) s_cs (chs as c :: cs) =
+          if c < d then scan_lit lt s_cs chs
+          else if c > d then scan_lit gt s_cs chs
+          else scan_lit eq (if a = no_token then s_cs else Some (a, cs)) cs;
+  in
+    (case scan_lit lex None chrs of
+      None => raise LEXICAL_ERROR
+    | Some s_cs => s_cs)
+  end;
+
+
+
+(** tokenize **)
+
+fun tokenize lex xids str =
+  let
+    val scan_xid =
+      if xids then $$ "_" ^^ scan_id || scan_id
+      else scan_id;
+
+    val scan_lit = scan_literal lex >> pair Token;
+
+    val scan_ident =
+      $$ "?" ^^ $$ "'" ^^ scan_id_nat >> pair TVarSy ||
+      $$ "?" ^^ scan_id_nat >> pair VarSy ||
+      $$ "'" ^^ scan_id >> pair TFreeSy ||
+      scan_xid >> pair IdentSy;
+
+    fun scan_max_token cs =
+      (case (optional scan_lit cs, optional scan_ident cs) of
+        (tok1, (None, _)) => tok1
+      | ((None, _), tok2) => tok2
+      | (tok1 as (Some (_, s1), _), tok2 as (Some (_, s2), _)) =>
+          if size s1 >= size s2 then tok1 else tok2);
 
-fun find_al (i) =
-  let fun find((j, al)::l) = if lesstk(i, j) then [] else
-                             if matching_tokens(i, j) then al else find l |
-        find [] = [];
-  in find end;
-fun applyTokenMap((l, mt), tk:Token) = mt@(find_al tk l);
+    fun scan_tokens [] rev_toks = rev (EndToken :: rev_toks)
+      | scan_tokens (chs as c :: cs) rev_toks =
+          if is_blank c then scan_tokens cs rev_toks
+          else
+            (case scan_max_token chs of
+              (None, _) => error ("Lexical error at: " ^ quote (implode chs))
+            | (Some (tk, s), chs') => scan_tokens chs' (tk s :: rev_toks));
+  in
+    scan_tokens (explode str) []
+  end;
+
+
+
+(** scan variables **)
+
+(* scan_vname *)
+
+fun scan_vname chrs =
+  let
+    fun nat_of_chs n [] = n
+      | nat_of_chs n (c :: cs) = nat_of_chs (n * 10 + (ord c - ord "0")) cs;
+
+    val nat_of = nat_of_chs 0;
+
+    fun split_vname chs =
+      let val (cs, ds) = take_suffix is_digit chs
+      in (implode cs, nat_of ds) end
+
+    val scan =
+      scan_letter_letdigs --
+        ($$ "." -- scan_digits1 >> (nat_of o #2) || scan_empty >> K ~1);
+  in
+    (case scan chrs of
+      ((cs, ~1), cs') => (split_vname cs, cs')
+    | ((cs, i), cs') => ((implode cs, i), cs'))
+  end;
+
+
+(* scan_varname *)
+
+fun scan_varname chs =
+  scan_vname chs handle LEXICAL_ERROR
+    => error ("scan_varname: bad varname " ^ quote (implode chs));
+
+
+(* scan_var *)
+
+fun scan_var str =
+  let
+    fun tvar (x, i) = Var (("'" ^ x, i), dummyT);
+    fun var x_i = Var (x_i, dummyT);
+    fun free x = Free (x, dummyT);
+
+    val scan =
+      $$ "?" -- $$ "'" -- scan_vname -- scan_end >> (tvar o #2 o #1) ||
+      $$ "?" -- scan_vname -- scan_end >> (var o #2 o #1) ||
+      scan_rest >> (free o implode);
+  in
+    #1 (scan (explode str))
+  end;
 
 end;
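
(*A minimal sketch, not part of the changeset above: it shows how the
  scanner combinators introduced in the new lexicon.ML compose into a small
  scanner.  The combinators are restated here in SML'97 terms so the example
  is self-contained (characters are of type char rather than one-character
  strings as in the code above); scan_nat is a hypothetical example scanner,
  not taken from the original source.*)

infix 5 --;
infix 3 >>;
infix 0 ||;

exception LEXICAL_ERROR;

(*apply f to the result of a successful scan*)
fun (scan >> f) cs = let val (x, cs') = scan cs in (f x, cs') end;

(*try scan1, fall back to scan2 on failure*)
fun (scan1 || scan2) cs = scan1 cs handle LEXICAL_ERROR => scan2 cs;

(*sequential composition of two scans*)
fun (scan1 -- scan2) cs =
  let
    val (x, cs') = scan1 cs;
    val (y, cs'') = scan2 cs';
  in ((x, y), cs'') end;

(*scan a single character satisfying pred*)
fun scan_one _ [] = raise LEXICAL_ERROR
  | scan_one pred (c :: cs) =
      if pred c then (c, cs) else raise LEXICAL_ERROR;

(*zero or more / one or more repetitions*)
fun repeat scan cs = (scan -- repeat scan >> op :: || (fn rest => ([], rest))) cs;
fun repeat1 scan = scan -- repeat scan >> op ::;

(*example: a scanner for natural numbers built from the combinators*)
val scan_nat =
  repeat1 (scan_one Char.isDigit)
  >> (foldl (fn (d, n) => 10 * n + (Char.ord d - Char.ord #"0")) 0);

(*e.g. scan_nat (explode "42abc") = (42, [#"a", #"b", #"c"])*)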