--- a/src/Pure/Syntax/lexicon.ML Fri Oct 01 13:26:22 1993 +0100
+++ b/src/Pure/Syntax/lexicon.ML Mon Oct 04 15:30:49 1993 +0100
@@ -1,251 +1,388 @@
-(* Title: Lexicon
+(* Title: Pure/Syntax/lexicon.ML
ID: $Id$
- Author: Tobias Nipkow, TU Muenchen
- Copyright 1993 TU Muenchen
+ Author: Tobias Nipkow and Markus Wenzel, TU Muenchen
-The Isabelle Lexer
+Scanner combinators and Isabelle's main lexer (used for terms and typs).
+*)
-Changes:
-TODO:
- Lexicon -> lexicon, Token -> token
- end_token -> EndToken ?
-*)
+infix 5 -- ^^;
+infix 3 >>;
+infix 0 ||;
signature LEXICON0 =
sig
val is_identifier: string -> bool
+ val string_of_vname: indexname -> string
val scan_varname: string list -> indexname * string list
- val string_of_vname: indexname -> string
+ val scan_var: string -> term
end;
signature LEXICON =
sig
- type Lexicon
- datatype Token = Token of string
- | IdentSy of string
- | VarSy of string * int
- | TFreeSy of string
- | TVarSy of string * int
- | end_token;
- val empty: Lexicon
- val extend: Lexicon * string list -> Lexicon
- val matching_tokens: Token * Token -> bool
- val mk_lexicon: string list -> Lexicon
- val name_of_token: Token -> string
- val predef_term: string -> Token
+ include LEXICON0
+ val is_xid: string -> bool
+ val is_tfree: string -> bool
+ type lexicon
+ datatype token =
+ Token of string |
+ IdentSy of string |
+ VarSy of string |
+ TFreeSy of string |
+ TVarSy of string |
+ EndToken;
+ val id: string
+ val var: string
+ val tfree: string
+ val tvar: string
+ val str_of_token: token -> string
+ val display_token: token -> string
+ val matching_tokens: token * token -> bool
+ val valued_token: token -> bool
+ val predef_term: string -> token
val is_terminal: string -> bool
- val tokenize: Lexicon -> string -> Token list
- val token_to_string: Token -> string
- val valued_token: Token -> bool
- type 'b TokenMap
- val mkTokenMap: ('b * Token list) list -> 'b TokenMap
- val applyTokenMap: 'b TokenMap * Token -> 'b list
- include LEXICON0
+ val empty_lexicon: lexicon
+ val extend_lexicon: lexicon -> string list -> lexicon
+ val mk_lexicon: string list -> lexicon
+ val tokenize: lexicon -> bool -> string -> token list
+
+ exception LEXICAL_ERROR
+ val >> : ('a -> 'b * 'c) * ('b -> 'd) -> 'a -> 'd * 'c
+ val || : ('a -> 'b) * ('a -> 'b) -> 'a -> 'b
+ val -- : ('a -> 'b * 'c) * ('c -> 'd * 'e) -> 'a -> ('b * 'd) * 'e
+ val ^^ : ('a -> string * 'b) * ('b -> string * 'c) -> 'a -> string * 'c
+ val $$ : ''a -> ''a list -> ''a * ''a list
+ val scan_empty: 'a list -> 'b list * 'a list
+ val scan_one: ('a -> bool) -> 'a list -> 'a * 'a list
+ val scan_any: ('a -> bool) -> 'a list -> 'a list * 'a list
+ val scan_any1: ('a -> bool) -> 'a list -> 'a list * 'a list
+ val scan_end: 'a list -> 'b list * 'a list
+ val optional: ('a -> 'b * 'a) -> 'a -> 'b option * 'a
+ val repeat: ('a -> 'b * 'a) -> 'a -> 'b list * 'a
+ val repeat1: ('a -> 'b * 'a) -> 'a -> 'b list * 'a
end;
-functor LexiconFun(Extension: EXTENSION): LEXICON =
+functor LexiconFun(): LEXICON =
struct
-open Extension;
+
+(** is_identifier etc. **)
+
+fun is_ident [] = false
+ | is_ident (c :: cs) = is_letter c andalso forall is_letdig cs;
+
+val is_identifier = is_ident o explode;
+
+fun is_xid s =
+ (case explode s of
+ "_" :: cs => is_ident cs
+ | cs => is_ident cs);
+
+fun is_tfree s =
+ (case explode s of
+ "'" :: cs => is_ident cs
+ | _ => false);
+
-datatype Token = Token of string
- | IdentSy of string
- | VarSy of string * int
- | TFreeSy of string
- | TVarSy of string * int
- | end_token;
-val no_token = "";
-
-datatype dfa = Tip | Branch of string * string * dfa * dfa * dfa;
-
-type Lexicon = dfa;
-
-fun is_id(c::cs) = is_letter(c) andalso forall is_letdig cs
- | is_id([]) = false;
-
-fun is_identifier s = is_id(explode s);
+(** string_of_vname **)
-val empty = Tip;
+fun string_of_vname (x, i) =
+ let
+ val si = string_of_int i;
+ in
+ if is_digit (last_elem (explode x)) then "?" ^ x ^ "." ^ si
+ else if i = 0 then "?" ^ x
+ else "?" ^ x ^ si
+ end;
-fun extend1(dfa, s) =
- let fun add(Branch(c, a, less, eq, gr), cs as (d::ds)) =
- if c>d then Branch(c, a, add(less, cs), eq, gr) else
- if c<d then Branch(c, a, less, eq, add(gr, cs)) else
- Branch(c, if null ds then s else a, less, add(eq, ds), gr)
- | add(Tip, c::cs) =
- Branch(c, if null cs then s else no_token, Tip, add(Tip, cs), Tip)
- | add(dfa, []) = dfa
+
- in add(dfa, explode s) end;
-
-val extend = foldl extend1;
-
-fun mk_lexicon ss = extend(empty, ss);
+(** datatype token **)
-fun next_dfa (Tip, _) = None
- | next_dfa (Branch(d, a, less, eq, gr), c) =
- if c<d then next_dfa(less, c) else
- if c>d then next_dfa(gr, c) else Some(a, eq);
-
-exception ID of string * string list;
-
-val eof_id = "End of input - identifier expected.\n";
+datatype token =
+ Token of string |
+ IdentSy of string |
+ VarSy of string |
+ TFreeSy of string |
+ TVarSy of string |
+ EndToken;
-(*A string of letters, digits, or ' _ *)
-fun xscan_ident exn =
-let fun scan [] = raise exn(eof_id, [])
- | scan(c::cs) =
- if is_letter c
- then let val (ds, tail) = take_prefix is_letdig cs
- in (implode(c::ds), tail) end
- else raise exn("Identifier expected: ", c::cs)
-in scan end;
-(*Scan the offset of a Var, if present; otherwise ~1 *)
-fun scan_offset cs = case cs of
- ("."::[]) => (~1, cs)
- | ("."::(ds as c::cs')) => if is_digit c then scan_int ds else (~1, cs)
- | _ => (~1, cs);
-
-fun split_varname s =
- let val (rpost, rpref) = take_prefix is_digit (rev(explode s))
- val (i, _) = scan_int(rev rpost)
- in (implode(rev rpref), i) end;
+(* names of valued tokens *)
-fun xscan_varname exn cs : (string*int) * string list =
-let val (a, ds) = xscan_ident exn cs;
- val (i, es) = scan_offset ds
-in if i = ~1 then (split_varname a, es) else ((a, i), es) end;
+val id = "id";
+val var = "var";
+val tfree = "tfree";
+val tvar = "tvar";
-fun scan_varname s = xscan_varname ID s
- handle ID(err, cs) => error(err^(implode cs));
-fun tokenize (dfa) (s:string) : Token list =
-let exception LEX_ERR;
- exception FAIL of string * string list;
- val lexerr = "Lexical error: ";
+(* str_of_token *)
- fun tokenize1 (_:dfa, []:string list) : Token * string list =
- raise LEX_ERR
- | tokenize1(dfa, c::cs) =
- case next_dfa(dfa, c) of
- None => raise LEX_ERR
- | Some(a, dfa') =>
- if a=no_token then tokenize1(dfa', cs)
- else (tokenize1(dfa', cs) handle LEX_ERR =>
- if is_identifier a andalso not(null cs) andalso
- is_letdig(hd cs)
- then raise LEX_ERR else (Token(a), cs));
+fun str_of_token (Token s) = s
+ | str_of_token (IdentSy s) = s
+ | str_of_token (VarSy s) = s
+ | str_of_token (TFreeSy s) = s
+ | str_of_token (TVarSy s) = s
+ | str_of_token EndToken = "";
- fun token(cs) = tokenize1(dfa, cs) handle LEX_ERR => raise FAIL(lexerr, cs);
+
+(* display_token *)
- fun id([]) = raise FAIL(eof_id, [])
- | id(cs as c::cs') =
- if is_letter(c)
- then let val (id, cs'') = xscan_ident FAIL cs
- in (IdentSy(id), cs'') end
- else
- if c = "?"
- then case cs' of
- "'"::xs => let val ((a, i), ys) = xscan_varname FAIL xs
- in (TVarSy("'"^a, i), ys) end
- | _ => let val ((a, i), ys) = xscan_varname FAIL cs'
- in (VarSy(a, i), ys) end
- else
- if c = "'"
- then let val (a, cs'') = xscan_ident FAIL cs'
- in (TFreeSy("'" ^ a), cs'') end
- else raise FAIL(lexerr, cs);
+fun display_token (Token s) = quote s
+ | display_token (IdentSy s) = "id(" ^ s ^ ")"
+ | display_token (VarSy s) = "var(" ^ s ^ ")"
+ | display_token (TFreeSy s) = "tfree(" ^ s ^ ")"
+ | display_token (TVarSy s) = "tvar(" ^ s ^ ")"
+ | display_token EndToken = "";
- fun tknize([], ts) = rev(ts)
- | tknize(cs as c::cs', ts) =
- if is_blank(c) then tknize(cs', ts) else
- let val (t, cs'') =
- if c="?" then id(cs) handle FAIL _ => token(cs)
- else (token(cs) handle FAIL _ => id(cs))
- in tknize(cs'', t::ts) end
+
+(* matching_tokens *)
-in tknize(explode s, []) handle FAIL(s, cs) => error(s^(implode cs)) end;
+fun matching_tokens (Token x, Token y) = (x = y)
+ | matching_tokens (IdentSy _, IdentSy _) = true
+ | matching_tokens (VarSy _, VarSy _) = true
+ | matching_tokens (TFreeSy _, TFreeSy _) = true
+ | matching_tokens (TVarSy _, TVarSy _) = true
+ | matching_tokens (EndToken, EndToken) = true
+ | matching_tokens _ = false;
-(*Variables have the form ?nameidx, or ?name.idx if "name" ends with a digit*)
-fun string_of_vname (a, idx) = case rev(explode a) of
- [] => "?NULL_VARIABLE_NAME"
- | c::_ => "?" ^
- (if is_digit c then a ^ "." ^ string_of_int idx
- else if idx = 0 then a
- else a ^ string_of_int idx);
-fun token_to_string (Token(s)) = s
- | token_to_string (IdentSy(s)) = s
- | token_to_string (VarSy v) = string_of_vname v
- | token_to_string (TFreeSy a) = a
- | token_to_string (TVarSy v) = string_of_vname v
- | token_to_string end_token = "\n";
+(* valued_token *)
-(* MMW *)
-fun name_of_token (Token s) = quote s
- | name_of_token (IdentSy _) = id
- | name_of_token (VarSy _) = var
- | name_of_token (TFreeSy _) = tfree
- | name_of_token (TVarSy _) = tvar;
+fun valued_token (Token _) = false
+ | valued_token (IdentSy _) = true
+ | valued_token (VarSy _) = true
+ | valued_token (TFreeSy _) = true
+ | valued_token (TVarSy _) = true
+ | valued_token EndToken = false;
-fun matching_tokens(Token(i), Token(j)) = (i=j) |
- matching_tokens(IdentSy(_), IdentSy(_)) = true |
- matching_tokens(VarSy _, VarSy _) = true |
- matching_tokens(TFreeSy _, TFreeSy _) = true |
- matching_tokens(TVarSy _, TVarSy _) = true |
- matching_tokens(end_token, end_token) = true |
- matching_tokens(_, _) = false;
-fun valued_token(end_token) = false
- | valued_token(Token _) = false
- | valued_token(IdentSy _) = true
- | valued_token(VarSy _) = true
- | valued_token(TFreeSy _) = true
- | valued_token(TVarSy _) = true;
+(* predef_term *)
-(* MMW *)
fun predef_term name =
if name = id then IdentSy name
- else if name = var then VarSy (name, 0)
+ else if name = var then VarSy name
else if name = tfree then TFreeSy name
- else if name = tvar then TVarSy (name, 0)
- else end_token;
+ else if name = tvar then TVarSy name
+ else EndToken;
-(* MMW *)
+
+(** is_terminal **)
+
fun is_terminal s = s mem [id, var, tfree, tvar];
-type 'b TokenMap = (Token * 'b list) list * 'b list;
-val first_token = 0;
+(** datatype lexicon **)
+
+datatype lexicon =
+ Empty |
+ Branch of string * string * lexicon * lexicon * lexicon;
+
+val no_token = "";
+
+
+(* empty_lexicon *)
+
+val empty_lexicon = Empty;
+
+
+(* extend_lexicon *)
+
+fun extend_lexicon lexicon strs =
+ let
+ fun ext (lex, s) =
+ let
+ fun add (Branch (d, a, lt, eq, gt)) (chs as c :: cs) =
+ if c < d then Branch (d, a, add lt chs, eq, gt)
+ else if c > d then Branch (d, a, lt, eq, add gt chs)
+ else Branch (d, if null cs then s else a, lt, add eq cs, gt)
+ | add Empty [c] =
+ Branch (c, s, Empty, Empty, Empty)
+ | add Empty (c :: cs) =
+ Branch (c, no_token, Empty, add Empty cs, Empty)
+ | add lex [] = lex;
+
+ val cs = explode s;
+ in
+ if exists is_blank cs then
+ error ("extend_lexicon: blank in literal " ^ quote s)
+ else add lex cs
+ end;
+ in
+ foldl ext (lexicon, strs)
+ end;
+
+
+(* mk_lexicon *)
+
+val mk_lexicon = extend_lexicon empty_lexicon;
+
+
+
+(** scanners **)
+
+exception LEXICAL_ERROR;
+
-fun int_of_token(Token(tk)) = first_token |
- int_of_token(IdentSy _) = first_token - 1 |
- int_of_token(VarSy _) = first_token - 2 |
- int_of_token(TFreeSy _) = first_token - 3 |
- int_of_token(TVarSy _) = first_token - 4 |
- int_of_token(end_token) = first_token - 5;
+(* scanner combinators *)
+
+fun (scan >> f) cs = apfst f (scan cs);
+
+fun (scan1 || scan2) cs = scan1 cs handle LEXICAL_ERROR => scan2 cs;
+
+fun (scan1 -- scan2) cs =
+ let
+ val (x, cs') = scan1 cs;
+ val (y, cs'') = scan2 cs';
+ in
+ ((x, y), cs'')
+ end;
+
+fun (scan1 ^^ scan2) = scan1 -- scan2 >> op ^;
+
+
+(* generic scanners *)
+
+fun $$ _ [] = raise LEXICAL_ERROR
+ | $$ a (c :: cs) =
+ if a = c then (c, cs) else raise LEXICAL_ERROR;
+
+fun scan_empty cs = ([], cs);
-fun lesstk(s, t) = int_of_token(s) < int_of_token(t) orelse
- (case (s, t) of (Token(a), Token(b)) => a<b | _ => false);
+fun scan_one _ [] = raise LEXICAL_ERROR
+ | scan_one pred (c :: cs) =
+ if pred c then (c, cs) else raise LEXICAL_ERROR;
+
+val scan_any = take_prefix;
+
+fun scan_any1 pred = scan_one pred -- scan_any pred >> op ::;
+
+fun scan_rest cs = (cs, []);
+
+fun scan_end [] = ([], [])
+ | scan_end _ = raise LEXICAL_ERROR;
+
+fun optional scan = scan >> Some || scan_empty >> K None;
+
+fun repeat scan cs = (scan -- repeat scan >> op :: || scan_empty) cs;
+
+fun repeat1 scan = scan -- repeat scan >> op ::;
+
+
+(* other scanners *)
+
+val scan_letter_letdigs = scan_one is_letter -- scan_any is_letdig >> op ::;
+
+val scan_digits1 = scan_any1 is_digit;
+
+val scan_id = scan_letter_letdigs >> implode;
-fun mkTokenMap(atll) =
- let val aill = atll;
- val dom = sort lesstk (distinct(flat(map snd aill)));
- val mt = map fst (filter (null o snd) atll);
- fun mktm(i) =
- let fun add(l, (a, il)) = if i mem il then a::l else l
- in (i, foldl add ([], aill)) end;
- in (map mktm dom, mt) end;
+val scan_id_nat =
+ scan_id ^^ ($$ "." ^^ (scan_digits1 >> implode) || scan_empty >> K "");
+
+
+(* scan_literal *)
+
+fun scan_literal lex chrs =
+ let
+ fun scan_lit _ s_cs [] = s_cs
+ | scan_lit Empty s_cs _ = s_cs
+ | scan_lit (Branch (d, a, lt, eq, gt)) s_cs (chs as c :: cs) =
+ if c < d then scan_lit lt s_cs chs
+ else if c > d then scan_lit gt s_cs chs
+ else scan_lit eq (if a = no_token then s_cs else Some (a, cs)) cs;
+ in
+ (case scan_lit lex None chrs of
+ None => raise LEXICAL_ERROR
+ | Some s_cs => s_cs)
+ end;
+
+
+
+(** tokenize **)
+
+fun tokenize lex xids str =
+ let
+ val scan_xid =
+ if xids then $$ "_" ^^ scan_id || scan_id
+ else scan_id;
+
+ val scan_lit = scan_literal lex >> pair Token;
+
+ val scan_ident =
+ $$ "?" ^^ $$ "'" ^^ scan_id_nat >> pair TVarSy ||
+ $$ "?" ^^ scan_id_nat >> pair VarSy ||
+ $$ "'" ^^ scan_id >> pair TFreeSy ||
+ scan_xid >> pair IdentSy;
+
+ fun scan_max_token cs =
+ (case (optional scan_lit cs, optional scan_ident cs) of
+ (tok1, (None, _)) => tok1
+ | ((None, _), tok2) => tok2
+ | (tok1 as (Some (_, s1), _), tok2 as (Some (_, s2), _)) =>
+ if size s1 >= size s2 then tok1 else tok2);
-fun find_al (i) =
- let fun find((j, al)::l) = if lesstk(i, j) then [] else
- if matching_tokens(i, j) then al else find l |
- find [] = [];
- in find end;
-fun applyTokenMap((l, mt), tk:Token) = mt@(find_al tk l);
+ fun scan_tokens [] rev_toks = rev (EndToken :: rev_toks)
+ | scan_tokens (chs as c :: cs) rev_toks =
+ if is_blank c then scan_tokens cs rev_toks
+ else
+ (case scan_max_token chs of
+ (None, _) => error ("Lexical error at: " ^ quote (implode chs))
+ | (Some (tk, s), chs') => scan_tokens chs' (tk s :: rev_toks));
+ in
+ scan_tokens (explode str) []
+ end;
+
+
+
+(** scan variables **)
+
+(* scan_vname *)
+
+fun scan_vname chrs =
+ let
+ fun nat_of_chs n [] = n
+ | nat_of_chs n (c :: cs) = nat_of_chs (n * 10 + (ord c - ord "0")) cs;
+
+ val nat_of = nat_of_chs 0;
+
+ fun split_vname chs =
+ let val (cs, ds) = take_suffix is_digit chs
+ in (implode cs, nat_of ds) end
+
+ val scan =
+ scan_letter_letdigs --
+ ($$ "." -- scan_digits1 >> (nat_of o #2) || scan_empty >> K ~1);
+ in
+ (case scan chrs of
+ ((cs, ~1), cs') => (split_vname cs, cs')
+ | ((cs, i), cs') => ((implode cs, i), cs'))
+ end;
+
+
+(* scan_varname *)
+
+fun scan_varname chs =
+ scan_vname chs handle LEXICAL_ERROR
+ => error ("scan_varname: bad varname " ^ quote (implode chs));
+
+
+(* scan_var *)
+
+fun scan_var str =
+ let
+ fun tvar (x, i) = Var (("'" ^ x, i), dummyT);
+ fun var x_i = Var (x_i, dummyT);
+ fun free x = Free (x, dummyT);
+
+ val scan =
+ $$ "?" -- $$ "'" -- scan_vname -- scan_end >> (tvar o #2 o #1) ||
+ $$ "?" -- scan_vname -- scan_end >> (var o #2 o #1) ||
+ scan_rest >> (free o implode);
+ in
+ #1 (scan (explode str))
+ end;
end;