src/Pure/Syntax/lexicon.ML
changeset 0 a5a9c433f639
child 18 c9ec452ff08f
equal deleted inserted replaced
-1:000000000000 0:a5a9c433f639
       
     1 (*  Title:      Lexicon
       
     2     ID:         $Id$
       
     3     Author:     Tobias Nipkow, TU Muenchen
       
     4     Copyright   1993  TU Muenchen
       
     5 
       
     6 The Isabelle Lexer
       
     7 
       
     8 Changes:
       
     9 TODO:
       
    10   Lexicon -> lexicon, Token -> token
       
    11   end_token -> EndToken ?
       
    12 *)
       
    13 
       
    14 signature LEXICON0 =
       
    15 sig
       
    16   val is_identifier: string -> bool
       
    17   val scan_varname: string list -> indexname * string list
       
    18   val string_of_vname: indexname -> string
       
    19 end;
       
    20 
       
    21 signature LEXICON =
       
    22 sig
       
    23   type Lexicon
       
    24   datatype Token = Token of string
       
    25                  | IdentSy of string
       
    26                  | VarSy of string * int
       
    27                  | TFreeSy of string
       
    28                  | TVarSy of string * int
       
    29                  | end_token;
       
    30   val empty: Lexicon
       
    31   val extend: Lexicon * string list -> Lexicon
       
    32   val matching_tokens: Token * Token -> bool
       
    33   val mk_lexicon: string list -> Lexicon
       
    34   val name_of_token: Token -> string
       
    35   val predef_term: string -> Token
       
    36   val is_terminal: string -> bool
       
    37   val tokenize: Lexicon -> string -> Token list
       
    38   val token_to_string: Token -> string
       
    39   val valued_token: Token -> bool
       
    40   type 'b TokenMap
       
    41   val mkTokenMap: ('b * Token list) list -> 'b TokenMap
       
    42   val applyTokenMap: 'b TokenMap * Token -> 'b list
       
    43   include LEXICON0
       
    44 end;
       
    45 
       
    46 functor LexiconFun(Extension: EXTENSION): LEXICON =
       
    47 struct
       
    48 
       
    49 open Extension;
       
    50 
       
    51 
       
    52 datatype Token = Token of string
       
    53                | IdentSy of string
       
    54                | VarSy of string * int
       
    55                | TFreeSy of string
       
    56                | TVarSy of string * int
       
    57                | end_token;
       
    58 val no_token = "";
       
    59 
       
    60 datatype dfa = Tip | Branch of string * string * dfa * dfa * dfa;
       
    61 
       
    62 type Lexicon = dfa;
       
    63 
       
    64 fun is_id(c::cs) = is_letter(c) andalso forall is_letdig cs
       
    65   | is_id([])    = false;
       
    66 
       
    67 fun is_identifier s = is_id(explode s);
       
    68 
       
    69 val empty = Tip;
       
    70 
       
    71 fun extend1(dfa, s) =
       
    72     let fun add(Branch(c, a, less, eq, gr), cs as (d::ds)) =
       
    73               if c>d then Branch(c, a, add(less, cs), eq, gr) else
       
    74               if c<d then Branch(c, a, less, eq, add(gr, cs)) else
       
    75               Branch(c, if null ds then s else a, less, add(eq, ds), gr)
       
    76           | add(Tip, c::cs) =
       
    77               Branch(c, if null cs then s else no_token, Tip, add(Tip, cs), Tip)
       
    78           | add(dfa, []) = dfa
       
    79 
       
    80     in add(dfa, explode s) end;
       
    81 
       
    82 val extend = foldl extend1;
       
    83 
       
    84 fun mk_lexicon ss = extend(empty, ss);
       
    85 
       
    86 fun next_dfa (Tip, _) = None
       
    87   | next_dfa (Branch(d, a, less, eq, gr), c) =
       
    88       if c<d then next_dfa(less, c) else
       
    89       if c>d then next_dfa(gr, c)   else Some(a, eq);
       
    90 
       
    91 exception ID of string * string list;
       
    92 
       
    93 val eof_id = "End of input - identifier expected.\n";
       
    94 
       
    95 (*A string of letters, digits, or ' _ *)
       
    96 fun xscan_ident exn =
       
    97 let fun scan []  =  raise exn(eof_id, [])
       
    98       | scan(c::cs) =
       
    99         if  is_letter c
       
   100         then let val (ds, tail) = take_prefix is_letdig cs
       
   101              in  (implode(c::ds), tail)  end
       
   102         else raise exn("Identifier expected: ", c::cs)
       
   103 in scan end;
       
   104 
       
   105 (*Scan the offset of a Var, if present; otherwise ~1 *)
       
   106 fun scan_offset cs = case cs of
       
   107     ("."::[]) => (~1, cs)
       
   108   | ("."::(ds as c::cs')) => if is_digit c then scan_int ds else (~1, cs)
       
   109   | _ => (~1, cs);
       
   110 
       
   111 fun split_varname s =
       
   112     let val (rpost, rpref) = take_prefix is_digit (rev(explode s))
       
   113         val (i, _) = scan_int(rev rpost)
       
   114     in (implode(rev rpref), i) end;
       
   115 
       
   116 fun xscan_varname exn cs : (string*int) * string list =
       
   117 let val (a, ds) = xscan_ident exn cs;
       
   118     val (i, es) = scan_offset ds
       
   119 in if i = ~1 then (split_varname a, es) else ((a, i), es) end;
       
   120 
       
   121 fun scan_varname s = xscan_varname ID s
       
   122         handle ID(err, cs) => error(err^(implode cs));
       
   123 
       
   124 fun tokenize (dfa) (s:string) : Token list =
       
   125 let exception LEX_ERR;
       
   126     exception FAIL of string * string list;
       
   127     val lexerr = "Lexical error: ";
       
   128 
       
   129     fun tokenize1 (_:dfa, []:string list) : Token * string list =
       
   130           raise LEX_ERR
       
   131       | tokenize1(dfa, c::cs) =
       
   132           case next_dfa(dfa, c) of
       
   133             None => raise LEX_ERR
       
   134           | Some(a, dfa') =>
       
   135              if a=no_token then tokenize1(dfa', cs)
       
   136              else (tokenize1(dfa', cs) handle LEX_ERR =>
       
   137                    if is_identifier a andalso not(null cs) andalso
       
   138                       is_letdig(hd cs)
       
   139                    then raise LEX_ERR else (Token(a), cs));
       
   140 
       
   141     fun token(cs) = tokenize1(dfa, cs) handle LEX_ERR => raise FAIL(lexerr, cs);
       
   142 
       
   143     fun id([]) = raise FAIL(eof_id, [])
       
   144       | id(cs as c::cs') =
       
   145         if is_letter(c)
       
   146         then let val (id, cs'') = xscan_ident FAIL cs
       
   147              in (IdentSy(id), cs'') end
       
   148         else
       
   149         if c = "?"
       
   150         then case cs' of
       
   151                 "'"::xs => let val ((a, i), ys) = xscan_varname FAIL xs
       
   152                            in (TVarSy("'"^a, i), ys) end
       
   153              | _ => let val ((a, i), ys) = xscan_varname FAIL cs'
       
   154                     in (VarSy(a, i), ys) end
       
   155         else
       
   156         if c = "'"
       
   157         then let val (a, cs'') = xscan_ident FAIL cs'
       
   158              in (TFreeSy("'" ^ a), cs'') end
       
   159         else raise FAIL(lexerr, cs);
       
   160 
       
   161     fun tknize([], ts) = rev(ts)
       
   162       | tknize(cs as c::cs', ts) =
       
   163         if is_blank(c) then tknize(cs', ts) else
       
   164         let val (t, cs'') =
       
   165                 if c="?" then id(cs) handle FAIL _ => token(cs)
       
   166                 else (token(cs) handle FAIL _ => id(cs))
       
   167         in tknize(cs'', t::ts) end
       
   168 
       
   169 in tknize(explode s, []) handle FAIL(s, cs) => error(s^(implode cs)) end;
       
   170 
       
   171 (*Variables have the form ?nameidx, or ?name.idx if "name" ends with a digit*)
       
   172 fun string_of_vname (a, idx) = case rev(explode a) of
       
   173         []   => "?NULL_VARIABLE_NAME"
       
   174       | c::_ => "?" ^
       
   175                 (if is_digit c then a ^ "." ^ string_of_int idx
       
   176                  else if idx = 0 then a
       
   177                  else a ^ string_of_int idx);
       
   178 
       
   179 fun token_to_string (Token(s)) = s
       
   180   | token_to_string (IdentSy(s)) = s
       
   181   | token_to_string (VarSy v) = string_of_vname v
       
   182   | token_to_string (TFreeSy a) = a
       
   183   | token_to_string (TVarSy v) = string_of_vname v
       
   184   | token_to_string end_token = "\n";
       
   185 
       
   186 (* MMW *)
       
   187 fun name_of_token (Token s) = quote s
       
   188   | name_of_token (IdentSy _) = id
       
   189   | name_of_token (VarSy _) = var
       
   190   | name_of_token (TFreeSy _) = tfree
       
   191   | name_of_token (TVarSy _) = tvar;
       
   192 
       
   193 fun matching_tokens(Token(i), Token(j)) = (i=j) |
       
   194     matching_tokens(IdentSy(_), IdentSy(_)) = true |
       
   195     matching_tokens(VarSy _, VarSy _) = true |
       
   196     matching_tokens(TFreeSy _, TFreeSy _) = true |
       
   197     matching_tokens(TVarSy _, TVarSy _) = true |
       
   198     matching_tokens(end_token, end_token) = true |
       
   199     matching_tokens(_, _) = false;
       
   200 
       
   201 fun valued_token(end_token) = false
       
   202   | valued_token(Token _) = false
       
   203   | valued_token(IdentSy _) = true
       
   204   | valued_token(VarSy _) = true
       
   205   | valued_token(TFreeSy _) = true
       
   206   | valued_token(TVarSy _) = true;
       
   207 
       
   208 (* MMW *)
       
   209 fun predef_term name =
       
   210   if name = id then IdentSy name
       
   211   else if name = var then VarSy (name, 0)
       
   212   else if name = tfree then TFreeSy name
       
   213   else if name = tvar then TVarSy (name, 0)
       
   214   else end_token;
       
   215 
       
   216 (* MMW *)
       
   217 fun is_terminal s = s mem [id, var, tfree, tvar];
       
   218 
       
   219 
       
   220 
       
   221 type 'b TokenMap = (Token * 'b list) list * 'b list;
       
   222 val first_token = 0;
       
   223 
       
   224 fun int_of_token(Token(tk)) = first_token |
       
   225     int_of_token(IdentSy _) = first_token - 1 |
       
   226     int_of_token(VarSy _) = first_token - 2 |
       
   227     int_of_token(TFreeSy _) = first_token - 3 |
       
   228     int_of_token(TVarSy _) = first_token - 4 |
       
   229     int_of_token(end_token) = first_token - 5;
       
   230 
       
   231 fun lesstk(s, t) = int_of_token(s) < int_of_token(t) orelse
       
   232                   (case (s, t) of (Token(a), Token(b)) => a<b | _ => false);
       
   233 
       
   234 fun mkTokenMap(atll) =
       
   235     let val aill = atll;
       
   236         val dom = sort lesstk (distinct(flat(map snd aill)));
       
   237         val mt = map fst (filter (null o snd) atll);
       
   238         fun mktm(i) =
       
   239             let fun add(l, (a, il)) = if i mem il then a::l else l
       
   240             in (i, foldl add ([], aill)) end;
       
   241     in (map mktm dom, mt) end;
       
   242 
       
   243 fun find_al (i) =
       
   244     let fun find((j, al)::l) = if lesstk(i, j) then [] else
       
   245                               if matching_tokens(i, j) then al else find l |
       
   246             find [] = [];
       
   247     in find end;
       
   248 fun applyTokenMap((l, mt), tk:Token) = mt@(find_al tk l);
       
   249 
       
   250 
       
   251 end;
       
   252