src/Pure/Thy/scan.ML
author wenzelm
Wed, 24 May 2000 19:09:36 +0200
changeset 8965 d46b36785c70
parent 213 42f2b8a3581f
permissions -rw-r--r--
proper token_translation for latex mode;

(*  Title: 	Pure/Thy/scan
    ID:         $Id$
    Author: 	Sonia Mahjoub / Tobias Nipkow
    Copyright   1992  TU Muenchen

    modified    December 1993 by Max Breitling (Type-variables added)

The scanner. Adapted from Larry Paulson's ML for the Working Programmer.
*)

signature LEXICAL =
sig


datatype token = Id  of string 
               | Key of string
               | Nat of string
               | Stg of string
               | Txt of string
               | TypVar of string

val scan : string list -> (token * int) list
end;

signature KEYWORD = 
sig
val alphas  : string list
val symbols : string list
end;


functor LexicalFUN (Keyword: KEYWORD): LEXICAL = 

struct



datatype token = Id  of string 
               | Key of string
               | Nat of string
               | Stg of string
               | Txt of string
               | TypVar of string;


fun lexerr(n,s) =
    error("Lexical error on line " ^ (string_of_int n) ^ ": " ^ s);

val specials = explode"!{}@#$%^&*()+-=[]:\";,./?`_~<>|\\";

fun is_symbol c = "_" = c orelse "'" = c;

fun alphanum (id, c::cs) =
       if is_letter c orelse is_digit c orelse is_symbol c
       then alphanum (id ^ c , cs)
       else (id , c :: cs)
  | alphanum (id ,[]) = (id ,[]);

fun numeric (nat, c::cs) =
      if is_digit c 
      then numeric (nat^c, cs)
      else (nat, c::cs)
  | numeric (nat, []) = (nat,[]);
 
fun tokenof (a, n) =
      if a mem Keyword.alphas
      then (Key a, n) 
      else (Id a, n);

fun symbolic (sy, c::cs) =
       if (sy mem Keyword.symbols) andalso 
          not((sy^c) mem Keyword.symbols) 
          orelse not (c mem specials)
       then (sy, c::cs)
       else symbolic(sy^c, cs)
  | symbolic (sy, []) = (sy, []);

fun stringerr(n) = lexerr(n, "No matching quote found on this line");

fun is_control_chr ([],_,n) = stringerr(n)
  | is_control_chr (c::cs,s,n) = 
          let val m = ord c
          in if (m >= 64 andalso m <= 95)
             then (cs, s^c, n)
             else stringerr(n)
          end;

fun is_3_dgt (c1::c2::cs, c,n) = 
          let val s = c^c1^c2
          in  if (s >= "000" andalso s <= "255")
              then (cs, s)
              else stringerr(n)
          end 
  | is_3_dgt (_,_,n) = stringerr(n); 

fun is_imprt_seq ([],_,n) = stringerr(n)
  | is_imprt_seq ((c::cs),s,n) = 
          if c = "\\" then (cs,s^"\\",n)
          else if c = "\n"
               then is_imprt_seq (cs,s^"\n",n+1)
          else if (c = "\t") orelse (c = " ")
               then is_imprt_seq (cs,s^c,n)
          else stringerr(n);

fun is_escape_seq ([],_,n) = stringerr(n)
  | is_escape_seq ((c::cs),s,n) =  
          if c = "\\" then (cs,s^"\\",n)
          else if c = "\n" 
               then is_imprt_seq (cs,s^"\n",n+1) 
          else if (c = "\t") orelse (c = " ")
               then is_imprt_seq (cs,s^c,n)
          else if c = "^" 
               then is_control_chr (cs,s^"^",n)
          else if ("0" <= c andalso c <= "2") 
               then let val (cs',s') = 
                            is_3_dgt(cs,c,n)
                    in (cs',s^s',n)
                    end
          else stringerr(n);


fun string (_,[],_,n) = stringerr(n)
  | string (toks, c::cs, s, n) =
       if c  = "\"" then ((Stg s, n)::toks , cs, n)
       else if c = "\\" 
            then  let val (cs',s',n') = 
                          is_escape_seq (cs, s^"\\",n) 
                  in string (toks,cs',s',n') end 
       else string (toks,cs,s^c,n);


fun comment ((c1::c2::cs), n) =
      if c1 = "*" andalso c2 = ")" then (cs,n) else
      if c1 = "\n" then comment((c2::cs), n+1)
      else comment((c2::cs), n)
  | comment (_, n) = lexerr(n, "Missing end of comment");

fun scanning (toks , [], n) = rev toks
  | scanning (toks , c :: cs, n) = 
       if is_letter c 
       then let val (id , cs2) = alphanum (c , cs)
            in if id = "ML"
               then let val text = implode cs2
                    in  scanning ((Txt text,n) :: toks , [], n)
                    end
               else scanning (tokenof(id,n) :: toks , cs2, n) 
            end
       else if is_digit c 
            then let val (nat , cs2) = numeric(c , cs)
                 in scanning ((Nat nat,n) :: toks , cs2, n) end

       else if c = "'" andalso is_letter(hd cs)
            then let val (var, cs2) = alphanum (hd cs, tl cs)
                 in scanning((TypVar (c^var),n) :: toks, cs2, n) end

       else if c mem specials
            then if c = "\""
                 then let val (toks', cs', n') = string (toks, cs, "", n)
                      in scanning (toks', cs', n') end
                 else let val (sy , cs2) = symbolic (c , cs)
                      in if sy = "(*"
                         then let val (cs3,n2) = comment(cs2,n)
                              in scanning (toks , cs3, n2) end
                         else scanning ((Key sy,n) :: toks, cs2, n)
                      end
       else if c = "\n" then scanning (toks, cs, n+1)
       else if c = " " orelse c = "\t" then scanning (toks , cs, n)
       else lexerr(n,"Illegal character " ^ c);

fun scan a = scanning ([] , a, 1);

end;