src/Pure/Thy/scan.ML
author wenzelm
Wed Jan 29 15:45:40 1997 +0100 (1997-01-29)
changeset 2564 9d66b758bce5
parent 213 42f2b8a3581f
permissions -rw-r--r--
removed warning for unprintable chars in strings (functionality will
be put into administrative script);
clasohm@0
     1
(*  Title: 	Pure/Thy/scan
clasohm@0
     2
    ID:         $Id$
clasohm@0
     3
    Author: 	Sonia Mahjoub / Tobias Nipkow
clasohm@0
     4
    Copyright   1992  TU Muenchen
clasohm@0
     5
nipkow@213
     6
    modified    December 1993 by Max Breitling (Type-variables added)
nipkow@213
     7
clasohm@0
     8
The scanner. Adapted from Larry Paulson's ML for the Working Programmer.
clasohm@0
     9
*)
clasohm@0
    10
clasohm@0
    11
signature LEXICAL =
clasohm@0
    12
sig
clasohm@0
    13
clasohm@0
    14
clasohm@0
    15
datatype token = Id  of string 
clasohm@0
    16
               | Key of string
clasohm@0
    17
               | Nat of string
clasohm@0
    18
               | Stg of string
clasohm@0
    19
               | Txt of string
nipkow@213
    20
               | TypVar of string
clasohm@0
    21
clasohm@0
    22
val scan : string list -> (token * int) list
clasohm@0
    23
end;
clasohm@0
    24
clasohm@0
    25
signature KEYWORD = 
clasohm@0
    26
sig
clasohm@0
    27
val alphas  : string list
clasohm@0
    28
val symbols : string list
clasohm@0
    29
end;
clasohm@0
    30
clasohm@0
    31
clasohm@0
    32
functor LexicalFUN (Keyword: KEYWORD): LEXICAL = 
clasohm@0
    33
clasohm@0
    34
struct
clasohm@0
    35
clasohm@0
    36
clasohm@0
    37
clasohm@0
    38
datatype token = Id  of string 
clasohm@0
    39
               | Key of string
clasohm@0
    40
               | Nat of string
clasohm@0
    41
               | Stg of string
nipkow@213
    42
               | Txt of string
nipkow@213
    43
               | TypVar of string;
clasohm@0
    44
clasohm@0
    45
clasohm@0
    46
fun lexerr(n,s) =
clasohm@0
    47
    error("Lexical error on line " ^ (string_of_int n) ^ ": " ^ s);
clasohm@0
    48
nipkow@213
    49
val specials = explode"!{}@#$%^&*()+-=[]:\";,./?`_~<>|\\";
clasohm@0
    50
clasohm@0
    51
fun is_symbol c = "_" = c orelse "'" = c;
clasohm@0
    52
clasohm@0
    53
fun alphanum (id, c::cs) =
clasohm@0
    54
       if is_letter c orelse is_digit c orelse is_symbol c
clasohm@0
    55
       then alphanum (id ^ c , cs)
clasohm@0
    56
       else (id , c :: cs)
clasohm@0
    57
  | alphanum (id ,[]) = (id ,[]);
clasohm@0
    58
clasohm@0
    59
fun numeric (nat, c::cs) =
clasohm@0
    60
      if is_digit c 
clasohm@0
    61
      then numeric (nat^c, cs)
clasohm@0
    62
      else (nat, c::cs)
clasohm@0
    63
  | numeric (nat, []) = (nat,[]);
clasohm@0
    64
 
clasohm@0
    65
fun tokenof (a, n) =
clasohm@0
    66
      if a mem Keyword.alphas
clasohm@0
    67
      then (Key a, n) 
clasohm@0
    68
      else (Id a, n);
clasohm@0
    69
clasohm@0
    70
fun symbolic (sy, c::cs) =
clasohm@0
    71
       if (sy mem Keyword.symbols) andalso 
clasohm@0
    72
          not((sy^c) mem Keyword.symbols) 
clasohm@0
    73
          orelse not (c mem specials)
clasohm@0
    74
       then (sy, c::cs)
clasohm@0
    75
       else symbolic(sy^c, cs)
clasohm@0
    76
  | symbolic (sy, []) = (sy, []);
clasohm@0
    77
clasohm@0
    78
fun stringerr(n) = lexerr(n, "No matching quote found on this line");
clasohm@0
    79
clasohm@0
    80
fun is_control_chr ([],_,n) = stringerr(n)
clasohm@0
    81
  | is_control_chr (c::cs,s,n) = 
clasohm@0
    82
          let val m = ord c
clasohm@0
    83
          in if (m >= 64 andalso m <= 95)
clasohm@0
    84
             then (cs, s^c, n)
clasohm@0
    85
             else stringerr(n)
clasohm@0
    86
          end;
clasohm@0
    87
clasohm@0
    88
fun is_3_dgt (c1::c2::cs, c,n) = 
clasohm@0
    89
          let val s = c^c1^c2
clasohm@0
    90
          in  if (s >= "000" andalso s <= "255")
clasohm@0
    91
              then (cs, s)
clasohm@0
    92
              else stringerr(n)
clasohm@0
    93
          end 
clasohm@0
    94
  | is_3_dgt (_,_,n) = stringerr(n); 
clasohm@0
    95
clasohm@0
    96
fun is_imprt_seq ([],_,n) = stringerr(n)
clasohm@0
    97
  | is_imprt_seq ((c::cs),s,n) = 
clasohm@0
    98
          if c = "\\" then (cs,s^"\\",n)
clasohm@0
    99
          else if c = "\n"
clasohm@0
   100
               then is_imprt_seq (cs,s^"\n",n+1)
clasohm@0
   101
          else if (c = "\t") orelse (c = " ")
clasohm@0
   102
               then is_imprt_seq (cs,s^c,n)
clasohm@0
   103
          else stringerr(n);
clasohm@0
   104
clasohm@0
   105
fun is_escape_seq ([],_,n) = stringerr(n)
clasohm@0
   106
  | is_escape_seq ((c::cs),s,n) =  
clasohm@0
   107
          if c = "\\" then (cs,s^"\\",n)
clasohm@0
   108
          else if c = "\n" 
clasohm@0
   109
               then is_imprt_seq (cs,s^"\n",n+1) 
clasohm@0
   110
          else if (c = "\t") orelse (c = " ")
clasohm@0
   111
               then is_imprt_seq (cs,s^c,n)
clasohm@0
   112
          else if c = "^" 
clasohm@0
   113
               then is_control_chr (cs,s^"^",n)
clasohm@0
   114
          else if ("0" <= c andalso c <= "2") 
clasohm@0
   115
               then let val (cs',s') = 
clasohm@0
   116
                            is_3_dgt(cs,c,n)
clasohm@0
   117
                    in (cs',s^s',n)
clasohm@0
   118
                    end
clasohm@0
   119
          else stringerr(n);
clasohm@0
   120
clasohm@0
   121
clasohm@0
   122
fun string (_,[],_,n) = stringerr(n)
clasohm@0
   123
  | string (toks, c::cs, s, n) =
clasohm@0
   124
       if c  = "\"" then ((Stg s, n)::toks , cs, n)
clasohm@0
   125
       else if c = "\\" 
clasohm@0
   126
            then  let val (cs',s',n') = 
clasohm@0
   127
                          is_escape_seq (cs, s^"\\",n) 
clasohm@0
   128
                  in string (toks,cs',s',n') end 
clasohm@0
   129
       else string (toks,cs,s^c,n);
clasohm@0
   130
clasohm@0
   131
clasohm@0
   132
fun comment ((c1::c2::cs), n) =
clasohm@0
   133
      if c1 = "*" andalso c2 = ")" then (cs,n) else
clasohm@0
   134
      if c1 = "\n" then comment((c2::cs), n+1)
clasohm@0
   135
      else comment((c2::cs), n)
clasohm@0
   136
  | comment (_, n) = lexerr(n, "Missing end of comment");
clasohm@0
   137
clasohm@0
   138
fun scanning (toks , [], n) = rev toks
clasohm@0
   139
  | scanning (toks , c :: cs, n) = 
clasohm@0
   140
       if is_letter c 
clasohm@0
   141
       then let val (id , cs2) = alphanum (c , cs)
clasohm@0
   142
            in if id = "ML"
clasohm@0
   143
               then let val text = implode cs2
clasohm@0
   144
                    in  scanning ((Txt text,n) :: toks , [], n)
clasohm@0
   145
                    end
clasohm@0
   146
               else scanning (tokenof(id,n) :: toks , cs2, n) 
clasohm@0
   147
            end
clasohm@0
   148
       else if is_digit c 
clasohm@0
   149
            then let val (nat , cs2) = numeric(c , cs)
clasohm@0
   150
                 in scanning ((Nat nat,n) :: toks , cs2, n) end
nipkow@213
   151
nipkow@213
   152
       else if c = "'" andalso is_letter(hd cs)
nipkow@213
   153
            then let val (var, cs2) = alphanum (hd cs, tl cs)
nipkow@213
   154
                 in scanning((TypVar (c^var),n) :: toks, cs2, n) end
nipkow@213
   155
clasohm@0
   156
       else if c mem specials
clasohm@0
   157
            then if c = "\""
clasohm@0
   158
                 then let val (toks', cs', n') = string (toks, cs, "", n)
clasohm@0
   159
                      in scanning (toks', cs', n') end
clasohm@0
   160
                 else let val (sy , cs2) = symbolic (c , cs)
clasohm@0
   161
                      in if sy = "(*"
clasohm@0
   162
                         then let val (cs3,n2) = comment(cs2,n)
clasohm@0
   163
                              in scanning (toks , cs3, n2) end
clasohm@0
   164
                         else scanning ((Key sy,n) :: toks, cs2, n)
clasohm@0
   165
                      end
clasohm@0
   166
       else if c = "\n" then scanning (toks, cs, n+1)
clasohm@0
   167
       else if c = " " orelse c = "\t" then scanning (toks , cs, n)
clasohm@0
   168
       else lexerr(n,"Illegal character " ^ c);
clasohm@0
   169
clasohm@0
   170
fun scan a = scanning ([] , a, 1);
clasohm@0
   171
clasohm@0
   172
end;