| 0 |      1 | (*  Title: 	Pure/Thy/scan
 | 
|  |      2 |     ID:         $Id$
 | 
|  |      3 |     Author: 	Sonia Mahjoub / Tobias Nipkow
 | 
|  |      4 |     Copyright   1992  TU Muenchen
 | 
|  |      5 | 
 | 
| 213 |      6 |     modified    December 1993 by Max Breitling (Type-variables added)
 | 
|  |      7 | 
 | 
| 0 |      8 | The scanner. Adapted from Larry Paulson's ML for the Working Programmer.
 | 
|  |      9 | *)
 | 
|  |     10 | 
 | 
|  |     11 | signature LEXICAL =
 | 
|  |     12 | sig
 | 
|  |     13 | 
 | 
|  |     14 | 
 | 
|  |     15 | datatype token = Id  of string 
 | 
|  |     16 |                | Key of string
 | 
|  |     17 |                | Nat of string
 | 
|  |     18 |                | Stg of string
 | 
|  |     19 |                | Txt of string
 | 
| 213 |     20 |                | TypVar of string
 | 
| 0 |     21 | 
 | 
|  |     22 | val scan : string list -> (token * int) list
 | 
|  |     23 | end;
 | 
|  |     24 | 
 | 
|  |     25 | signature KEYWORD = 
 | 
|  |     26 | sig
 | 
|  |     27 | val alphas  : string list
 | 
|  |     28 | val symbols : string list
 | 
|  |     29 | end;
 | 
|  |     30 | 
 | 
|  |     31 | 
 | 
|  |     32 | functor LexicalFUN (Keyword: KEYWORD): LEXICAL = 
 | 
|  |     33 | 
 | 
|  |     34 | struct
 | 
|  |     35 | 
 | 
|  |     36 | 
 | 
|  |     37 | 
 | 
|  |     38 | datatype token = Id  of string 
 | 
|  |     39 |                | Key of string
 | 
|  |     40 |                | Nat of string
 | 
|  |     41 |                | Stg of string
 | 
| 213 |     42 |                | Txt of string
 | 
|  |     43 |                | TypVar of string;
 | 
| 0 |     44 | 
 | 
|  |     45 | 
 | 
|  |     46 | fun lexerr(n,s) =
 | 
|  |     47 |     error("Lexical error on line " ^ (string_of_int n) ^ ": " ^ s);
 | 
|  |     48 | 
 | 
| 213 |     49 | val specials = explode"!{}@#$%^&*()+-=[]:\";,./?`_~<>|\\";
 | 
| 0 |     50 | 
 | 
|  |     51 | fun is_symbol c = "_" = c orelse "'" = c;
 | 
|  |     52 | 
 | 
|  |     53 | fun alphanum (id, c::cs) =
 | 
|  |     54 |        if is_letter c orelse is_digit c orelse is_symbol c
 | 
|  |     55 |        then alphanum (id ^ c , cs)
 | 
|  |     56 |        else (id , c :: cs)
 | 
|  |     57 |   | alphanum (id ,[]) = (id ,[]);
 | 
|  |     58 | 
 | 
|  |     59 | fun numeric (nat, c::cs) =
 | 
|  |     60 |       if is_digit c 
 | 
|  |     61 |       then numeric (nat^c, cs)
 | 
|  |     62 |       else (nat, c::cs)
 | 
|  |     63 |   | numeric (nat, []) = (nat,[]);
 | 
|  |     64 |  
 | 
|  |     65 | fun tokenof (a, n) =
 | 
|  |     66 |       if a mem Keyword.alphas
 | 
|  |     67 |       then (Key a, n) 
 | 
|  |     68 |       else (Id a, n);
 | 
|  |     69 | 
 | 
|  |     70 | fun symbolic (sy, c::cs) =
 | 
|  |     71 |        if (sy mem Keyword.symbols) andalso 
 | 
|  |     72 |           not((sy^c) mem Keyword.symbols) 
 | 
|  |     73 |           orelse not (c mem specials)
 | 
|  |     74 |        then (sy, c::cs)
 | 
|  |     75 |        else symbolic(sy^c, cs)
 | 
|  |     76 |   | symbolic (sy, []) = (sy, []);
 | 
|  |     77 | 
 | 
|  |     78 | fun stringerr(n) = lexerr(n, "No matching quote found on this line");
 | 
|  |     79 | 
 | 
|  |     80 | fun is_control_chr ([],_,n) = stringerr(n)
 | 
|  |     81 |   | is_control_chr (c::cs,s,n) = 
 | 
|  |     82 |           let val m = ord c
 | 
|  |     83 |           in if (m >= 64 andalso m <= 95)
 | 
|  |     84 |              then (cs, s^c, n)
 | 
|  |     85 |              else stringerr(n)
 | 
|  |     86 |           end;
 | 
|  |     87 | 
 | 
|  |     88 | fun is_3_dgt (c1::c2::cs, c,n) = 
 | 
|  |     89 |           let val s = c^c1^c2
 | 
|  |     90 |           in  if (s >= "000" andalso s <= "255")
 | 
|  |     91 |               then (cs, s)
 | 
|  |     92 |               else stringerr(n)
 | 
|  |     93 |           end 
 | 
|  |     94 |   | is_3_dgt (_,_,n) = stringerr(n); 
 | 
|  |     95 | 
 | 
|  |     96 | fun is_imprt_seq ([],_,n) = stringerr(n)
 | 
|  |     97 |   | is_imprt_seq ((c::cs),s,n) = 
 | 
|  |     98 |           if c = "\\" then (cs,s^"\\",n)
 | 
|  |     99 |           else if c = "\n"
 | 
|  |    100 |                then is_imprt_seq (cs,s^"\n",n+1)
 | 
|  |    101 |           else if (c = "\t") orelse (c = " ")
 | 
|  |    102 |                then is_imprt_seq (cs,s^c,n)
 | 
|  |    103 |           else stringerr(n);
 | 
|  |    104 | 
 | 
|  |    105 | fun is_escape_seq ([],_,n) = stringerr(n)
 | 
|  |    106 |   | is_escape_seq ((c::cs),s,n) =  
 | 
|  |    107 |           if c = "\\" then (cs,s^"\\",n)
 | 
|  |    108 |           else if c = "\n" 
 | 
|  |    109 |                then is_imprt_seq (cs,s^"\n",n+1) 
 | 
|  |    110 |           else if (c = "\t") orelse (c = " ")
 | 
|  |    111 |                then is_imprt_seq (cs,s^c,n)
 | 
|  |    112 |           else if c = "^" 
 | 
|  |    113 |                then is_control_chr (cs,s^"^",n)
 | 
|  |    114 |           else if ("0" <= c andalso c <= "2") 
 | 
|  |    115 |                then let val (cs',s') = 
 | 
|  |    116 |                             is_3_dgt(cs,c,n)
 | 
|  |    117 |                     in (cs',s^s',n)
 | 
|  |    118 |                     end
 | 
|  |    119 |           else stringerr(n);
 | 
|  |    120 | 
 | 
|  |    121 | 
 | 
|  |    122 | fun string (_,[],_,n) = stringerr(n)
 | 
|  |    123 |   | string (toks, c::cs, s, n) =
 | 
|  |    124 |        if c  = "\"" then ((Stg s, n)::toks , cs, n)
 | 
|  |    125 |        else if c = "\\" 
 | 
|  |    126 |             then  let val (cs',s',n') = 
 | 
|  |    127 |                           is_escape_seq (cs, s^"\\",n) 
 | 
|  |    128 |                   in string (toks,cs',s',n') end 
 | 
|  |    129 |        else string (toks,cs,s^c,n);
 | 
|  |    130 | 
 | 
|  |    131 | 
 | 
|  |    132 | fun comment ((c1::c2::cs), n) =
 | 
|  |    133 |       if c1 = "*" andalso c2 = ")" then (cs,n) else
 | 
|  |    134 |       if c1 = "\n" then comment((c2::cs), n+1)
 | 
|  |    135 |       else comment((c2::cs), n)
 | 
|  |    136 |   | comment (_, n) = lexerr(n, "Missing end of comment");
 | 
|  |    137 | 
 | 
|  |    138 | fun scanning (toks , [], n) = rev toks
 | 
|  |    139 |   | scanning (toks , c :: cs, n) = 
 | 
|  |    140 |        if is_letter c 
 | 
|  |    141 |        then let val (id , cs2) = alphanum (c , cs)
 | 
|  |    142 |             in if id = "ML"
 | 
|  |    143 |                then let val text = implode cs2
 | 
|  |    144 |                     in  scanning ((Txt text,n) :: toks , [], n)
 | 
|  |    145 |                     end
 | 
|  |    146 |                else scanning (tokenof(id,n) :: toks , cs2, n) 
 | 
|  |    147 |             end
 | 
|  |    148 |        else if is_digit c 
 | 
|  |    149 |             then let val (nat , cs2) = numeric(c , cs)
 | 
|  |    150 |                  in scanning ((Nat nat,n) :: toks , cs2, n) end
 | 
| 213 |    151 | 
 | 
|  |    152 |        else if c = "'" andalso is_letter(hd cs)
 | 
|  |    153 |             then let val (var, cs2) = alphanum (hd cs, tl cs)
 | 
|  |    154 |                  in scanning((TypVar (c^var),n) :: toks, cs2, n) end
 | 
|  |    155 | 
 | 
| 0 |    156 |        else if c mem specials
 | 
|  |    157 |             then if c = "\""
 | 
|  |    158 |                  then let val (toks', cs', n') = string (toks, cs, "", n)
 | 
|  |    159 |                       in scanning (toks', cs', n') end
 | 
|  |    160 |                  else let val (sy , cs2) = symbolic (c , cs)
 | 
|  |    161 |                       in if sy = "(*"
 | 
|  |    162 |                          then let val (cs3,n2) = comment(cs2,n)
 | 
|  |    163 |                               in scanning (toks , cs3, n2) end
 | 
|  |    164 |                          else scanning ((Key sy,n) :: toks, cs2, n)
 | 
|  |    165 |                       end
 | 
|  |    166 |        else if c = "\n" then scanning (toks, cs, n+1)
 | 
|  |    167 |        else if c = " " orelse c = "\t" then scanning (toks , cs, n)
 | 
|  |    168 |        else lexerr(n,"Illegal character " ^ c);
 | 
|  |    169 | 
 | 
|  |    170 | fun scan a = scanning ([] , a, 1);
 | 
|  |    171 | 
 | 
|  |    172 | end;
 |