0
|
1 |
(* Title: Pure/Thy/scan
|
|
2 |
ID: $Id$
|
|
3 |
Author: Sonia Mahjoub / Tobias Nipkow
|
|
4 |
Copyright 1992 TU Muenchen
|
|
5 |
|
|
6 |
The scanner. Adapted from Larry Paulson's ML for the Working Programmer.
|
|
7 |
*)
|
|
8 |
|
|
9 |
signature LEXICAL =
|
|
10 |
sig
|
|
11 |
|
|
12 |
|
|
13 |
datatype token = Id of string
|
|
14 |
| Key of string
|
|
15 |
| Nat of string
|
|
16 |
| Stg of string
|
|
17 |
| Txt of string
|
|
18 |
|
|
19 |
val scan : string list -> (token * int) list
|
|
20 |
end;
|
|
21 |
|
|
22 |
signature KEYWORD =
|
|
23 |
sig
|
|
24 |
val alphas : string list
|
|
25 |
val symbols : string list
|
|
26 |
end;
|
|
27 |
|
|
28 |
|
|
29 |
functor LexicalFUN (Keyword: KEYWORD): LEXICAL =
|
|
30 |
|
|
31 |
struct
|
|
32 |
|
|
33 |
|
|
34 |
|
|
35 |
datatype token = Id of string
|
|
36 |
| Key of string
|
|
37 |
| Nat of string
|
|
38 |
| Stg of string
|
|
39 |
| Txt of string;
|
|
40 |
|
|
41 |
|
|
42 |
fun lexerr(n,s) =
|
|
43 |
error("Lexical error on line " ^ (string_of_int n) ^ ": " ^ s);
|
|
44 |
|
|
45 |
val specials = explode"!{}@#$%^&*()+-=[]:\";',./?`_~<>|\\";
|
|
46 |
|
|
47 |
fun is_symbol c = "_" = c orelse "'" = c;
|
|
48 |
|
|
49 |
fun alphanum (id, c::cs) =
|
|
50 |
if is_letter c orelse is_digit c orelse is_symbol c
|
|
51 |
then alphanum (id ^ c , cs)
|
|
52 |
else (id , c :: cs)
|
|
53 |
| alphanum (id ,[]) = (id ,[]);
|
|
54 |
|
|
55 |
fun numeric (nat, c::cs) =
|
|
56 |
if is_digit c
|
|
57 |
then numeric (nat^c, cs)
|
|
58 |
else (nat, c::cs)
|
|
59 |
| numeric (nat, []) = (nat,[]);
|
|
60 |
|
|
61 |
fun tokenof (a, n) =
|
|
62 |
if a mem Keyword.alphas
|
|
63 |
then (Key a, n)
|
|
64 |
else (Id a, n);
|
|
65 |
|
|
66 |
fun symbolic (sy, c::cs) =
|
|
67 |
if (sy mem Keyword.symbols) andalso
|
|
68 |
not((sy^c) mem Keyword.symbols)
|
|
69 |
orelse not (c mem specials)
|
|
70 |
then (sy, c::cs)
|
|
71 |
else symbolic(sy^c, cs)
|
|
72 |
| symbolic (sy, []) = (sy, []);
|
|
73 |
|
|
74 |
|
|
75 |
fun stringerr(n) = lexerr(n, "No matching quote found on this line");
|
|
76 |
|
|
77 |
fun is_control_chr ([],_,n) = stringerr(n)
|
|
78 |
| is_control_chr (c::cs,s,n) =
|
|
79 |
let val m = ord c
|
|
80 |
in if (m >= 64 andalso m <= 95)
|
|
81 |
then (cs, s^c, n)
|
|
82 |
else stringerr(n)
|
|
83 |
end;
|
|
84 |
|
|
85 |
fun is_3_dgt (c1::c2::cs, c,n) =
|
|
86 |
let val s = c^c1^c2
|
|
87 |
in if (s >= "000" andalso s <= "255")
|
|
88 |
then (cs, s)
|
|
89 |
else stringerr(n)
|
|
90 |
end
|
|
91 |
| is_3_dgt (_,_,n) = stringerr(n);
|
|
92 |
|
|
93 |
fun is_imprt_seq ([],_,n) = stringerr(n)
|
|
94 |
| is_imprt_seq ((c::cs),s,n) =
|
|
95 |
if c = "\\" then (cs,s^"\\",n)
|
|
96 |
else if c = "\n"
|
|
97 |
then is_imprt_seq (cs,s^"\n",n+1)
|
|
98 |
else if (c = "\t") orelse (c = " ")
|
|
99 |
then is_imprt_seq (cs,s^c,n)
|
|
100 |
else stringerr(n);
|
|
101 |
|
|
102 |
fun is_escape_seq ([],_,n) = stringerr(n)
|
|
103 |
| is_escape_seq ((c::cs),s,n) =
|
|
104 |
if c = "\\" then (cs,s^"\\",n)
|
|
105 |
else if c = "\n"
|
|
106 |
then is_imprt_seq (cs,s^"\n",n+1)
|
|
107 |
else if (c = "\t") orelse (c = " ")
|
|
108 |
then is_imprt_seq (cs,s^c,n)
|
|
109 |
else if c = "^"
|
|
110 |
then is_control_chr (cs,s^"^",n)
|
|
111 |
else if ("0" <= c andalso c <= "2")
|
|
112 |
then let val (cs',s') =
|
|
113 |
is_3_dgt(cs,c,n)
|
|
114 |
in (cs',s^s',n)
|
|
115 |
end
|
|
116 |
else stringerr(n);
|
|
117 |
|
|
118 |
|
|
119 |
fun string (_,[],_,n) = stringerr(n)
|
|
120 |
| string (toks, c::cs, s, n) =
|
|
121 |
if c = "\"" then ((Stg s, n)::toks , cs, n)
|
|
122 |
else if c = "\\"
|
|
123 |
then let val (cs',s',n') =
|
|
124 |
is_escape_seq (cs, s^"\\",n)
|
|
125 |
in string (toks,cs',s',n') end
|
|
126 |
else string (toks,cs,s^c,n);
|
|
127 |
|
|
128 |
|
|
129 |
fun comment ((c1::c2::cs), n) =
|
|
130 |
if c1 = "*" andalso c2 = ")" then (cs,n) else
|
|
131 |
if c1 = "\n" then comment((c2::cs), n+1)
|
|
132 |
else comment((c2::cs), n)
|
|
133 |
| comment (_, n) = lexerr(n, "Missing end of comment");
|
|
134 |
|
|
135 |
fun scanning (toks , [], n) = rev toks
|
|
136 |
| scanning (toks , c :: cs, n) =
|
|
137 |
if is_letter c
|
|
138 |
then let val (id , cs2) = alphanum (c , cs)
|
|
139 |
in if id = "ML"
|
|
140 |
then let val text = implode cs2
|
|
141 |
in scanning ((Txt text,n) :: toks , [], n)
|
|
142 |
end
|
|
143 |
else scanning (tokenof(id,n) :: toks , cs2, n)
|
|
144 |
end
|
|
145 |
else if is_digit c
|
|
146 |
then let val (nat , cs2) = numeric(c , cs)
|
|
147 |
in scanning ((Nat nat,n) :: toks , cs2, n) end
|
|
148 |
else if c mem specials
|
|
149 |
then if c = "\""
|
|
150 |
then let val (toks', cs', n') = string (toks, cs, "", n)
|
|
151 |
in scanning (toks', cs', n') end
|
|
152 |
else let val (sy , cs2) = symbolic (c , cs)
|
|
153 |
in if sy = "(*"
|
|
154 |
then let val (cs3,n2) = comment(cs2,n)
|
|
155 |
in scanning (toks , cs3, n2) end
|
|
156 |
else scanning ((Key sy,n) :: toks, cs2, n)
|
|
157 |
end
|
|
158 |
else if c = "\n" then scanning (toks, cs, n+1)
|
|
159 |
else if c = " " orelse c = "\t" then scanning (toks , cs, n)
|
|
160 |
else lexerr(n,"Illegal character " ^ c);
|
|
161 |
|
|
162 |
fun scan a = scanning ([] , a, 1);
|
|
163 |
|
|
164 |
end;
|