author | wenzelm |
Fri, 23 May 2008 21:18:47 +0200 | |
changeset 26977 | e736139b553d |
parent 26632 | 90c0b075c0d3 |
child 27732 | 8dbf5761a24a |
permissions | -rw-r--r-- |
6118 | 1 |
(* Title: Pure/General/symbol.ML |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
2 |
ID: $Id$ |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
3 |
Author: Markus Wenzel, TU Muenchen |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
4 |
|
21897 | 5 |
Generalized characters with infinitely many named symbols. |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
6 |
*) |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
7 |
|
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
8 |
(*Interface of structure Symbol: generalized characters represented as strings.*)
signature SYMBOL =
sig
  (*basic type and character constants*)
  type symbol
  val SOH: symbol
  val STX: symbol
  val ENQ: symbol
  val ACK: symbol
  val DEL: symbol
  val space: symbol
  val spaces: int -> string

  (*elementary classification*)
  val is_char: symbol -> bool
  val is_symbolic: symbol -> bool
  val is_printable: symbol -> bool
  val is_utf8_trailer: symbol -> bool

  (*input source control*)
  val eof: symbol
  val is_eof: symbol -> bool
  val stopper: symbol * (symbol -> bool)
  val sync: symbol
  val is_sync: symbol -> bool
  val malformed: symbol
  val end_malformed: symbol
  val separate_chars: string -> string
  val is_regular: symbol -> bool

  (*ascii symbols*)
  val is_ascii: symbol -> bool
  val is_ascii_letter: symbol -> bool
  val is_ascii_digit: symbol -> bool
  val is_ascii_hex: symbol -> bool
  val is_ascii_quasi: symbol -> bool
  val is_ascii_blank: symbol -> bool
  val is_ascii_lower: symbol -> bool
  val is_ascii_upper: symbol -> bool
  val to_ascii_lower: symbol -> symbol
  val to_ascii_upper: symbol -> symbol

  (*raw symbols and symbol variants*)
  val is_raw: symbol -> bool
  val decode_raw: symbol -> string
  val encode_raw: string -> string
  datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
  val decode: symbol -> sym

  (*standard symbol kinds*)
  datatype kind = Letter | Digit | Quasi | Blank | Other
  val kind: symbol -> kind
  val is_letter: symbol -> bool
  val is_digit: symbol -> bool
  val is_quasi: symbol -> bool
  val is_blank: symbol -> bool
  val is_quasi_letter: symbol -> bool
  val is_letdig: symbol -> bool
  val is_ident: symbol list -> bool

  (*symbol input*)
  val beginning: int -> symbol list -> string
  val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a
  val scan_id: string list -> string * string list
  val source: bool -> (string, 'a) Source.source ->
    (symbol, (string, 'a) Source.source) Source.source
  val explode: string -> symbol list

  (*output and string operations*)
  val escape: string -> string
  val strip_blanks: string -> string
  val bump_init: string -> string
  val bump_string: string -> string
  val length: symbol list -> int
  val xsymbolsN: string
end;
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
68 |
|
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
69 |
structure Symbol: SYMBOL = |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
70 |
struct |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
71 |
|
14678 | 72 |
(** type symbol **) |
6272 | 73 |
|
14678 | 74 |
(*Symbols, which are considered the smallest entities of any Isabelle |
6272 | 75 |
string, may be of the following form: |
14678 | 76 |
|
14834 | 77 |
(1) ASCII symbols: a |
17823 | 78 |
(2) regular symbols: \<ident> |
14834 | 79 |
(3) control symbols: \<^ident> |
80 |
(4) raw control symbols: \<^raw:...>, where "..." may be any printable |
|
20205
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
wenzelm
parents:
20200
diff
changeset
|
81 |
character (excluding ".", ">"), or \<^raw000> |
6272 | 82 |
|
14678 | 83 |
Output is subject to the print_mode variable (default: verbatim), |
84 |
actual interpretation in display is up to front-end tools. |
|
6272 | 85 |
*) |
86 |
||
87 |
(*symbols are represented as plain strings*)
type symbol = string;

(*single-character control symbols, written as literals (codes 1, 2, 5, 6, 127)*)
val SOH = "\^A";
val STX = "\^B";
val ENQ = "\^E";
val ACK = "\^F";
val DEL = "\127";

(*the blank character (code 32)*)
val space = " ";
|
17063 | 96 |
|
97 |
local
  (*precomputed strings of 0 .. 64 blanks, indexed by their length*)
  val blanks = Vector.tabulate (65, fn n => Library.replicate_string n space);
  fun blank n = Vector.sub (blanks, n);
in
  (*string of k blanks: short results come straight from the table, longer ones
    are assembled from copies of the 64-blank entry plus a remainder*)
  fun spaces k =
    if k >= 64 then Library.replicate_string (k div 64) (blank 64) ^ blank (k mod 64)
    else blank k;
end;
|
14678 | 105 |
|
106 |
(*a symbol is a plain character iff its string has exactly one element*)
fun is_char s = (case size s of 1 => true | _ => false);
|
107 |
||
108 |
(*regular symbols start with "\<", but not with the control prefix "\<^"*)
fun is_symbolic s =
  if String.isPrefix "\\<^" s then false
  else String.isPrefix "\\<" s;
|
110 |
||
111 |
(*single characters are printable within the range space .. "~";
  any other symbol is printable unless it starts with the control prefix "\<^"*)
fun is_printable s =
  if not (is_char s) then not (String.isPrefix "\\<^" s)
  else
    let val c = ord s
    in ord space <= c andalso c <= ord "~" end;
|
6272 | 114 |
|
26632 | 115 |
(*single characters with code in the range 128 .. 191*)
fun is_utf8_trailer s =
  is_char s andalso
    let val c = ord s in c >= 128 andalso c < 192 end;
116 |
||
6272 | 117 |
|
14678 | 118 |
(* input source control *) |
6272 | 119 |
|
14678 | 120 |
(*end of input is denoted by the empty string*)
val eof = "";
fun is_eof s = (s = eof);
fun not_eof s = not (is_eof s);

(*stopper pair: the end marker together with its recognizer*)
val stopper = (eof, is_eof);
|
124 |
||
14678 | 125 |
(*synchronization marker*)
val sync = "\\<^sync>";
fun is_sync s = (sync = s);

(*markers placed around a malformed region of input*)
val malformed = "[[";
val end_malformed = "]]";
25641 | 130 |
|
131 |
(*explode a string and rejoin its elements with a single blank in between*)
fun separate_chars str = space_implode space (explode str);

(*error text for a malformed symbol, with its characters spaced out and quoted*)
fun malformed_msg s =
  let val shown = quote (separate_chars s)
  in "Malformed symbolic character: " ^ shown end;
|
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
133 |
|
23784
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
wenzelm
parents:
23728
diff
changeset
|
134 |
(*regular symbols are all those that are not one of the special markers:
  eof, sync, malformed, end_malformed*)
fun is_regular s =
  not (is_eof s orelse is_sync s orelse s = malformed orelse s = end_malformed);
14678 | 136 |
|
137 |
||
138 |
(* ascii symbols *) |
|
139 |
||
140 |
(*single characters with code below 128*)
fun is_ascii s = if is_char s then ord s < 128 else false;
|
141 |
||
142 |
(*single characters in "A" .. "Z" or "a" .. "z"*)
fun is_ascii_letter s =
  is_char s andalso
    let
      val c = ord s;
      fun within (lo, hi) = ord lo <= c andalso c <= ord hi;
    in within ("A", "Z") orelse within ("a", "z") end;
|
146 |
||
147 |
(*single characters in "0" .. "9"*)
fun is_ascii_digit s =
  if is_char s then
    let val c = ord s in ord "0" <= c andalso c <= ord "9" end
  else false;
|
149 |
||
24580
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
wenzelm
parents:
23784
diff
changeset
|
150 |
(*single characters that are hexadecimal digits: "0" .. "9", "A" .. "F", "a" .. "f"*)
fun is_ascii_hex s =
  is_char s andalso
    let
      val c = ord s;
      fun within (lo, hi) = ord lo <= c andalso c <= ord hi;
    in within ("0", "9") orelse within ("A", "F") orelse within ("a", "f") end;
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
wenzelm
parents:
23784
diff
changeset
|
155 |
|
14678 | 156 |
(*quasi letters: underscore and prime*)
fun is_ascii_quasi s = (s = "_" orelse s = "'");
|
159 |
||
160 |
(*blank characters: space, tab, newline, and the control characters ^K, ^L, ^M*)
fun is_ascii_blank s =
  List.exists (fn b => b = s) [" ", "\t", "\n", "\^K", "\^L", "\^M"];
163 |
||
20200 | 164 |
(*single characters in "a" .. "z"*)
fun is_ascii_lower s =
  if is_char s then
    let val c = ord s in ord "a" <= c andalso c <= ord "z" end
  else false;

(*single characters in "A" .. "Z"*)
fun is_ascii_upper s =
  if is_char s then
    let val c = ord s in ord "A" <= c andalso c <= ord "Z" end
  else false;
|
166 |
||
167 |
local
  (*distance between the codes of corresponding lower and upper case letters*)
  val case_offset = ord "a" - ord "A";
in
  (*map "A" .. "Z" to lower case; any other symbol is returned unchanged*)
  fun to_ascii_lower s = if is_ascii_upper s then chr (ord s + case_offset) else s;

  (*map "a" .. "z" to upper case; any other symbol is returned unchanged*)
  fun to_ascii_upper s = if is_ascii_lower s then chr (ord s - case_offset) else s;
end;
|
169 |
||
14678 | 170 |
|
14956
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
171 |
(* encode_raw *) |
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
172 |
|
20205
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
wenzelm
parents:
20200
diff
changeset
|
173 |
(*characters admitted literally inside \<^raw:...>: codes from 128 upwards,
  or the range space .. "~" without "." and ">"*)
fun raw_chr c =
  let val i = ord c in
    i >= 128 orelse
      (ord space <= i andalso i <= ord "~" andalso c <> "." andalso c <> ">")
  end;
14956
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
176 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
177 |
(*Encode an arbitrary string as a sequence of raw symbols: maximal runs of
  admissible characters (see raw_chr) become \<^raw:...>, every other
  character becomes \<^rawNNN> with NNN its decimal character code.*)
fun encode_raw str =
  let
    fun wrap_text txt = enclose "\\<^raw:" ">" txt;
    fun wrap_code c = enclose "\\<^raw" ">" (string_of_int (ord c));

    (*split off the longest admissible prefix, then continue after the
      first inadmissible character*)
    fun encode cs =
      (case Library.take_prefix raw_chr cs of
        ([], []) => []
      | (good, []) => [wrap_text (implode good)]
      | ([], bad :: rest) => wrap_code bad :: encode rest
      | (good, bad :: rest) => wrap_text (implode good) :: wrap_code bad :: encode rest);
  in
    (*a string without inadmissible characters is wrapped as a whole, even if empty*)
    if exists_string (not o raw_chr) str then implode (encode (explode str))
    else wrap_text str
  end;
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
192 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
193 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
194 |
(* diagnostics *) |
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
195 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
196 |
(*Printable beginning of a symbol list for diagnostics: trailing blanks are
  dropped, at most n symbols are kept (again without trailing blanks), the
  remaining blanks are shown as plain spaces, and " ..." is appended if the
  trimmed input had more than n symbols.*)
fun beginning n cs =
  let
    fun trim_right xs = #1 (Library.take_suffix is_ascii_blank xs);
    val trimmed = trim_right cs;
    val shown =
      trim_right (Library.take (n, trimmed))
      |> map (fn c => if is_ascii_blank c then space else c)
      |> implode;
  in
    if length trimmed > n then shown ^ " ..." else shown
  end;
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
206 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
207 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
208 |
(* decode_raw *) |
14834 | 209 |
|
210 |
(*raw symbols start with "\<^raw" and end with ">"*)
fun is_raw s =
  if String.isPrefix "\\<^raw" s then String.isSuffix ">" s
  else false;
14834 | 212 |
|
213 |
(*Extract the payload of a raw symbol: the text between "\<^raw:" and ">",
  or the character whose decimal code stands between "\<^raw" and ">".
  Anything that is not a raw symbol is reported as malformed.*)
fun decode_raw s =
  if is_raw s then
    if String.isPrefix "\\<^raw:" s then String.substring (s, 7, size s - 8)
    else
      let val digits = explode (String.substring (s, 6, size s - 7))
      in chr (#1 (Library.read_int digits)) end
  else error (malformed_msg s);
|
217 |
||
218 |
||
14873 | 219 |
(* symbol variants *) |
220 |
||
221 |
(*the four syntactic variants of symbols*)
datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;

(*Classify a symbol by its syntactic form: single characters, raw symbols,
  control symbols \<^ident>, regular symbols \<ident>; the result carries
  the name without the surrounding delimiters.  Anything else is reported as
  malformed.*)
fun decode s =
  let
    (*drop a prefix of k characters and the final ">"*)
    fun inner k = String.substring (s, k, size s - (k + 1));
  in
    if is_char s then Char s
    else if is_raw s then Raw (decode_raw s)
    else if String.isPrefix "\\<^" s then Ctrl (inner 3)
    else if String.isPrefix "\\<" s then Sym (inner 2)
    else error (malformed_msg s)
  end;
14873 | 229 |
|
230 |
||
14678 | 231 |
(* standard symbol kinds *) |
232 |
||
233 |
datatype kind = Letter | Digit | Quasi | Blank | Other;

local
  fun bracket name = "\\<" ^ name ^ ">";

  val upper = explode "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
  val lower = explode "abcdefghijklmnopqrstuvwxyz";
  val single = upper @ lower;                  (*\<A> .. \<Z>, \<a> .. \<z>*)
  val double = map (fn c => c ^ c) single;     (*\<AA> .. \<ZZ>, \<aa> .. \<zz>*)

  (*greek letter names; "lambda" is deliberately absent here (sic!)*)
  val greek =
   ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta", "iota",
    "kappa", "mu", "nu", "xi", "pi", "rho", "sigma", "tau", "upsilon", "phi", "chi",
    "psi", "omega", "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma",
    "Upsilon", "Phi", "Psi", "Omega"];

  (*the control symbols \<^isub> and \<^isup> count as letters, too*)
  val letters = map bracket (single @ double @ greek @ ["^isub", "^isup"]);

  val symbol_kinds = Symtab.make
    (map (fn s => (s, Letter)) letters @
      [(bracket "lambda", Other),
       (bracket "spacespace", Blank)]);
in
  (*Kind of a symbol: single characters are classified by the ASCII
    predicates, all other symbols by table lookup, with Other as default.*)
  fun kind s =
    if is_char s then
      if is_ascii_letter s then Letter
      else if is_ascii_digit s then Digit
      else if is_ascii_quasi s then Quasi
      else if is_ascii_blank s then Blank
      else Other
    else the_default Other (Symtab.lookup symbol_kinds s);
end;
14173 | 387 |
|
14678 | 388 |
(*tests for a single symbol kind*)
fun is_letter s = (case kind s of Letter => true | _ => false);
fun is_digit s = (case kind s of Digit => true | _ => false);
fun is_quasi s = (case kind s of Quasi => true | _ => false);
fun is_blank s = (case kind s of Blank => true | _ => false);
|
6272 | 392 |
|
14678 | 393 |
(*letters or quasi letters*)
fun is_quasi_letter s =
  (case kind s of
    Letter => true
  | Quasi => true
  | _ => false);

(*letters, digits, or quasi letters*)
fun is_letdig s =
  (case kind s of
    Letter => true
  | Digit => true
  | Quasi => true
  | Blank => false
  | Other => false);
|
11010 | 395 |
|
16138 | 396 |
(*identifiers: a letter followed by letters, digits, or quasi letters;
  the empty list is not an identifier*)
fun is_ident cs =
  (case cs of
    [] => false
  | first :: rest => is_letter first andalso forall is_letdig rest);
|
398 |
||
6272 | 399 |
|
400 |
||
14678 | 401 |
(** symbol input **) |
402 |
||
403 |
(* scanning through symbols *) |
|
6272 | 404 |
|
6640 | 405 |
(*Run scan over the symbols chs, which must be consumed completely.  Error
  messages start with msg, optionally followed by a more specific message,
  and end with the quoted beginning (10 symbols) of the offending input.*)
fun scanner msg scan chs =
  let
    fun message (cs, opt_msg) =
      let
        val prefix =
          (case opt_msg of
            NONE => msg
          | SOME msg' => msg ^ ", " ^ msg');
      in prefix ^ ": " ^ quote (beginning 10 cs) end;

    val fin_scan = Scan.error (Scan.finite stopper (!! message scan));
    val (result, rest) = fin_scan chs;
  in
    (*left-over input counts as an error*)
    if null rest then result else error (message (rest, NONE))
  end;
415 |
||
21858
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
wenzelm
parents:
21495
diff
changeset
|
416 |
(*scan an identifier: one letter followed by any number of letters, digits,
  or quasi letters, concatenated into a single string*)
val scan_id =
  Scan.one is_letter -- Scan.many is_letdig >> (fn (c, cs) => implode (c :: cs));
14678 | 417 |
|
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
418 |
|
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
419 |
(* source *) |
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
420 |
|
14678 | 421 |
local

(*characters that stand for themselves: anything but CR, backslash, eof*)
fun is_plain s = not (s = "\^M" orelse s = "\\" orelse is_eof s);

fun newline _ = "\n";

(*CR LF, a lone CR, and the symbol \<^newline> (with one or two leading
  backslashes) are all read as a plain newline*)
val scan_encoded_newline =
  $$ "\^M" -- $$ "\n" >> newline ||
  $$ "\^M" >> newline ||
  $$ "\\" -- Scan.optional ($$ "\\") "" -- Scan.this_string "<^newline>" >> newline;

(*body of a raw symbol: "raw:" with admissible text, or "raw" with digits*)
val scan_raw =
  Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) ||
  Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode);

(*name part of a symbol: control name, raw body, or plain identifier*)
val scan_name = $$ "^" ^^ (scan_raw || scan_id) || scan_id;

fun malformed_at (cs, _) = malformed_msg (beginning 10 ("\\" :: "<" :: cs));

(*one symbol; the order of alternatives matters: after "\<" (with an optional
  second backslash that is dropped) the scanner is committed to a well-formed
  symbol, and a lone backslash is accepted only by the last alternative*)
val scan =
  Scan.one is_plain ||
  scan_encoded_newline ||
  (($$ "\\" --| Scan.optional ($$ "\\") "") ^^ $$ "<" ^^
    !! malformed_at (scan_name ^^ $$ ">")) ||
  Scan.one not_eof;

(*error recovery: skip up to the next blank, quote character, or eof, and
  enclose the skipped symbols in the malformed markers*)
val recover =
  Scan.many (fn s => not (is_blank s orelse s = "\"" orelse s = "`" orelse is_eof s))
  >> (fn skipped => malformed :: skipped @ [end_malformed]);

in

(*turn a source of characters into a source of symbols; recovery from
  malformed input is installed only if do_recover is set*)
fun source do_recover src =
  Source.source stopper (Scan.bulk scan)
    (if do_recover then SOME (false, K recover) else NONE)
    src;

end;
453 |
||
14562
980da32f4617
proper handling of lines terminated by CRLF or CR;
wenzelm
parents:
14561
diff
changeset
|
454 |
|
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
455 |
(* explode *) |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
456 |
|
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
457 |
local

(*true iff the character list contains neither the sequence "\<" nor a CR,
  i.e. plain character-wise explode already yields the symbols*)
fun no_explode cs =
  (case cs of
    [] => true
  | "\\" :: "<" :: _ => false
  | "\^M" :: _ => false
  | _ :: rest => no_explode rest);

in

(*explode a string into symbols, running the symbol scanner (without
  recovery) only when the cheap check above does not apply*)
fun sym_explode str =
  let
    val chs = explode str;
    fun scan_symbols () = Source.exhaust (source false (Source.of_list chs));
  in
    if no_explode chs then chs else scan_symbols ()
  end;

end;
14994 | 473 |
|
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
474 |
|
14977
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
475 |
(* escape *) |
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
476 |
|
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
477 |
(*explode into symbols and prefix every symbol that is not is_char with an
  extra backslash, then glue the pieces together again*)
val escape =
  let fun esc sym = if is_char sym then sym else "\\" ^ sym
  in implode o map esc o sym_explode end;
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
478 |
|
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
479 |
|
14678 | 480 |
(* blanks *) |
481 |
||
482 |
(*remove leading and trailing symbols that satisfy is_blank; symbols in the
  interior of the string are left untouched*)
fun strip_blanks s =
  let
    val (_, without_prefix) = Library.take_prefix is_blank (sym_explode s);
    val (core, _) = Library.take_suffix is_blank without_prefix;
  in implode core end;
|
487 |
||
488 |
||
489 |
(* bump string -- treat as base 26 or base 1 numbers *) |
|
490 |
||
15979 | 491 |
(*test the head of a symbol list; both callers below pass a reversed list,
  so the head is the last symbol of the original text.  True if the second
  element is an isub/isup control symbol, or if the head is is_symbolic;
  false for the empty list*)
fun symbolic_end syms =
  (case syms of
    _ :: "\\<^isub>" :: _ => true
  | _ :: "\\<^isup>" :: _ => true
  | last :: _ => is_symbolic last
  | [] => false);
|
14678 | 495 |
|
496 |
(*initial bump: append a prime if the string ends symbolically (see
  symbolic_end), otherwise append the letter "a"*)
fun bump_init str =
  let val suffix = if symbolic_end (rev (sym_explode str)) then "'" else "a"
  in str ^ suffix end;
12904 | 499 |
|
500 |
(*produce the successor of a string: trailing symbols satisfying is_quasi
  are split off and re-attached unchanged at the end; the remaining part
  either gets a prime appended (symbolic ending) or is incremented like a
  number over the letters a..z*)
fun bump_string str =
  let
    (*single characters in the range a..y, i.e. letters that have a successor*)
    fun below_z s = is_char s andalso ord "a" <= ord s andalso ord s < ord "z";

    (*operates on the reversed symbol list: "z" wraps around to "a" and
      carries on to the next position; any other non-incrementable symbol
      is kept and a fresh "a" is put in front of it*)
    fun bump syms =
      (case syms of
        [] => ["a"]
      | "z" :: rest => "a" :: bump rest
      | s :: rest =>
          if below_z s then chr (ord s + 1) :: rest
          else "a" :: s :: rest);

    val (front, quasi) = Library.take_suffix is_quasi (sym_explode str);
    val rev_front = rev front;
    val bumped = if symbolic_end rev_front then "'" :: rev_front else bump rev_front;
  in implode (rev bumped @ quasi) end;
512 |
||
12904 | 513 |
|
6272 | 514 |
|
23618 | 515 |
(** xsymbols **) |
14977
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
516 |
|
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
517 |
(*the literal name "xsymbols"; NOTE(review): presumably a print mode
  identifier used by clients of this structure -- confirm against callers*)
val xsymbolsN = "xsymbols"; |
6272 | 518 |
|
14678 | 519 |
(*length contribution of a single symbol: 0 if it is not is_printable,
  2 for symbols starting with "\\<long" or "\\<Long" and for
  "\\<spacespace>" (written here as in the ML string literals),
  1 for everything else*)
fun sym_len s =
  let
    fun has_prefix prfx = String.isPrefix prfx s;
    (*delayed, so that is_printable is still the first test performed*)
    fun is_wide () =
      has_prefix "\\<long" orelse has_prefix "\\<Long" orelse s = "\\<spacespace>";
  in
    if is_printable s then (if is_wide () then 2 else 1)
    else (0: int)
  end;
|
525 |
||
19473 | 526 |
(*total of sym_len over all symbols of the list; 0 for the empty list*)
fun sym_length ss =
  let fun add sym total = sym_len sym + total
  in fold add ss 0 end;
14678 | 527 |
|
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
528 |
(*final declarations of this structure!*) |
6272 | 529 |
(*short name for sym_length; from this point on, length inside this
  structure refers to the symbol-aware version defined above*)
val length = sym_length; |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
530 |
(*short name for sym_explode; this rebinding has to stay at the very end,
  since sym_explode itself calls the previously visible explode and any
  code placed after this line would get the symbol-aware version instead*)
val explode = sym_explode; |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
531 |
|
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
532 |
end; |