author | wenzelm |
Wed, 27 Feb 2013 19:39:16 +0100 | |
changeset 51297 | d9f3d91208af |
parent 50242 | 56b9c792a98b |
child 52616 | 3ac2878764f9 |
permissions | -rw-r--r-- |
6118 | 1 |
(* Title: Pure/General/symbol.ML |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
2 |
Author: Markus Wenzel, TU Muenchen |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
3 |
|
21897 | 4 |
Generalized characters with infinitely many named symbols. |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
5 |
*) |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
6 |
|
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
7 |
signature SYMBOL =
sig
  (*symbols are represented as plain strings*)
  type symbol = string

  (*distinguished characters*)
  val STX: symbol
  val DEL: symbol
  val space: symbol

  (*basic classification*)
  val is_char: symbol -> bool
  val is_utf8: symbol -> bool
  val is_symbolic: symbol -> bool
  val is_printable: symbol -> bool

  (*input source control*)
  val eof: symbol
  val is_eof: symbol -> bool
  val not_eof: symbol -> bool
  val stopper: symbol Scan.stopper
  val sync: symbol
  val is_sync: symbol -> bool
  val is_regular: symbol -> bool
  val is_malformed: symbol -> bool
  val malformed_msg: symbol -> string

  (*ASCII symbols*)
  val is_ascii: symbol -> bool
  val is_ascii_letter: symbol -> bool
  val is_ascii_digit: symbol -> bool
  val is_ascii_hex: symbol -> bool
  val is_ascii_quasi: symbol -> bool
  val is_ascii_blank: symbol -> bool
  val is_ascii_control: symbol -> bool
  val is_ascii_letdig: symbol -> bool
  val is_ascii_lower: symbol -> bool
  val is_ascii_upper: symbol -> bool
  val to_ascii_lower: symbol -> symbol
  val to_ascii_upper: symbol -> symbol
  val is_ascii_identifier: string -> bool
  val scan_ascii_id: string list -> string * string list

  (*raw symbols*)
  val is_raw: symbol -> bool
  val decode_raw: symbol -> string
  val encode_raw: string -> string

  (*symbol variants*)
  datatype sym =
    Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |
    Malformed of string | EOF
  val decode: symbol -> sym

  (*standard symbol kinds*)
  datatype kind = Letter | Digit | Quasi | Blank | Other
  val kind: symbol -> kind
  val is_letter_symbol: symbol -> bool
  val is_letter: symbol -> bool
  val is_digit: symbol -> bool
  val is_quasi: symbol -> bool
  val is_blank: symbol -> bool
  val is_block_ctrl: symbol -> bool
  val is_quasi_letter: symbol -> bool
  val is_letdig: symbol -> bool

  (*diagnostics*)
  val beginning: int -> symbol list -> string

  (*symbol input*)
  val source: (string, 'a) Source.source -> (symbol, (string, 'a) Source.source) Source.source
  val explode: string -> symbol list
  val esc: symbol -> string
  val escape: string -> string
  val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a

  (*words and strings of symbols*)
  val split_words: symbol list -> string list
  val explode_words: string -> string list
  val strip_blanks: string -> string
  val bump_init: string -> string
  val bump_string: string -> string
  val length: symbol list -> int

  (*symbol output*)
  val xsymbolsN: string
  val output: string -> Output.output * int
end;
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
72 |
|
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
73 |
structure Symbol: SYMBOL = |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
74 |
struct |
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
75 |
|
14678 | 76 |
(** type symbol **) |
6272 | 77 |
|
14678 | 78 |
(*Symbols, which are considered the smallest entities of any Isabelle |
6272 | 79 |
string, may be of the following form: |
14678 | 80 |
|
14834 | 81 |
(1) ASCII symbols: a |
17823 | 82 |
(2) regular symbols: \<ident> |
14834 | 83 |
(3) control symbols: \<^ident> |
84 |
(4) raw control symbols: \<^raw:...>, where "..." may be any printable |
|
20205
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
wenzelm
parents:
20200
diff
changeset
|
85 |
character (excluding ".", ">"), or \<^raw000> |
6272 | 86 |
|
14678 | 87 |
Output is subject to the print_mode variable (default: verbatim), |
88 |
actual interpretation in display is up to front-end tools. |
|
6272 | 89 |
*) |
90 |
||
91 |
type symbol = string;

(*distinguished characters, given by their decimal character codes*)
val (STX, DEL) = (chr 2, chr 127);

val space = chr 32;
|
17063 | 97 |
|
14678 | 98 |
(*a symbol is a plain character iff it consists of exactly one byte*)
val is_char = fn s => String.size s = 1;
99 |
||
37533
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
wenzelm
parents:
34095
diff
changeset
|
100 |
(*non-empty string all of whose bytes have code >= 128*)
fun is_utf8 "" = false
  | is_utf8 s = forall_string (fn byte => 128 <= ord byte) s;
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
wenzelm
parents:
34095
diff
changeset
|
101 |
|
14678 | 102 |
(*symbols of the form \<...>, excluding the control form \<^...>*)
fun is_symbolic s =
  if String.isPrefix "\\<^" s then false
  else String.isPrefix "\\<" s andalso String.isSuffix ">" s;
14678 | 104 |
|
105 |
(*single characters are printable within the range from space up to "~";
  anything else has to be a UTF8 sequence or a symbolic \<...> symbol*)
fun is_printable s =
  if not (is_char s) then is_utf8 s orelse is_symbolic s
  else
    let val code = ord s
    in ord space <= code andalso code <= ord "~" end;
26632 | 108 |
|
6272 | 109 |
|
14678 | 110 |
(* input source control *) |
6272 | 111 |
|
14678 | 112 |
(*end of input is marked by the empty symbol*)
val eof = "";
fun is_eof s = (s = eof);
fun not_eof s = not (is_eof s);
val stopper = Scan.stopper (fn _ => eof) is_eof;

(*dedicated control symbol for synchronization*)
val sync = "\\<^sync>";
fun is_sync s = (s = sync);

(*regular symbols: neither eof nor sync*)
fun is_regular s = not (is_eof s orelse is_sync s);
25641 | 121 |
|
48774 | 122 |
(*malformed symbols: the empty forms \<> and \<^>, or anything that starts
  with \< without being terminated by >*)
fun is_malformed "\\<>" = true
  | is_malformed "\\<^>" = true
  | is_malformed s = String.isPrefix "\\<" s andalso not (String.isSuffix ">" s);
|
125 |
||
40523
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
wenzelm
parents:
40509
diff
changeset
|
126 |
(*error message text for a malformed symbol, with the symbol quoted*)
fun malformed_msg s = String.concat ["Malformed symbolic character: ", quote s];
14678 | 127 |
|
128 |
||
43418 | 129 |
(* ASCII symbols *) |
14678 | 130 |
|
131 |
(*single character with code below 128*)
fun is_ascii s = if is_char s then ord s < 128 else false;
|
132 |
||
133 |
(*single character within A-Z or a-z*)
fun is_ascii_letter s =
  is_char s andalso
    let
      val code = ord s;
      val upper = ord "A" <= code andalso code <= ord "Z";
      val lower = ord "a" <= code andalso code <= ord "z";
    in upper orelse lower end;
|
137 |
||
138 |
(*single character within 0-9*)
fun is_ascii_digit s =
  if is_char s then
    let val code = ord s
    in ord "0" <= code andalso code <= ord "9" end
  else false;
|
140 |
||
24580
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
wenzelm
parents:
23784
diff
changeset
|
141 |
(*single character within 0-9, A-F or a-f*)
fun is_ascii_hex s =
  is_char s andalso
    let
      val code = ord s;
      fun within (lo, hi) = ord lo <= code andalso code <= ord hi;
    in within ("0", "9") orelse within ("A", "F") orelse within ("a", "f") end;
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
wenzelm
parents:
23784
diff
changeset
|
146 |
|
14678 | 147 |
(*quasi letters: underscore and prime*)
fun is_ascii_quasi s = (s = "_" orelse s = "'");
|
150 |
||
151 |
(*blank characters: space, tab, newline, ^K, form feed, ^M*)
fun is_ascii_blank s =
  List.exists (fn blank => blank = s) [" ", "\t", "\n", "\^K", "\f", "\^M"];
154 |
||
34095
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
33955
diff
changeset
|
155 |
(*single character with code below 32 that does not count as blank*)
fun is_ascii_control s =
  if is_ascii_blank s then false
  else is_char s andalso ord s < 32;
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
33955
diff
changeset
|
156 |
|
50236 | 157 |
(*ASCII letter, digit, or quasi letter*)
fun is_ascii_letdig s =
  List.exists (fn test => test s) [is_ascii_letter, is_ascii_digit, is_ascii_quasi];
158 |
||
20200 | 159 |
(*single character within a-z*)
fun is_ascii_lower s =
  if is_char s then ord "a" <= ord s andalso ord s <= ord "z" else false;

(*single character within A-Z*)
fun is_ascii_upper s =
  if is_char s then ord "A" <= ord s andalso ord s <= ord "Z" else false;
|
161 |
||
162 |
(*map A-Z to a-z; any other symbol is returned unchanged*)
fun to_ascii_lower s =
  if not (is_ascii_upper s) then s
  else chr (ord s + ord "a" - ord "A");

(*map a-z to A-Z; any other symbol is returned unchanged*)
fun to_ascii_upper s =
  if not (is_ascii_lower s) then s
  else chr (ord s + ord "A" - ord "a");
|
164 |
||
50238
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
wenzelm
parents:
50237
diff
changeset
|
165 |
(*non-empty string that starts with an ASCII letter and consists of
  ASCII letters, digits and quasi letters only*)
fun is_ascii_identifier "" = false
  | is_ascii_identifier s =
      is_ascii_letter (String.substring (s, 0, 1)) andalso
      forall_string is_ascii_letdig s;
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
wenzelm
parents:
50237
diff
changeset
|
168 |
|
50236 | 169 |
(*scan one ASCII letter followed by any number of letters, digits or
  quasi letters, concatenated into a single string*)
val scan_ascii_id =
  let
    val first = Scan.one is_ascii_letter;
    val rest = Scan.many is_ascii_letdig >> implode;
  in first ^^ rest end;
170 |
||
14678 | 171 |
|
14956
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
172 |
(* encode_raw *) |
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
173 |
|
20205
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
wenzelm
parents:
20200
diff
changeset
|
174 |
(*characters admitted literally within \<^raw:...>: printable ASCII
  except "." and ">", or any byte with code >= 128*)
fun raw_chr c =
  if not (is_char c) then false
  else
    let
      val code = ord c;
      val printable_ascii = ord space <= code andalso code <= ord "~";
      val reserved = (c = "." orelse c = ">");
    in (printable_ascii andalso not reserved) orelse code >= 128 end;
14956
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
178 |
|
29324 | 179 |
(*encode an arbitrary string as a sequence of raw symbols: runs of
  admissible characters become \<^raw:...>, every other character becomes
  \<^rawN> with N its decimal code; the empty string stays empty*)
fun encode_raw "" = ""
  | encode_raw str =
      let
        fun raw_text txt = "\\<^raw:" ^ txt ^ ">";
        fun raw_code c = "\\<^raw" ^ string_of_int (ord c) ^ ">";

        (*split off the longest admissible prefix, then the offending character*)
        fun chunks cs =
          (case take_prefix raw_chr cs of
            ([], []) => []
          | (good, []) => [raw_text (implode good)]
          | ([], bad :: rest) => raw_code bad :: chunks rest
          | (good, bad :: rest) => raw_text (implode good) :: raw_code bad :: chunks rest);
      in
        if forall_string raw_chr str then raw_text str
        else implode (chunks (raw_explode str))
      end;
|
14956
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
195 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
196 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
197 |
(* diagnostics *) |
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
198 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
199 |
(*initial segment of at most n symbols for use in messages: trailing
  blanks are dropped, remaining blanks are shown as plain space, and
  " ..." is appended if symbols were cut off*)
fun beginning n cs =
  let
    fun trim_right syms = #1 (take_suffix is_ascii_blank syms);
    fun plain_blank c = if is_ascii_blank c then space else c;

    val trimmed = trim_right cs;
    val text = implode (map plain_blank (trim_right (take n trimmed)));
  in
    if length trimmed > n then text ^ " ..." else text
  end;
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
209 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
210 |
|
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
wenzelm
parents:
14908
diff
changeset
|
211 |
(* decode_raw *) |
14834 | 212 |
|
213 |
(*raw symbols start with \<^raw and end with >*)
fun is_raw s =
  List.all (fn check => check s) [String.isPrefix "\\<^raw", String.isSuffix ">"];
14834 | 215 |
|
216 |
(*recover the content of a raw symbol: the text of \<^raw:...>, or the
  character whose code is given in \<^rawN>; anything that is not a raw
  symbol is rejected via error*)
(*NOTE(review): the numeric form relies on Library.read_int and chr; how
  missing digits or codes outside the range of chr are treated is not
  visible here -- confirm against their definitions*)
fun decode_raw s =
  let
    (*content after the first `skip` characters, without the final ">"*)
    fun content skip = String.substring (s, skip, size s - skip - 1);
  in
    if is_raw s then
      if String.isPrefix "\\<^raw:" s then content 7
      else chr (#1 (Library.read_int (raw_explode (content 6))))
    else error (malformed_msg s)
  end;
14834 | 220 |
|
221 |
||
14873 | 222 |
(* symbol variants *) |
223 |
||
37533
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
wenzelm
parents:
34095
diff
changeset
|
224 |
(*structured view of a symbol, as produced by decode*)
datatype sym =
    Char of string
  | UTF8 of string
  | Sym of string
  | Ctrl of string
  | Raw of string
  | Malformed of string
  | EOF;
14873 | 227 |
|
228 |
(*classify a symbol; for \<^name> and \<name> only the name is kept*)
fun decode "" = EOF
  | decode s =
      let
        (*text after the first `skip` characters, without the final ">"*)
        fun name_from skip = String.substring (s, skip, size s - skip - 1);
      in
        if is_char s then Char s
        else if is_utf8 s then UTF8 s
        else if is_raw s then Raw (decode_raw s)
        else if is_malformed s then Malformed s
        else if String.isPrefix "\\<^" s then Ctrl (name_from 3)
        else Sym (name_from 2)
      end;
14873 | 236 |
|
237 |
||
14678 | 238 |
(* standard symbol kinds *) |
239 |
||
14171
0cab06e3bbd0
Extended the notion of letter and digit, such that now one may use greek,
skalberg
parents:
13730
diff
changeset
|
240 |
local
  (*wrap a name as regular symbol \<name>*)
  val named = enclose "\\<" ">";

  (*base letters for \<A> ... \<z> and the doubled forms \<AA> ... \<zz>*)
  val ascii_letters =
    raw_explode "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz";

  val greek =
   ["alpha", "beta", "gamma", "delta", "epsilon", "zeta", "eta", "theta",
    "iota", "kappa", (*"lambda", sic!*) "mu", "nu", "xi", "pi", "rho",
    "sigma", "tau", "upsilon", "phi", "chi", "psi", "omega",
    "Gamma", "Delta", "Theta", "Lambda", "Xi", "Pi", "Sigma", "Upsilon",
    "Phi", "Psi", "Omega"];

  val letter_symbols =
    Symtab.make_set
     (map named ascii_letters @
      map (fn c => named (c ^ c)) ascii_letters @
      map named greek @
      ["\\<^isub>", "\\<^isup>"]);
in

(*membership in the fixed table of symbols that count as letters*)
val is_letter_symbol = Symtab.defined letter_symbols;

end;
14173 | 389 |
|
50242
56b9c792a98b
support for sub-structured identifier syntax (inactive);
wenzelm
parents:
50239
diff
changeset
|
390 |
(*coarse classification of symbols, as computed by kind*)
datatype kind =
    Letter
  | Digit
  | Quasi
  | Blank
  | Other;
56b9c792a98b
support for sub-structured identifier syntax (inactive);
wenzelm
parents:
50239
diff
changeset
|
391 |
|
56b9c792a98b
support for sub-structured identifier syntax (inactive);
wenzelm
parents:
50239
diff
changeset
|
392 |
(*classify a symbol: single characters by their ASCII class, longer
  symbols are Letter iff listed as letter symbol, everything else is Other*)
fun kind s =
  if is_char s then
    if is_ascii_letter s then Letter
    else if is_ascii_digit s then Digit
    else if is_ascii_quasi s then Quasi
    else if is_ascii_blank s then Blank
    else Other
  else if is_letter_symbol s then Letter
  else Other;
56b9c792a98b
support for sub-structured identifier syntax (inactive);
wenzelm
parents:
50239
diff
changeset
|
400 |
|
14678 | 401 |
(*tests for a particular kind of symbol*)
fun is_letter s = (case kind s of Letter => true | _ => false);
fun is_digit s = (case kind s of Digit => true | _ => false);
fun is_quasi s = (case kind s of Quasi => true | _ => false);
fun is_blank s = (case kind s of Blank => true | _ => false);
|
6272 | 405 |
|
47850
c638127b4653
avoid interference of markup for literal tokens, which may contain slightly odd \<^bsub> \<^esub> counted as pseudo-markup (especially relevant for HTML output, e.g. of thm power3_eq_cube);
wenzelm
parents:
43947
diff
changeset
|
406 |
(*the four block control symbols for sub/superscript begin and end*)
fun is_block_ctrl s =
  (case s of
    "\\<^bsub>" => true
  | "\\<^esub>" => true
  | "\\<^bsup>" => true
  | "\\<^esup>" => true
  | _ => false);
c638127b4653
avoid interference of markup for literal tokens, which may contain slightly odd \<^bsub> \<^esub> counted as pseudo-markup (especially relevant for HTML output, e.g. of thm power3_eq_cube);
wenzelm
parents:
43947
diff
changeset
|
407 |
|
14678 | 408 |
(*letter or quasi letter*)
fun is_quasi_letter s =
  (case kind s of
    Letter => true
  | Quasi => true
  | _ => false);

(*letter, digit, or quasi letter*)
fun is_letdig s =
  (case kind s of
    Letter => true
  | Digit => true
  | Quasi => true
  | _ => false);
|
11010 | 410 |
|
6272 | 411 |
|
412 |
||
14678 | 413 |
(** symbol input **) |
414 |
||
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
415 |
(* source *) |
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
416 |
|
14678 | 417 |
local

(*ASCII characters that stand for themselves: neither ^M nor backslash*)
fun plain_char s = is_ascii s andalso s <> "\^M" andalso s <> "\\";

(*bytes within the range 128..191*)
fun is_utf8_trailer s =
  is_char s andalso
    let val code = ord s
    in 128 <= code andalso code < 192 end;

(*a two-byte sequence led by "\192" whose second byte is below 160 is
  turned back into the single character with code reduced by 128;
  all other sequences are concatenated as they are*)
fun implode_pseudo_utf8 cs =
  (case cs of
    ["\192", c] => if ord c < 160 then chr (ord c - 128) else implode cs
  | _ => implode cs);

(*^M followed by newline, ^M alone, and \<^newline> all yield newline*)
val scan_encoded_newline =
  $$ "\^M" -- $$ "\n" >> K "\n" ||
  $$ "\^M" >> K "\n" ||
  Scan.this_string "\\<^newline>" >> K "\n";

(*name part of raw symbols: "raw:" with admissible text, or "raw" with digits*)
val scan_raw =
  Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) ||
  Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode);

(*leading byte >= 128 together with its trailer bytes*)
val scan_utf8 =
  Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8;

(*anything introduced by \<, with optional name and optional closing >*)
val scan_symbolic =
  $$ "\\" ^^ $$ "<" ^^
    (($$ "^" ^^ Scan.optional (scan_raw || scan_ascii_id) "" || Scan.optional scan_ascii_id "") ^^
      Scan.optional ($$ ">") "");

(*alternatives are tried in this order; the last one accepts any non-eof item*)
val scan_symbol =
  Scan.one plain_char ||
  scan_utf8 ||
  scan_encoded_newline ||
  scan_symbolic ||
  Scan.one not_eof;

in

(*turn a source of raw characters into a source of symbols*)
fun source src = Source.source stopper (Scan.bulk scan_symbol) NONE src;

end;
450 |
||
14562
980da32f4617
proper handling of lines terminated by CRLF or CR;
wenzelm
parents:
14561
diff
changeset
|
451 |
|
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
452 |
(* explode *) |
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
453 |
|
23676
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
wenzelm
parents:
23618
diff
changeset
|
454 |
local

(*scanning is required as soon as the characters contain "\<", a ^M,
  or anything that is not ASCII*)
fun needs_scan ("\\" :: "<" :: _) = true
  | needs_scan ("\^M" :: _) = true
  | needs_scan (c :: cs) = not (is_ascii c) orelse needs_scan cs
  | needs_scan [] = false;

in

(*split a string into symbols; the raw characters are returned directly
  when no scanning is required*)
fun sym_explode str =
  let val chs = raw_explode str in
    if needs_scan chs then Source.exhaust (source (Source.of_list chs))
    else chs
  end;

end;
14994 | 470 |
|
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
471 |
|
50237 | 472 |
(* escape *) |
473 |
||
474 |
(*escape a single symbol: an is_char symbol is kept as it is; an is_utf8
  symbol is translated character by character into backslash + decimal
  character code; any other symbol merely gets a backslash prefix*)
fun esc s =
  if is_char s then s
  else if not (is_utf8 s) then "\\" ^ s
  else translate_string (fn c => "\\" ^ string_of_int (ord c)) s;
|
478 |
||
479 |
(*escape a whole string: explode into symbols, apply esc to each, implode*)
fun escape str = implode (map esc (sym_explode str));
|
480 |
||
481 |
||
482 |
||
483 |
(** scanning through symbols **) |
|
484 |
||
485 |
(* scanner *) |
|
486 |
||
487 |
(*apply scan to the symbol list syms, delimited by stopper; the result is
  returned only if no input is left over, otherwise error is raised with msg
  and the beginning (at most 10 symbols) of the remaining input.
  The message function is also handed to !! -- presumably to report failures
  inside scan in the same format (see Scan.!! and Scan.error)*)
fun scanner msg scan syms =
  let
    (*quoted beginning of the offending input, shared by both message forms*)
    fun context ss = ": " ^ quote (beginning 10 ss);

    fun message (ss, opt_msg) () =
      (case opt_msg of
        NONE => msg ^ context ss
      | SOME msg' => msg ^ ", " ^ msg' () ^ context ss);

    val finite_scan = Scan.error (Scan.finite stopper (!! message scan));
  in
    (case finite_scan syms of
      (_, rest as _ :: _) => error (message (rest, NONE) ())
    | (result, []) => result)
  end;
|
497 |
||
498 |
||
50162 | 499 |
(* space-separated words *) |
500 |
||
501 |
(*scan one item of space-separated text: a non-empty run of ASCII blanks
  yields NONE, a non-empty run of non-blank symbols (excluding eof) yields
  SOME of the imploded word*)
val scan_word =
  let
    fun is_word_sym s = not (is_ascii_blank s) andalso not_eof s;
    val word = Scan.many1 is_word_sym >> (SOME o implode);
  in Scan.many1 is_ascii_blank >> K NONE || word end;
|
504 |
||
505 |
(*split a symbol list into its words: repeated scan_word, keeping only the
  SOME results; problems are reported by scanner under the text "Bad text"*)
fun split_words syms =
  scanner "Bad text" (Scan.repeat scan_word >> map_filter I) syms;
|
506 |
||
507 |
(*explode a string into symbols and split those into words*)
fun explode_words str = split_words (sym_explode str);
|
508 |
||
509 |
||
14678 | 510 |
(* blanks *) |
511 |
||
512 |
(*remove the is_blank symbols at both ends of a string: drop the blank prefix
  first, then the blank suffix of what remains, and implode the rest*)
fun strip_blanks s =
  let
    val (_, without_prefix) = take_prefix is_blank (sym_explode s);
    val (core, _) = take_suffix is_blank without_prefix;
  in implode core end;
517 |
||
518 |
||
519 |
(* bump string -- treat as base 26 or base 1 numbers *) |
|
520 |
||
50242
56b9c792a98b
support for sub-structured identifier syntax (inactive);
wenzelm
parents:
50239
diff
changeset
|
521 |
(*inspect the head of a symbol list (callers pass the reversed symbols of a
  name, so the head is the last symbol): true if the second element is one of
  the sub / isub / isup control symbols, or else if the first element
  is_symbolic; false for the empty list*)
fun symbolic_end syms =
  (case syms of
    [] => false
  | _ :: "\\<^sub>" :: _ => true
  | _ :: "\\<^isub>" :: _ => true
  | _ :: "\\<^isup>" :: _ => true
  | s :: _ => is_symbolic s);
|
14678 | 526 |
|
527 |
(*first variant of a name: append a prime if the name has a symbolic end
  (see symbolic_end), otherwise append "a"*)
fun bump_init str =
  str ^ (if symbolic_end (rev (sym_explode str)) then "'" else "a");
12904 | 530 |
|
531 |
(*next variant of a name: the trailing is_quasi symbols are split off and
  re-appended unchanged; the remaining body gets a prime if it has a symbolic
  end (see symbolic_end), otherwise its end is incremented by bump*)
fun bump_string str =
  let
    (*increment on reversed symbols: "z" wraps to "a" and carries on to the
      rest; a character from "a" up to "y" is replaced by its successor;
      anything else (or the empty list) gets a fresh "a" in front*)
    fun bump syms =
      (case syms of
        [] => ["a"]
      | "z" :: rest => "a" :: bump rest
      | s :: rest =>
          if is_char s andalso ord "a" <= ord s andalso ord s < ord "z"
          then chr (ord s + 1) :: rest
          else "a" :: s :: rest);

    val (body, quasi) = take_suffix is_quasi (sym_explode str);
    val rev_body = rev body;
    val rev_body' = if symbolic_end rev_body then "'" :: rev_body else bump rev_body;
  in implode (rev rev_body' @ quasi) end;
543 |
||
12904 | 544 |
|
6272 | 545 |
|
29324 | 546 |
(** symbol output **) |
14977
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
wenzelm
parents:
14961
diff
changeset
|
547 |
|
29324 | 548 |
(* length *) |
6272 | 549 |
|
14678 | 550 |
(*nominal length of a single symbol: 0 unless is_printable, 2 for symbols
  whose name starts with "long" or "Long", 1 for everything else*)
fun sym_len s : int =
  if is_printable s then
    if String.isPrefix "\\<long" s orelse String.isPrefix "\\<Long" s then 2 else 1
  else 0;
|
555 |
||
19473 | 556 |
(*total length of a symbol list: sum of sym_len over all its elements*)
fun sym_length ss =
  let
    fun count [] total = total
      | count (s :: rest) total = count rest (sym_len s + total);
  in count ss 0 end;
14678 | 557 |
|
29324 | 558 |
|
559 |
(* print mode *) |
|
560 |
||
561 |
(*name of the "xsymbols" print mode*)
val xsymbolsN = "xsymbols";
|
562 |
||
563 |
(*pair a string, unchanged, with the total length of its symbols*)
fun output s =
  let val len = sym_length (sym_explode s)
  in (s, len) end;
|
564 |
||
565 |
||
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
566 |
(*final declarations of this structure!*) |
29324 | 567 |
(*sym_explode under its final name explode*)
fun explode str = sym_explode str;
6272 | 568 |
(*sym_length under its final name length*)
fun length ss = sym_length ss;
6116
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
569 |
|
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
wenzelm
parents:
diff
changeset
|
570 |
end; |