| author | wenzelm | 
| Tue, 13 Aug 2013 17:26:22 +0200 | |
| changeset 53016 | fa9c38891cf2 | 
| parent 52920 | 4539e4a06339 | 
| child 53198 | 06b096cf89ca | 
| permissions | -rw-r--r-- | 
| 6118 | 1  | 
(* Title: Pure/General/symbol.ML  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
2  | 
Author: Markus Wenzel, TU Muenchen  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
3  | 
|
| 21897 | 4  | 
Generalized characters with infinitely many named symbols.  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
5  | 
*)  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
6  | 
|
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
7  | 
signature SYMBOL =  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
8  | 
sig  | 
| 40509 | 9  | 
type symbol = string  | 
| 26524 | 10  | 
val STX: symbol  | 
11  | 
val DEL: symbol  | 
|
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
12  | 
val space: symbol  | 
| 14678 | 13  | 
val is_char: symbol -> bool  | 
| 
37533
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
14  | 
val is_utf8: symbol -> bool  | 
| 14678 | 15  | 
val is_symbolic: symbol -> bool  | 
16  | 
val is_printable: symbol -> bool  | 
|
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
17  | 
val eof: symbol  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
18  | 
val is_eof: symbol -> bool  | 
| 27766 | 19  | 
val not_eof: symbol -> bool  | 
| 27732 | 20  | 
val stopper: symbol Scan.stopper  | 
| 14678 | 21  | 
val sync: symbol  | 
22  | 
val is_sync: symbol -> bool  | 
|
| 
23784
 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 
wenzelm 
parents: 
23728 
diff
changeset
 | 
23  | 
val is_regular: symbol -> bool  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
24  | 
val is_malformed: symbol -> bool  | 
| 
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
25  | 
val malformed_msg: symbol -> string  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
26  | 
val is_ascii: symbol -> bool  | 
| 14678 | 27  | 
val is_ascii_letter: symbol -> bool  | 
28  | 
val is_ascii_digit: symbol -> bool  | 
|
| 
24580
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
29  | 
val is_ascii_hex: symbol -> bool  | 
| 14678 | 30  | 
val is_ascii_quasi: symbol -> bool  | 
31  | 
val is_ascii_blank: symbol -> bool  | 
|
| 
34095
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
33955 
diff
changeset
 | 
32  | 
val is_ascii_control: symbol -> bool  | 
| 50236 | 33  | 
val is_ascii_letdig: symbol -> bool  | 
| 20200 | 34  | 
val is_ascii_lower: symbol -> bool  | 
35  | 
val is_ascii_upper: symbol -> bool  | 
|
36  | 
val to_ascii_lower: symbol -> symbol  | 
|
37  | 
val to_ascii_upper: symbol -> symbol  | 
|
| 
50238
 
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
 
wenzelm 
parents: 
50237 
diff
changeset
 | 
38  | 
val is_ascii_identifier: string -> bool  | 
| 50236 | 39  | 
val scan_ascii_id: string list -> string * string list  | 
| 14834 | 40  | 
val is_raw: symbol -> bool  | 
41  | 
val decode_raw: symbol -> string  | 
|
| 
14977
 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 
wenzelm 
parents: 
14961 
diff
changeset
 | 
42  | 
val encode_raw: string -> string  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
43  | 
datatype sym =  | 
| 
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
44  | 
Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |  | 
| 43485 | 45  | 
Malformed of string | EOF  | 
| 14873 | 46  | 
val decode: symbol -> sym  | 
| 14678 | 47  | 
datatype kind = Letter | Digit | Quasi | Blank | Other  | 
48  | 
val kind: symbol -> kind  | 
|
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
49  | 
val is_letter: symbol -> bool  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
50  | 
val is_digit: symbol -> bool  | 
| 12904 | 51  | 
val is_quasi: symbol -> bool  | 
| 14678 | 52  | 
val is_blank: symbol -> bool  | 
| 
47850
 
c638127b4653
avoid interference of markup for literal tokens, which may contain slightly odd \<^bsub> \<^esub> counted as pseudo-markup (especially relevant for HTML output, e.g. of thm power3_eq_cube);
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
53  | 
val is_block_ctrl: symbol -> bool  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
54  | 
val is_quasi_letter: symbol -> bool  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
55  | 
val is_letdig: symbol -> bool  | 
| 14728 | 56  | 
val beginning: int -> symbol list -> string  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
57  | 
val source: (string, 'a) Source.source -> (symbol, (string, 'a) Source.source) Source.source  | 
| 6272 | 58  | 
val explode: string -> symbol list  | 
| 50237 | 59  | 
val esc: symbol -> string  | 
60  | 
val escape: string -> string  | 
|
61  | 
val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a  | 
|
| 50162 | 62  | 
val split_words: symbol list -> string list  | 
63  | 
val explode_words: string -> string list  | 
|
| 14678 | 64  | 
val strip_blanks: string -> string  | 
65  | 
val bump_init: string -> string  | 
|
| 12904 | 66  | 
val bump_string: string -> string  | 
| 14678 | 67  | 
val length: symbol list -> int  | 
| 6692 | 68  | 
val xsymbolsN: string  | 
| 
40131
 
7cbebd636e79
explicitly qualify type Output.output, which is a slightly odd internal feature;
 
wenzelm 
parents: 
37728 
diff
changeset
 | 
69  | 
val output: string -> Output.output * int  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
70  | 
end;  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
71  | 
|
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
72  | 
structure Symbol: SYMBOL =  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
73  | 
struct  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
74  | 
|
| 14678 | 75  | 
(** type symbol **)  | 
| 6272 | 76  | 
|
| 14678 | 77  | 
(*Symbols, which are considered the smallest entities of any Isabelle  | 
| 6272 | 78  | 
string, may be of the following form:  | 
| 14678 | 79  | 
|
| 14834 | 80  | 
(1) ASCII symbols: a  | 
| 17823 | 81  | 
(2) regular symbols: \<ident>  | 
| 14834 | 82  | 
(3) control symbols: \<^ident>  | 
83  | 
(4) raw control symbols: \<^raw:...>, where "..." may be any printable  | 
|
| 
20205
 
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
 
wenzelm 
parents: 
20200 
diff
changeset
 | 
84  | 
character (excluding ".", ">"), or \<^raw000>  | 
| 6272 | 85  | 
|
| 14678 | 86  | 
Output is subject to the print_mode variable (default: verbatim),  | 
87  | 
actual interpretation in display is up to front-end tools.  | 
|
| 6272 | 88  | 
*)  | 
89  | 
||
90  | 
type symbol = string;  | 
|
91  | 
||
| 26524 | 92  | 
val STX = chr 2;  | 
93  | 
val DEL = chr 127;  | 
|
94  | 
||
95  | 
val space = chr 32;  | 
|
| 17063 | 96  | 
|
| 14678 | 97  | 
fun is_char s = size s = 1;  | 
98  | 
||
| 
37533
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
99  | 
fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;  | 
| 
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
100  | 
|
| 14678 | 101  | 
fun is_symbolic s =  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
102  | 
String.isPrefix "\\<" s andalso String.isSuffix ">" s andalso not (String.isPrefix "\\<^" s);  | 
| 14678 | 103  | 
|
104  | 
fun is_printable s =  | 
|
105  | 
if is_char s then ord space <= ord s andalso ord s <= ord "~"  | 
|
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
106  | 
else is_utf8 s orelse is_symbolic s;  | 
| 26632 | 107  | 
|
| 6272 | 108  | 
|
| 14678 | 109  | 
(* input source control *)  | 
| 6272 | 110  | 
|
| 14678 | 111  | 
val eof = "";  | 
| 6272 | 112  | 
fun is_eof s = s = eof;  | 
113  | 
fun not_eof s = s <> eof;  | 
|
| 27732 | 114  | 
val stopper = Scan.stopper (K eof) is_eof;  | 
| 6272 | 115  | 
|
| 14678 | 116  | 
val sync = "\\<^sync>";  | 
117  | 
fun is_sync s = s = sync;  | 
|
118  | 
||
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
119  | 
fun is_regular s = not_eof s andalso s <> sync;  | 
| 25641 | 120  | 
|
| 48774 | 121  | 
fun is_malformed s =  | 
122  | 
String.isPrefix "\\<" s andalso not (String.isSuffix ">" s)  | 
|
123  | 
orelse s = "\\<>" orelse s = "\\<^>";  | 
|
124  | 
||
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
125  | 
fun malformed_msg s = "Malformed symbolic character: " ^ quote s;  | 
| 14678 | 126  | 
|
127  | 
||
| 43418 | 128  | 
(* ASCII symbols *)  | 
| 14678 | 129  | 
|
130  | 
fun is_ascii s = is_char s andalso ord s < 128;  | 
|
131  | 
||
132  | 
fun is_ascii_letter s =  | 
|
133  | 
is_char s andalso  | 
|
134  | 
(ord "A" <= ord s andalso ord s <= ord "Z" orelse  | 
|
135  | 
ord "a" <= ord s andalso ord s <= ord "z");  | 
|
136  | 
||
137  | 
fun is_ascii_digit s =  | 
|
138  | 
is_char s andalso ord "0" <= ord s andalso ord s <= ord "9";  | 
|
139  | 
||
| 
24580
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
140  | 
fun is_ascii_hex s =  | 
| 
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
141  | 
is_char s andalso  | 
| 
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
142  | 
(ord "0" <= ord s andalso ord s <= ord "9" orelse  | 
| 
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
143  | 
ord "A" <= ord s andalso ord s <= ord "F" orelse  | 
| 
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
144  | 
ord "a" <= ord s andalso ord s <= ord "f");  | 
| 
 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 
wenzelm 
parents: 
23784 
diff
changeset
 | 
145  | 
|
| 14678 | 146  | 
fun is_ascii_quasi "_" = true  | 
147  | 
| is_ascii_quasi "'" = true  | 
|
148  | 
| is_ascii_quasi _ = false;  | 
|
149  | 
||
150  | 
val is_ascii_blank =  | 
|
| 
43845
 
d89353d17f54
added File.fold_pages for streaming of large files;
 
wenzelm 
parents: 
43777 
diff
changeset
 | 
151  | 
fn " " => true | "\t" => true | "\n" => true | "\^K" => true | "\f" => true | "\^M" => true  | 
| 14678 | 152  | 
| _ => false;  | 
153  | 
||
| 
34095
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
33955 
diff
changeset
 | 
154  | 
fun is_ascii_control s = is_char s andalso ord s < 32 andalso not (is_ascii_blank s);  | 
| 
 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 
wenzelm 
parents: 
33955 
diff
changeset
 | 
155  | 
|
| 50236 | 156  | 
fun is_ascii_letdig s = is_ascii_letter s orelse is_ascii_digit s orelse is_ascii_quasi s;  | 
157  | 
||
| 20200 | 158  | 
fun is_ascii_lower s = is_char s andalso (ord "a" <= ord s andalso ord s <= ord "z");  | 
159  | 
fun is_ascii_upper s = is_char s andalso (ord "A" <= ord s andalso ord s <= ord "Z");  | 
|
160  | 
||
161  | 
fun to_ascii_lower s = if is_ascii_upper s then chr (ord s + ord "a" - ord "A") else s;  | 
|
162  | 
fun to_ascii_upper s = if is_ascii_lower s then chr (ord s + ord "A" - ord "a") else s;  | 
|
163  | 
||
| 
50238
 
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
 
wenzelm 
parents: 
50237 
diff
changeset
 | 
164  | 
fun is_ascii_identifier s =  | 
| 
 
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
 
wenzelm 
parents: 
50237 
diff
changeset
 | 
165  | 
size s > 0 andalso is_ascii_letter (String.substring (s, 0, 1)) andalso  | 
| 
 
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
 
wenzelm 
parents: 
50237 
diff
changeset
 | 
166  | 
forall_string is_ascii_letdig s;  | 
| 
 
98d35a7368bd
more uniform Symbol.is_ascii_identifier in ML/Scala;
 
wenzelm 
parents: 
50237 
diff
changeset
 | 
167  | 
|
| 50236 | 168  | 
val scan_ascii_id = Scan.one is_ascii_letter ^^ (Scan.many is_ascii_letdig >> implode);  | 
169  | 
||
| 14678 | 170  | 
|
| 
14956
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
171  | 
(* encode_raw *)  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
172  | 
|
| 
20205
 
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
 
wenzelm 
parents: 
20200 
diff
changeset
 | 
173  | 
fun raw_chr c =  | 
| 
48773
 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 
wenzelm 
parents: 
48704 
diff
changeset
 | 
174  | 
is_char c andalso  | 
| 
 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 
wenzelm 
parents: 
48704 
diff
changeset
 | 
175  | 
(ord space <= ord c andalso ord c <= ord "~" andalso c <> "." andalso c <> ">"  | 
| 
 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 
wenzelm 
parents: 
48704 
diff
changeset
 | 
176  | 
orelse ord c >= 128);  | 
| 
14956
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
177  | 
|
| 29324 | 178  | 
fun encode_raw "" = ""  | 
179  | 
| encode_raw str =  | 
|
180  | 
let  | 
|
181  | 
val raw0 = enclose "\\<^raw:" ">";  | 
|
182  | 
val raw1 = raw0 o implode;  | 
|
183  | 
val raw2 = enclose "\\<^raw" ">" o string_of_int o ord;  | 
|
| 50162 | 184  | 
|
| 33955 | 185  | 
fun encode cs = enc (take_prefix raw_chr cs)  | 
| 29324 | 186  | 
and enc ([], []) = []  | 
187  | 
| enc (cs, []) = [raw1 cs]  | 
|
188  | 
| enc ([], d :: ds) = raw2 d :: encode ds  | 
|
189  | 
| enc (cs, d :: ds) = raw1 cs :: raw2 d :: encode ds;  | 
|
190  | 
in  | 
|
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
191  | 
if exists_string (not o raw_chr) str then implode (encode (raw_explode str))  | 
| 29324 | 192  | 
else raw0 str  | 
193  | 
end;  | 
|
| 
14956
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
194  | 
|
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
195  | 
|
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
196  | 
(* diagnostics *)  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
197  | 
|
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
198  | 
fun beginning n cs =  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
199  | 
let  | 
| 33955 | 200  | 
val drop_blanks = #1 o take_suffix is_ascii_blank;  | 
| 
14956
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
201  | 
val all_cs = drop_blanks cs;  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
202  | 
val dots = if length all_cs > n then " ..." else "";  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
203  | 
in  | 
| 33955 | 204  | 
(drop_blanks (take n all_cs)  | 
| 
14956
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
205  | 
|> map (fn c => if is_ascii_blank c then space else c)  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
206  | 
|> implode) ^ dots  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
207  | 
end;  | 
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
208  | 
|
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
209  | 
|
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
210  | 
(* decode_raw *)  | 
| 14834 | 211  | 
|
212  | 
fun is_raw s =  | 
|
| 17063 | 213  | 
String.isPrefix "\\<^raw" s andalso String.isSuffix ">" s;  | 
| 14834 | 214  | 
|
215  | 
fun decode_raw s =  | 
|
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
216  | 
if not (is_raw s) then error (malformed_msg s)  | 
| 14834 | 217  | 
else if String.isPrefix "\\<^raw:" s then String.substring (s, 7, size s - 8)  | 
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
218  | 
else chr (#1 (Library.read_int (raw_explode (String.substring (s, 6, size s - 7)))));  | 
| 14834 | 219  | 
|
220  | 
||
| 14873 | 221  | 
(* symbol variants *)  | 
222  | 
||
| 
37533
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
223  | 
datatype sym =  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
224  | 
Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |  | 
| 43485 | 225  | 
Malformed of string | EOF;  | 
| 14873 | 226  | 
|
227  | 
fun decode s =  | 
|
| 43485 | 228  | 
if s = "" then EOF  | 
229  | 
else if is_char s then Char s  | 
|
| 
37533
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
230  | 
else if is_utf8 s then UTF8 s  | 
| 14873 | 231  | 
else if is_raw s then Raw (decode_raw s)  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
232  | 
else if is_malformed s then Malformed s  | 
| 14873 | 233  | 
else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
234  | 
else Sym (String.substring (s, 2, size s - 3));  | 
| 14873 | 235  | 
|
236  | 
||
| 14678 | 237  | 
(* standard symbol kinds *)  | 
238  | 
||
| 
14171
 
0cab06e3bbd0
Extended the notion of letter and digit, such that now one may use greek,
 
skalberg 
parents: 
13730 
diff
changeset
 | 
239  | 
local  | 
| 50235 | 240  | 
val letter_symbols =  | 
241  | 
Symtab.make_set [  | 
|
242  | 
"\\<A>",  | 
|
243  | 
"\\<B>",  | 
|
244  | 
"\\<C>",  | 
|
245  | 
"\\<D>",  | 
|
246  | 
"\\<E>",  | 
|
247  | 
"\\<F>",  | 
|
248  | 
"\\<G>",  | 
|
249  | 
"\\<H>",  | 
|
250  | 
"\\<I>",  | 
|
251  | 
"\\<J>",  | 
|
252  | 
"\\<K>",  | 
|
253  | 
"\\<L>",  | 
|
254  | 
"\\<M>",  | 
|
255  | 
"\\<N>",  | 
|
256  | 
"\\<O>",  | 
|
257  | 
"\\<P>",  | 
|
258  | 
"\\<Q>",  | 
|
259  | 
"\\<R>",  | 
|
260  | 
"\\<S>",  | 
|
261  | 
"\\<T>",  | 
|
262  | 
"\\<U>",  | 
|
263  | 
"\\<V>",  | 
|
264  | 
"\\<W>",  | 
|
265  | 
"\\<X>",  | 
|
266  | 
"\\<Y>",  | 
|
267  | 
"\\<Z>",  | 
|
268  | 
"\\<a>",  | 
|
269  | 
"\\<b>",  | 
|
270  | 
"\\<c>",  | 
|
271  | 
"\\<d>",  | 
|
272  | 
"\\<e>",  | 
|
273  | 
"\\<f>",  | 
|
274  | 
"\\<g>",  | 
|
275  | 
"\\<h>",  | 
|
276  | 
"\\<i>",  | 
|
277  | 
"\\<j>",  | 
|
278  | 
"\\<k>",  | 
|
279  | 
"\\<l>",  | 
|
280  | 
"\\<m>",  | 
|
281  | 
"\\<n>",  | 
|
282  | 
"\\<o>",  | 
|
283  | 
"\\<p>",  | 
|
284  | 
"\\<q>",  | 
|
285  | 
"\\<r>",  | 
|
286  | 
"\\<s>",  | 
|
287  | 
"\\<t>",  | 
|
288  | 
"\\<u>",  | 
|
289  | 
"\\<v>",  | 
|
290  | 
"\\<w>",  | 
|
291  | 
"\\<x>",  | 
|
292  | 
"\\<y>",  | 
|
293  | 
"\\<z>",  | 
|
294  | 
"\\<AA>",  | 
|
295  | 
"\\<BB>",  | 
|
296  | 
"\\<CC>",  | 
|
297  | 
"\\<DD>",  | 
|
298  | 
"\\<EE>",  | 
|
299  | 
"\\<FF>",  | 
|
300  | 
"\\<GG>",  | 
|
301  | 
"\\<HH>",  | 
|
302  | 
"\\<II>",  | 
|
303  | 
"\\<JJ>",  | 
|
304  | 
"\\<KK>",  | 
|
305  | 
"\\<LL>",  | 
|
306  | 
"\\<MM>",  | 
|
307  | 
"\\<NN>",  | 
|
308  | 
"\\<OO>",  | 
|
309  | 
"\\<PP>",  | 
|
310  | 
"\\<QQ>",  | 
|
311  | 
"\\<RR>",  | 
|
312  | 
"\\<SS>",  | 
|
313  | 
"\\<TT>",  | 
|
314  | 
"\\<UU>",  | 
|
315  | 
"\\<VV>",  | 
|
316  | 
"\\<WW>",  | 
|
317  | 
"\\<XX>",  | 
|
318  | 
"\\<YY>",  | 
|
319  | 
"\\<ZZ>",  | 
|
320  | 
"\\<aa>",  | 
|
321  | 
"\\<bb>",  | 
|
322  | 
"\\<cc>",  | 
|
323  | 
"\\<dd>",  | 
|
324  | 
"\\<ee>",  | 
|
325  | 
"\\<ff>",  | 
|
326  | 
"\\<gg>",  | 
|
327  | 
"\\<hh>",  | 
|
328  | 
"\\<ii>",  | 
|
329  | 
"\\<jj>",  | 
|
330  | 
"\\<kk>",  | 
|
331  | 
"\\<ll>",  | 
|
332  | 
"\\<mm>",  | 
|
333  | 
"\\<nn>",  | 
|
334  | 
"\\<oo>",  | 
|
335  | 
"\\<pp>",  | 
|
336  | 
"\\<qq>",  | 
|
337  | 
"\\<rr>",  | 
|
338  | 
"\\<ss>",  | 
|
339  | 
"\\<tt>",  | 
|
340  | 
"\\<uu>",  | 
|
341  | 
"\\<vv>",  | 
|
342  | 
"\\<ww>",  | 
|
343  | 
"\\<xx>",  | 
|
344  | 
"\\<yy>",  | 
|
345  | 
"\\<zz>",  | 
|
346  | 
"\\<alpha>",  | 
|
347  | 
"\\<beta>",  | 
|
348  | 
"\\<gamma>",  | 
|
349  | 
"\\<delta>",  | 
|
350  | 
"\\<epsilon>",  | 
|
351  | 
"\\<zeta>",  | 
|
352  | 
"\\<eta>",  | 
|
353  | 
"\\<theta>",  | 
|
354  | 
"\\<iota>",  | 
|
355  | 
"\\<kappa>",  | 
|
356  | 
(*"\\<lambda>", sic!*)  | 
|
357  | 
"\\<mu>",  | 
|
358  | 
"\\<nu>",  | 
|
359  | 
"\\<xi>",  | 
|
360  | 
"\\<pi>",  | 
|
361  | 
"\\<rho>",  | 
|
362  | 
"\\<sigma>",  | 
|
363  | 
"\\<tau>",  | 
|
364  | 
"\\<upsilon>",  | 
|
365  | 
"\\<phi>",  | 
|
366  | 
"\\<chi>",  | 
|
367  | 
"\\<psi>",  | 
|
368  | 
"\\<omega>",  | 
|
369  | 
"\\<Gamma>",  | 
|
370  | 
"\\<Delta>",  | 
|
371  | 
"\\<Theta>",  | 
|
372  | 
"\\<Lambda>",  | 
|
373  | 
"\\<Xi>",  | 
|
374  | 
"\\<Pi>",  | 
|
375  | 
"\\<Sigma>",  | 
|
376  | 
"\\<Upsilon>",  | 
|
377  | 
"\\<Phi>",  | 
|
378  | 
"\\<Psi>",  | 
|
| 
52616
 
3ac2878764f9
more robust identifier syntax: sub/superscript counts as modifier of LETDIG part instead of LETTER, both isub/isup and sub/sup are allowed;
 
wenzelm 
parents: 
50242 
diff
changeset
 | 
379  | 
"\\<Omega>"  | 
| 50235 | 380  | 
];  | 
| 
14171
 
0cab06e3bbd0
Extended the notion of letter and digit, such that now one may use greek,
 
skalberg 
parents: 
13730 
diff
changeset
 | 
381  | 
in  | 
| 
50242
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
382  | 
|
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
383  | 
val is_letter_symbol = Symtab.defined letter_symbols;  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
384  | 
|
| 14678 | 385  | 
end;  | 
| 14173 | 386  | 
|
| 
50242
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
387  | 
datatype kind = Letter | Digit | Quasi | Blank | Other;  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
388  | 
|
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
389  | 
fun kind s =  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
390  | 
if is_ascii_letter s then Letter  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
391  | 
else if is_ascii_digit s then Digit  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
392  | 
else if is_ascii_quasi s then Quasi  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
393  | 
else if is_ascii_blank s then Blank  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
394  | 
else if is_char s then Other  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
395  | 
else if is_letter_symbol s then Letter  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
396  | 
else Other;  | 
| 
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
397  | 
|
| 14678 | 398  | 
fun is_letter s = kind s = Letter;  | 
399  | 
fun is_digit s = kind s = Digit;  | 
|
400  | 
fun is_quasi s = kind s = Quasi;  | 
|
401  | 
fun is_blank s = kind s = Blank;  | 
|
| 6272 | 402  | 
|
| 
47850
 
c638127b4653
avoid interference of markup for literal tokens, which may contain slightly odd \<^bsub> \<^esub> counted as pseudo-markup (especially relevant for HTML output, e.g. of thm power3_eq_cube);
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
403  | 
val is_block_ctrl = member (op =) ["\\<^bsub>", "\\<^esub>", "\\<^bsup>", "\\<^esup>"];  | 
| 
 
c638127b4653
avoid interference of markup for literal tokens, which may contain slightly odd \<^bsub> \<^esub> counted as pseudo-markup (especially relevant for HTML output, e.g. of thm power3_eq_cube);
 
wenzelm 
parents: 
43947 
diff
changeset
 | 
404  | 
|
| 14678 | 405  | 
fun is_quasi_letter s = let val k = kind s in k = Letter orelse k = Quasi end;  | 
406  | 
fun is_letdig s = let val k = kind s in k = Letter orelse k = Digit orelse k = Quasi end;  | 
|
| 11010 | 407  | 
|
| 6272 | 408  | 
|
409  | 
||
| 14678 | 410  | 
(** symbol input **)  | 
411  | 
||
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
412  | 
(* source *)  | 
| 
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
413  | 
|
| 14678 | 414  | 
local  | 
| 
14561
 
c53396af770e
* raw control symbols are of the form \<^raw:...> now.
 
schirmer 
parents: 
14559 
diff
changeset
 | 
415  | 
|
| 
37533
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
416  | 
fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\";  | 
| 
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
417  | 
|
| 
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
418  | 
fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;  | 
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
419  | 
|
| 
37728
 
5d2b3e827371
implode pseudo utf8, i.e. decode byte-stuffed low ASCII characters;
 
wenzelm 
parents: 
37534 
diff
changeset
 | 
420  | 
fun implode_pseudo_utf8 (cs as ["\192", c]) =  | 
| 
 
5d2b3e827371
implode pseudo utf8, i.e. decode byte-stuffed low ASCII characters;
 
wenzelm 
parents: 
37534 
diff
changeset
 | 
421  | 
if ord c < 160 then chr (ord c - 128) else implode cs  | 
| 
 
5d2b3e827371
implode pseudo utf8, i.e. decode byte-stuffed low ASCII characters;
 
wenzelm 
parents: 
37534 
diff
changeset
 | 
422  | 
| implode_pseudo_utf8 cs = implode cs;  | 
| 
 
5d2b3e827371
implode pseudo utf8, i.e. decode byte-stuffed low ASCII characters;
 
wenzelm 
parents: 
37534 
diff
changeset
 | 
423  | 
|
| 14678 | 424  | 
val scan_encoded_newline =  | 
| 17756 | 425  | 
$$ "\^M" -- $$ "\n" >> K "\n" ||  | 
426  | 
$$ "\^M" >> K "\n" ||  | 
|
| 
31545
 
5f1f0a20af4d
discontinued escaped symbols such as \\<forall> -- only one backslash should be used;
 
wenzelm 
parents: 
31425 
diff
changeset
 | 
427  | 
Scan.this_string "\\<^newline>" >> K "\n";  | 
| 
14956
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
428  | 
|
| 
 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 
wenzelm 
parents: 
14908 
diff
changeset
 | 
429  | 
val scan_raw =  | 
| 
21858
 
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
 
wenzelm 
parents: 
21495 
diff
changeset
 | 
430  | 
Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) ||  | 
| 
 
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
 
wenzelm 
parents: 
21495 
diff
changeset
 | 
431  | 
Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode);  | 
| 14678 | 432  | 
|
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
433  | 
val scan_total =  | 
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
434  | 
Scan.one is_plain ||  | 
| 
37728
 
5d2b3e827371
implode pseudo utf8, i.e. decode byte-stuffed low ASCII characters;
 
wenzelm 
parents: 
37534 
diff
changeset
 | 
435  | 
Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 ||  | 
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
436  | 
scan_encoded_newline ||  | 
| 
48773
 
0e1bab274672
more liberal scanning of potentially malformed symbols;
 
wenzelm 
parents: 
48704 
diff
changeset
 | 
437  | 
($$ "\\" ^^ $$ "<" ^^  | 
| 50236 | 438  | 
(($$ "^" ^^ Scan.optional (scan_raw || scan_ascii_id) "" || Scan.optional scan_ascii_id "") ^^  | 
| 48774 | 439  | 
Scan.optional ($$ ">") "")) ||  | 
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
440  | 
Scan.one not_eof;  | 
| 
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
441  | 
|
| 14678 | 442  | 
in  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
443  | 
|
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
444  | 
fun source src = Source.source stopper (Scan.bulk scan_total) NONE src;  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
445  | 
|
| 14678 | 446  | 
end;  | 
447  | 
||
| 
14562
 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 
wenzelm 
parents: 
14561 
diff
changeset
 | 
448  | 
|
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
449  | 
(* explode *)  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
450  | 
|
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
451  | 
local  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
452  | 
|
| 
14562
 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 
wenzelm 
parents: 
14561 
diff
changeset
 | 
453  | 
fun no_explode [] = true  | 
| 
 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 
wenzelm 
parents: 
14561 
diff
changeset
 | 
454  | 
  | no_explode ("\\" :: "<" :: _) = false
 | 
| 17756 | 455  | 
  | no_explode ("\^M" :: _) = false
 | 
| 
37533
 
d775bd70f571
explicit treatment of UTF8 character sequences as Isabelle symbols;
 
wenzelm 
parents: 
34095 
diff
changeset
 | 
456  | 
| no_explode (c :: cs) = is_ascii c andalso no_explode cs;  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
457  | 
|
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
458  | 
in  | 
| 
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
459  | 
|
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
460  | 
fun sym_explode str =  | 
| 
40627
 
becf5d5187cc
renamed raw "explode" function to "raw_explode" to emphasize its meaning;
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
461  | 
let val chs = raw_explode str in  | 
| 
14562
 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 
wenzelm 
parents: 
14561 
diff
changeset
 | 
462  | 
if no_explode chs then chs  | 
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40509 
diff
changeset
 | 
463  | 
else Source.exhaust (source (Source.of_list chs))  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
464  | 
end;  | 
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
465  | 
|
| 
23676
 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 
wenzelm 
parents: 
23618 
diff
changeset
 | 
466  | 
end;  | 
| 14994 | 467  | 
|
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
468  | 
|
| 50237 | 469  | 
(* escape *)  | 
470  | 
||
471  | 
val esc = fn s =>  | 
|
472  | 
if is_char s then s  | 
|
473  | 
else if is_utf8 s then translate_string (fn c => "\\" ^ string_of_int (ord c)) s  | 
|
474  | 
else "\\" ^ s;  | 
|
475  | 
||
476  | 
val escape = implode o map esc o sym_explode;  | 
|
477  | 
||
478  | 
||
479  | 
||
480  | 
(** scanning through symbols **)  | 
|
481  | 
||
482  | 
(* scanner *)  | 
|
483  | 
||
484  | 
fun scanner msg scan syms =  | 
|
485  | 
let  | 
|
486  | 
fun message (ss, NONE) = (fn () => msg ^ ": " ^ quote (beginning 10 ss))  | 
|
487  | 
| message (ss, SOME msg') = (fn () => msg ^ ", " ^ msg' () ^ ": " ^ quote (beginning 10 ss));  | 
|
488  | 
val finite_scan = Scan.error (Scan.finite stopper (!! message scan));  | 
|
489  | 
in  | 
|
490  | 
(case finite_scan syms of  | 
|
491  | 
(result, []) => result  | 
|
492  | 
| (_, rest) => error (message (rest, NONE) ()))  | 
|
493  | 
end;  | 
|
494  | 
||
495  | 
||
| 50162 | 496  | 
(* space-separated words *)  | 
497  | 
||
498  | 
val scan_word =  | 
|
499  | 
Scan.many1 is_ascii_blank >> K NONE ||  | 
|
500  | 
Scan.many1 (fn s => not (is_ascii_blank s) andalso not_eof s) >> (SOME o implode);  | 
|
501  | 
||
502  | 
val split_words = scanner "Bad text" (Scan.repeat scan_word >> map_filter I);  | 
|
503  | 
||
504  | 
val explode_words = split_words o sym_explode;  | 
|
505  | 
||
506  | 
||
| 14678 | 507  | 
(* blanks *)  | 
508  | 
||
509  | 
fun strip_blanks s =  | 
|
510  | 
sym_explode s  | 
|
| 33955 | 511  | 
|> take_prefix is_blank |> #2  | 
512  | 
|> take_suffix is_blank |> #1  | 
|
| 14678 | 513  | 
|> implode;  | 
514  | 
||
515  | 
||
516  | 
(* bump string -- treat as base 26 or base 1 numbers *)  | 
|
517  | 
||
| 
50242
 
56b9c792a98b
support for sub-structured identifier syntax (inactive);
 
wenzelm 
parents: 
50239 
diff
changeset
 | 
518  | 
fun symbolic_end (_ :: "\\<^sub>" :: _) = true  | 
| 
53016
 
fa9c38891cf2
disable old identifier syntax by default, legacy_isub_isup := true may be used temporarily as fall-back;
 
wenzelm 
parents: 
52920 
diff
changeset
 | 
519  | 
| symbolic_end (_ :: "\\<^isub>" :: _) = true (*legacy*)  | 
| 
 
fa9c38891cf2
disable old identifier syntax by default, legacy_isub_isup := true may be used temporarily as fall-back;
 
wenzelm 
parents: 
52920 
diff
changeset
 | 
520  | 
| symbolic_end (_ :: "\\<^isup>" :: _) = true (*legacy*)  | 
| 14908 | 521  | 
| symbolic_end (s :: _) = is_symbolic s  | 
522  | 
| symbolic_end [] = false;  | 
|
| 14678 | 523  | 
|
524  | 
fun bump_init str =  | 
|
| 14908 | 525  | 
if symbolic_end (rev (sym_explode str)) then str ^ "'"  | 
| 14678 | 526  | 
else str ^ "a";  | 
| 12904 | 527  | 
|
528  | 
fun bump_string str =  | 
|
529  | 
let  | 
|
530  | 
fun bump [] = ["a"]  | 
|
531  | 
      | bump ("z" :: ss) = "a" :: bump ss
 | 
|
532  | 
| bump (s :: ss) =  | 
|
| 14678 | 533  | 
if is_char s andalso ord "a" <= ord s andalso ord s < ord "z"  | 
| 12904 | 534  | 
then chr (ord s + 1) :: ss  | 
535  | 
else "a" :: s :: ss;  | 
|
| 14678 | 536  | 
|
| 33955 | 537  | 
val (ss, qs) = apfst rev (take_suffix is_quasi (sym_explode str));  | 
| 14908 | 538  | 
val ss' = if symbolic_end ss then "'" :: ss else bump ss;  | 
| 14678 | 539  | 
in implode (rev ss' @ qs) end;  | 
540  | 
||
| 12904 | 541  | 
|
| 6272 | 542  | 
|
| 29324 | 543  | 
(** symbol output **)  | 
| 
14977
 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 
wenzelm 
parents: 
14961 
diff
changeset
 | 
544  | 
|
| 29324 | 545  | 
(* length *)  | 
| 6272 | 546  | 
|
| 14678 | 547  | 
fun sym_len s =  | 
| 
24593
 
1547ea587f5a
added some int constraints (ML_Parse.fix_ints not active here);
 
wenzelm 
parents: 
24580 
diff
changeset
 | 
548  | 
if not (is_printable s) then (0: int)  | 
| 14678 | 549  | 
else if String.isPrefix "\\<long" s then 2  | 
550  | 
else if String.isPrefix "\\<Long" s then 2  | 
|
551  | 
else 1;  | 
|
552  | 
||
| 19473 | 553  | 
fun sym_length ss = fold (fn s => fn n => sym_len s + n) ss 0;  | 
| 14678 | 554  | 
|
| 29324 | 555  | 
|
556  | 
(* print mode *)  | 
|
557  | 
||
558  | 
val xsymbolsN = "xsymbols";  | 
|
559  | 
||
560  | 
fun output s = (s, sym_length (sym_explode s));  | 
|
561  | 
||
562  | 
||
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
563  | 
(*final declarations of this structure!*)  | 
| 29324 | 564  | 
val explode = sym_explode;  | 
| 6272 | 565  | 
val length = sym_length;  | 
| 
6116
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
566  | 
|
| 
 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 
wenzelm 
parents:  
diff
changeset
 | 
567  | 
end;  |