| author | haftmann | 
| Fri, 15 May 2009 16:39:16 +0200 | |
| changeset 31180 | dae7be64d614 | 
| parent 31013 | 69a476d6fea6 | 
| child 31425 | e8d5417a1831 | 
| permissions | -rw-r--r-- | 
| 6118 | 1 | (* Title: Pure/General/symbol.ML | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 2 | Author: Markus Wenzel, TU Muenchen | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 3 | |
| 21897 | 4 | Generalized characters with infinitely many named symbols. | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 5 | *) | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 6 | |
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 7 | signature SYMBOL = | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 8 | sig | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 9 | type symbol | 
| 26524 | 10 | val SOH: symbol | 
| 11 | val STX: symbol | |
| 26538 
d65504ffb47d
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26524diff
changeset | 12 | val ENQ: symbol | 
| 
d65504ffb47d
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26524diff
changeset | 13 | val ACK: symbol | 
| 26524 | 14 | val DEL: symbol | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 15 | val space: symbol | 
| 23618 | 16 | val spaces: int -> string | 
| 14678 | 17 | val is_char: symbol -> bool | 
| 18 | val is_symbolic: symbol -> bool | |
| 19 | val is_printable: symbol -> bool | |
| 26632 | 20 | val is_utf8_trailer: symbol -> bool | 
| 31013 | 21 | val name_of: symbol -> string | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 22 | val eof: symbol | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 23 | val is_eof: symbol -> bool | 
| 27766 | 24 | val not_eof: symbol -> bool | 
| 27732 | 25 | val stopper: symbol Scan.stopper | 
| 14678 | 26 | val sync: symbol | 
| 27 | val is_sync: symbol -> bool | |
| 28 | val malformed: symbol | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 29 | val end_malformed: symbol | 
| 25641 | 30 | val separate_chars: string -> string | 
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
23728diff
changeset | 31 | val is_regular: symbol -> bool | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 32 | val is_ascii: symbol -> bool | 
| 14678 | 33 | val is_ascii_letter: symbol -> bool | 
| 34 | val is_ascii_digit: symbol -> bool | |
| 24580 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 35 | val is_ascii_hex: symbol -> bool | 
| 14678 | 36 | val is_ascii_quasi: symbol -> bool | 
| 37 | val is_ascii_blank: symbol -> bool | |
| 20200 | 38 | val is_ascii_lower: symbol -> bool | 
| 39 | val is_ascii_upper: symbol -> bool | |
| 40 | val to_ascii_lower: symbol -> symbol | |
| 41 | val to_ascii_upper: symbol -> symbol | |
| 14834 | 42 | val is_raw: symbol -> bool | 
| 43 | val decode_raw: symbol -> string | |
| 14977 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 44 | val encode_raw: string -> string | 
| 14873 | 45 | datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string | 
| 46 | val decode: symbol -> sym | |
| 14678 | 47 | datatype kind = Letter | Digit | Quasi | Blank | Other | 
| 48 | val kind: symbol -> kind | |
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 49 | val is_letter: symbol -> bool | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 50 | val is_digit: symbol -> bool | 
| 12904 | 51 | val is_quasi: symbol -> bool | 
| 14678 | 52 | val is_blank: symbol -> bool | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 53 | val is_quasi_letter: symbol -> bool | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 54 | val is_letdig: symbol -> bool | 
| 16138 | 55 | val is_ident: symbol list -> bool | 
| 14728 | 56 | val beginning: int -> symbol list -> string | 
| 14678 | 57 | val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a | 
| 13730 | 58 | val scan_id: string list -> string * string list | 
| 27835 
ff8b8513965a
Symbol.source/OuterLex.source: more explicit do_recover argument;
 wenzelm parents: 
27766diff
changeset | 59 |   val source: {do_recover: bool} -> (string, 'a) Source.source ->
 | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 60 | (symbol, (string, 'a) Source.source) Source.source | 
| 6272 | 61 | val explode: string -> symbol list | 
| 14977 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 62 | val escape: string -> string | 
| 14678 | 63 | val strip_blanks: string -> string | 
| 64 | val bump_init: string -> string | |
| 12904 | 65 | val bump_string: string -> string | 
| 14678 | 66 | val length: symbol list -> int | 
| 6692 | 67 | val xsymbolsN: string | 
| 29324 | 68 | val output: string -> output * int | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 69 | end; | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 70 | |
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 71 | structure Symbol: SYMBOL = | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 72 | struct | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 73 | |
| 14678 | 74 | (** type symbol **) | 
| 6272 | 75 | |
| 14678 | 76 | (*Symbols, which are considered the smallest entities of any Isabelle | 
| 6272 | 77 | string, may be of the following form: | 
| 14678 | 78 | |
| 14834 | 79 | (1) ASCII symbols: a | 
| 17823 | 80 | (2) regular symbols: \<ident> | 
| 14834 | 81 | (3) control symbols: \<^ident> | 
| 82 | (4) raw control symbols: \<^raw:...>, where "..." may be any printable | |
| 20205 
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
 wenzelm parents: 
20200diff
changeset | 83 | character (excluding ".", ">"), or \<^raw000> | 
| 6272 | 84 | |
| 14678 | 85 | Output is subject to the print_mode variable (default: verbatim), | 
| 86 | actual interpretation in display is up to front-end tools. | |
| 6272 | 87 | *) | 
| 88 | ||
| 89 | type symbol = string; | |
| 90 | ||
| 26524 | 91 | val SOH = chr 1; | 
| 92 | val STX = chr 2; | |
| 26538 
d65504ffb47d
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26524diff
changeset | 93 | val ENQ = chr 5; | 
| 
d65504ffb47d
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26524diff
changeset | 94 | val ACK = chr 6; | 
| 26524 | 95 | val DEL = chr 127; | 
| 96 | ||
| 97 | val space = chr 32; | |
| 17063 | 98 | |
| 99 | local | |
| 100 | val small_spaces = Vector.tabulate (65, fn i => Library.replicate_string i space); | |
| 101 | in | |
| 102 | fun spaces k = | |
| 103 | if k < 64 then Vector.sub (small_spaces, k) | |
| 104 | else Library.replicate_string (k div 64) (Vector.sub (small_spaces, 64)) ^ | |
| 105 | Vector.sub (small_spaces, k mod 64); | |
| 106 | end; | |
| 14678 | 107 | |
| 108 | fun is_char s = size s = 1; | |
| 109 | ||
| 110 | fun is_symbolic s = | |
| 111 | String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s); | |
| 112 | ||
| 113 | fun is_printable s = | |
| 114 | if is_char s then ord space <= ord s andalso ord s <= ord "~" | |
| 115 | else not (String.isPrefix "\\<^" s); | |
| 6272 | 116 | |
| 26632 | 117 | fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192; | 
| 118 | ||
| 6272 | 119 | |
| 14678 | 120 | (* input source control *) | 
| 6272 | 121 | |
| 14678 | 122 | val eof = ""; | 
| 6272 | 123 | fun is_eof s = s = eof; | 
| 124 | fun not_eof s = s <> eof; | |
| 27732 | 125 | val stopper = Scan.stopper (K eof) is_eof; | 
| 6272 | 126 | |
| 14678 | 127 | val sync = "\\<^sync>"; | 
| 128 | fun is_sync s = s = sync; | |
| 129 | ||
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 130 | val malformed = "[["; | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 131 | val end_malformed = "]]"; | 
| 25641 | 132 | |
| 133 | val separate_chars = explode #> space_implode space; | |
| 134 | fun malformed_msg s = "Malformed symbolic character: " ^ quote (separate_chars s); | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 135 | |
| 23784 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
23728diff
changeset | 136 | fun is_regular s = | 
| 
75e6b9dd5336
Symbol.not_eof/sync is superceded by Symbol.is_regular (rules out further control symbols);
 wenzelm parents: 
23728diff
changeset | 137 | not_eof s andalso s <> sync andalso s <> malformed andalso s <> end_malformed; | 
| 14678 | 138 | |
| 31013 | 139 | fun name_of s = if is_symbolic s | 
| 140 | then (unsuffix ">" o unprefix "\\<") s | |
| 141 | else error (malformed_msg s); | |
| 142 | ||
| 14678 | 143 | |
| 144 | (* ascii symbols *) | |
| 145 | ||
| 146 | fun is_ascii s = is_char s andalso ord s < 128; | |
| 147 | ||
| 148 | fun is_ascii_letter s = | |
| 149 | is_char s andalso | |
| 150 | (ord "A" <= ord s andalso ord s <= ord "Z" orelse | |
| 151 | ord "a" <= ord s andalso ord s <= ord "z"); | |
| 152 | ||
| 153 | fun is_ascii_digit s = | |
| 154 | is_char s andalso ord "0" <= ord s andalso ord s <= ord "9"; | |
| 155 | ||
| 24580 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 156 | fun is_ascii_hex s = | 
| 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 157 | is_char s andalso | 
| 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 158 | (ord "0" <= ord s andalso ord s <= ord "9" orelse | 
| 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 159 | ord "A" <= ord s andalso ord s <= ord "F" orelse | 
| 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 160 | ord "a" <= ord s andalso ord s <= ord "f"); | 
| 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 161 | |
| 14678 | 162 | fun is_ascii_quasi "_" = true | 
| 163 | | is_ascii_quasi "'" = true | |
| 164 | | is_ascii_quasi _ = false; | |
| 165 | ||
| 166 | val is_ascii_blank = | |
| 24580 
916259859344
replaced Symbol.is_hex_letter to Symbol.is_ascii_hex;
 wenzelm parents: 
23784diff
changeset | 167 | fn " " => true | "\t" => true | "\n" => true | "\^K" => true | "\^L" => true | "\^M" => true | 
| 14678 | 168 | | _ => false; | 
| 169 | ||
| 20200 | 170 | fun is_ascii_lower s = is_char s andalso (ord "a" <= ord s andalso ord s <= ord "z"); | 
| 171 | fun is_ascii_upper s = is_char s andalso (ord "A" <= ord s andalso ord s <= ord "Z"); | |
| 172 | ||
| 173 | fun to_ascii_lower s = if is_ascii_upper s then chr (ord s + ord "a" - ord "A") else s; | |
| 174 | fun to_ascii_upper s = if is_ascii_lower s then chr (ord s + ord "A" - ord "a") else s; | |
| 175 | ||
| 14678 | 176 | |
| 14956 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 177 | (* encode_raw *) | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 178 | |
| 20205 
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
 wenzelm parents: 
20200diff
changeset | 179 | fun raw_chr c = | 
| 
7b2958d3d575
raw symbols: disallow dot to avoid confusion in NameSpace.unpack;
 wenzelm parents: 
20200diff
changeset | 180 | ord space <= ord c andalso ord c <= ord "~" andalso c <> "." andalso c <> ">" | 
| 17823 | 181 | orelse ord c >= 128; | 
| 14956 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 182 | |
| 29324 | 183 | fun encode_raw "" = "" | 
| 184 | | encode_raw str = | |
| 185 | let | |
| 186 | val raw0 = enclose "\\<^raw:" ">"; | |
| 187 | val raw1 = raw0 o implode; | |
| 188 | val raw2 = enclose "\\<^raw" ">" o string_of_int o ord; | |
| 189 | ||
| 190 | fun encode cs = enc (Library.take_prefix raw_chr cs) | |
| 191 | and enc ([], []) = [] | |
| 192 | | enc (cs, []) = [raw1 cs] | |
| 193 | | enc ([], d :: ds) = raw2 d :: encode ds | |
| 194 | | enc (cs, d :: ds) = raw1 cs :: raw2 d :: encode ds; | |
| 195 | in | |
| 196 | if exists_string (not o raw_chr) str then implode (encode (explode str)) | |
| 197 | else raw0 str | |
| 198 | end; | |
| 14956 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 199 | |
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 200 | |
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 201 | (* diagnostics *) | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 202 | |
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 203 | fun beginning n cs = | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 204 | let | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 205 | val drop_blanks = #1 o Library.take_suffix is_ascii_blank; | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 206 | val all_cs = drop_blanks cs; | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 207 | val dots = if length all_cs > n then " ..." else ""; | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 208 | in | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 209 | (drop_blanks (Library.take (n, all_cs)) | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 210 | |> map (fn c => if is_ascii_blank c then space else c) | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 211 | |> implode) ^ dots | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 212 | end; | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 213 | |
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 214 | |
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 215 | (* decode_raw *) | 
| 14834 | 216 | |
| 217 | fun is_raw s = | |
| 17063 | 218 | String.isPrefix "\\<^raw" s andalso String.isSuffix ">" s; | 
| 14834 | 219 | |
| 220 | fun decode_raw s = | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 221 | if not (is_raw s) then error (malformed_msg s) | 
| 14834 | 222 | else if String.isPrefix "\\<^raw:" s then String.substring (s, 7, size s - 8) | 
| 223 | else chr (#1 (Library.read_int (explode (String.substring (s, 6, size s - 7))))); | |
| 224 | ||
| 225 | ||
| 14873 | 226 | (* symbol variants *) | 
| 227 | ||
| 228 | datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string; | |
| 229 | ||
| 230 | fun decode s = | |
| 231 | if is_char s then Char s | |
| 232 | else if is_raw s then Raw (decode_raw s) | |
| 233 | else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4)) | |
| 234 | else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3)) | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 235 | else error (malformed_msg s); | 
| 14873 | 236 | |
| 237 | ||
| 14678 | 238 | (* standard symbol kinds *) | 
| 239 | ||
| 240 | datatype kind = Letter | Digit | Quasi | Blank | Other; | |
| 6272 | 241 | |
| 14171 
0cab06e3bbd0
Extended the notion of letter and digit, such that now one may use greek,
 skalberg parents: 
13730diff
changeset | 242 | local | 
| 14678 | 243 | val symbol_kinds = Symtab.make | 
| 244 |    [("\\<A>", Letter),
 | |
| 245 |     ("\\<B>", Letter),
 | |
| 246 |     ("\\<C>", Letter),
 | |
| 247 |     ("\\<D>", Letter),
 | |
| 248 |     ("\\<E>", Letter),
 | |
| 249 |     ("\\<F>", Letter),
 | |
| 250 |     ("\\<G>", Letter),
 | |
| 251 |     ("\\<H>", Letter),
 | |
| 252 |     ("\\<I>", Letter),
 | |
| 253 |     ("\\<J>", Letter),
 | |
| 254 |     ("\\<K>", Letter),
 | |
| 255 |     ("\\<L>", Letter),
 | |
| 256 |     ("\\<M>", Letter),
 | |
| 257 |     ("\\<N>", Letter),
 | |
| 258 |     ("\\<O>", Letter),
 | |
| 259 |     ("\\<P>", Letter),
 | |
| 260 |     ("\\<Q>", Letter),
 | |
| 261 |     ("\\<R>", Letter),
 | |
| 262 |     ("\\<S>", Letter),
 | |
| 263 |     ("\\<T>", Letter),
 | |
| 264 |     ("\\<U>", Letter),
 | |
| 265 |     ("\\<V>", Letter),
 | |
| 266 |     ("\\<W>", Letter),
 | |
| 267 |     ("\\<X>", Letter),
 | |
| 268 |     ("\\<Y>", Letter),
 | |
| 269 |     ("\\<Z>", Letter),
 | |
| 270 |     ("\\<a>", Letter),
 | |
| 271 |     ("\\<b>", Letter),
 | |
| 272 |     ("\\<c>", Letter),
 | |
| 273 |     ("\\<d>", Letter),
 | |
| 274 |     ("\\<e>", Letter),
 | |
| 275 |     ("\\<f>", Letter),
 | |
| 276 |     ("\\<g>", Letter),
 | |
| 277 |     ("\\<h>", Letter),
 | |
| 278 |     ("\\<i>", Letter),
 | |
| 279 |     ("\\<j>", Letter),
 | |
| 280 |     ("\\<k>", Letter),
 | |
| 281 |     ("\\<l>", Letter),
 | |
| 282 |     ("\\<m>", Letter),
 | |
| 283 |     ("\\<n>", Letter),
 | |
| 284 |     ("\\<o>", Letter),
 | |
| 285 |     ("\\<p>", Letter),
 | |
| 286 |     ("\\<q>", Letter),
 | |
| 287 |     ("\\<r>", Letter),
 | |
| 288 |     ("\\<s>", Letter),
 | |
| 289 |     ("\\<t>", Letter),
 | |
| 290 |     ("\\<u>", Letter),
 | |
| 291 |     ("\\<v>", Letter),
 | |
| 292 |     ("\\<w>", Letter),
 | |
| 293 |     ("\\<x>", Letter),
 | |
| 294 |     ("\\<y>", Letter),
 | |
| 295 |     ("\\<z>", Letter),
 | |
| 296 |     ("\\<AA>", Letter),
 | |
| 297 |     ("\\<BB>", Letter),
 | |
| 298 |     ("\\<CC>", Letter),
 | |
| 299 |     ("\\<DD>", Letter),
 | |
| 300 |     ("\\<EE>", Letter),
 | |
| 301 |     ("\\<FF>", Letter),
 | |
| 302 |     ("\\<GG>", Letter),
 | |
| 303 |     ("\\<HH>", Letter),
 | |
| 304 |     ("\\<II>", Letter),
 | |
| 305 |     ("\\<JJ>", Letter),
 | |
| 306 |     ("\\<KK>", Letter),
 | |
| 307 |     ("\\<LL>", Letter),
 | |
| 308 |     ("\\<MM>", Letter),
 | |
| 309 |     ("\\<NN>", Letter),
 | |
| 310 |     ("\\<OO>", Letter),
 | |
| 311 |     ("\\<PP>", Letter),
 | |
| 312 |     ("\\<QQ>", Letter),
 | |
| 313 |     ("\\<RR>", Letter),
 | |
| 314 |     ("\\<SS>", Letter),
 | |
| 315 |     ("\\<TT>", Letter),
 | |
| 316 |     ("\\<UU>", Letter),
 | |
| 317 |     ("\\<VV>", Letter),
 | |
| 318 |     ("\\<WW>", Letter),
 | |
| 319 |     ("\\<XX>", Letter),
 | |
| 320 |     ("\\<YY>", Letter),
 | |
| 321 |     ("\\<ZZ>", Letter),
 | |
| 322 |     ("\\<aa>", Letter),
 | |
| 323 |     ("\\<bb>", Letter),
 | |
| 324 |     ("\\<cc>", Letter),
 | |
| 325 |     ("\\<dd>", Letter),
 | |
| 326 |     ("\\<ee>", Letter),
 | |
| 327 |     ("\\<ff>", Letter),
 | |
| 328 |     ("\\<gg>", Letter),
 | |
| 329 |     ("\\<hh>", Letter),
 | |
| 330 |     ("\\<ii>", Letter),
 | |
| 331 |     ("\\<jj>", Letter),
 | |
| 332 |     ("\\<kk>", Letter),
 | |
| 333 |     ("\\<ll>", Letter),
 | |
| 334 |     ("\\<mm>", Letter),
 | |
| 335 |     ("\\<nn>", Letter),
 | |
| 336 |     ("\\<oo>", Letter),
 | |
| 337 |     ("\\<pp>", Letter),
 | |
| 338 |     ("\\<qq>", Letter),
 | |
| 339 |     ("\\<rr>", Letter),
 | |
| 340 |     ("\\<ss>", Letter),
 | |
| 341 |     ("\\<tt>", Letter),
 | |
| 342 |     ("\\<uu>", Letter),
 | |
| 343 |     ("\\<vv>", Letter),
 | |
| 344 |     ("\\<ww>", Letter),
 | |
| 345 |     ("\\<xx>", Letter),
 | |
| 346 |     ("\\<yy>", Letter),
 | |
| 347 |     ("\\<zz>", Letter),
 | |
| 348 |     ("\\<alpha>", Letter),
 | |
| 349 |     ("\\<beta>", Letter),
 | |
| 350 |     ("\\<gamma>", Letter),
 | |
| 351 |     ("\\<delta>", Letter),
 | |
| 352 |     ("\\<epsilon>", Letter),
 | |
| 353 |     ("\\<zeta>", Letter),
 | |
| 354 |     ("\\<eta>", Letter),
 | |
| 355 |     ("\\<theta>", Letter),
 | |
| 356 |     ("\\<iota>", Letter),
 | |
| 357 |     ("\\<kappa>", Letter),
 | |
| 358 |     ("\\<lambda>", Other),      (*sic!*)
 | |
| 359 |     ("\\<mu>", Letter),
 | |
| 360 |     ("\\<nu>", Letter),
 | |
| 361 |     ("\\<xi>", Letter),
 | |
| 362 |     ("\\<pi>", Letter),
 | |
| 363 |     ("\\<rho>", Letter),
 | |
| 364 |     ("\\<sigma>", Letter),
 | |
| 365 |     ("\\<tau>", Letter),
 | |
| 366 |     ("\\<upsilon>", Letter),
 | |
| 367 |     ("\\<phi>", Letter),
 | |
| 25521 | 368 |     ("\\<chi>", Letter),
 | 
| 14678 | 369 |     ("\\<psi>", Letter),
 | 
| 370 |     ("\\<omega>", Letter),
 | |
| 371 |     ("\\<Gamma>", Letter),
 | |
| 372 |     ("\\<Delta>", Letter),
 | |
| 373 |     ("\\<Theta>", Letter),
 | |
| 374 |     ("\\<Lambda>", Letter),
 | |
| 375 |     ("\\<Xi>", Letter),
 | |
| 376 |     ("\\<Pi>", Letter),
 | |
| 377 |     ("\\<Sigma>", Letter),
 | |
| 378 |     ("\\<Upsilon>", Letter),
 | |
| 379 |     ("\\<Phi>", Letter),
 | |
| 380 |     ("\\<Psi>", Letter),
 | |
| 381 |     ("\\<Omega>", Letter),
 | |
| 14961 | 382 |     ("\\<^isub>", Letter),
 | 
| 383 |     ("\\<^isup>", Letter),
 | |
| 14678 | 384 |     ("\\<spacespace>", Blank)];
 | 
| 14171 
0cab06e3bbd0
Extended the notion of letter and digit, such that now one may use greek,
 skalberg parents: 
13730diff
changeset | 385 | in | 
| 14678 | 386 | fun kind s = | 
| 387 | if is_ascii_letter s then Letter | |
| 388 | else if is_ascii_digit s then Digit | |
| 389 | else if is_ascii_quasi s then Quasi | |
| 390 | else if is_ascii_blank s then Blank | |
| 391 | else if is_char s then Other | |
| 18939 | 392 | else the_default Other (Symtab.lookup symbol_kinds s); | 
| 14678 | 393 | end; | 
| 14173 | 394 | |
| 14678 | 395 | fun is_letter s = kind s = Letter; | 
| 396 | fun is_digit s = kind s = Digit; | |
| 397 | fun is_quasi s = kind s = Quasi; | |
| 398 | fun is_blank s = kind s = Blank; | |
| 6272 | 399 | |
| 14678 | 400 | fun is_quasi_letter s = let val k = kind s in k = Letter orelse k = Quasi end; | 
| 401 | fun is_letdig s = let val k = kind s in k = Letter orelse k = Digit orelse k = Quasi end; | |
| 11010 | 402 | |
| 16138 | 403 | fun is_ident [] = false | 
| 404 | | is_ident (c :: cs) = is_letter c andalso forall is_letdig cs; | |
| 405 | ||
| 6272 | 406 | |
| 407 | ||
| 14678 | 408 | (** symbol input **) | 
| 409 | ||
| 410 | (* scanning through symbols *) | |
| 6272 | 411 | |
| 6640 | 412 | fun scanner msg scan chs = | 
| 413 | let | |
| 15531 | 414 | fun message (cs, NONE) = msg ^ ": " ^ quote (beginning 10 cs) | 
| 415 | | message (cs, SOME msg') = msg ^ ", " ^ msg' ^ ": " ^ quote (beginning 10 cs); | |
| 14961 | 416 | val fin_scan = Scan.error (Scan.finite stopper (!! message scan)); | 
| 6640 | 417 | in | 
| 418 | (case fin_scan chs of | |
| 419 | (result, []) => result | |
| 15531 | 420 | | (_, rest) => error (message (rest, NONE))) | 
| 6640 | 421 | end; | 
| 422 | ||
| 21858 
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
 wenzelm parents: 
21495diff
changeset | 423 | val scan_id = Scan.one is_letter ^^ (Scan.many is_letdig >> implode); | 
| 14678 | 424 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 425 | |
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 426 | (* source *) | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 427 | |
| 14678 | 428 | local | 
| 14561 
c53396af770e
* raw control symbols are of the form \<^raw:...> now.
 schirmer parents: 
14559diff
changeset | 429 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 430 | fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s; | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 431 | |
| 14678 | 432 | val scan_encoded_newline = | 
| 17756 | 433 | $$ "\^M" -- $$ "\n" >> K "\n" || | 
| 434 | $$ "\^M" >> K "\n" || | |
| 14956 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 435 | $$ "\\" -- Scan.optional ($$ "\\") "" -- Scan.this_string "<^newline>" >> K "\n"; | 
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 436 | |
| 
70ec4b8aef8d
prevent looping of error messages involving malformed symbols;
 wenzelm parents: 
14908diff
changeset | 437 | val scan_raw = | 
| 21858 
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
 wenzelm parents: 
21495diff
changeset | 438 | Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) || | 
| 
05f57309170c
avoid conflict with Alice keywords: renamed pack -> implode, unpack -> explode, any -> many, avoided assert;
 wenzelm parents: 
21495diff
changeset | 439 | Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode); | 
| 14678 | 440 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 441 | val scan = | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 442 | Scan.one is_plain || | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 443 | scan_encoded_newline || | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 444 | (($$ "\\" --| Scan.optional ($$ "\\") "") ^^ $$ "<" ^^ | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 445 |     !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
 | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 446 | (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) || | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 447 | Scan.one not_eof; | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 448 | |
| 27745 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 449 | val scan_resync = | 
| 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 450 | Scan.one is_ascii_blank || $$ "\"" || $$ "`" || $$ "\\" || | 
| 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 451 | Scan.this_string "(*" || Scan.this_string "*)" || | 
| 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 452 |   Scan.this_string "{*" || Scan.this_string "*}";
 | 
| 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 453 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 454 | val recover = | 
| 27903 | 455 | (Scan.this (explode "\\\\<") || Scan.this (explode "\\<")) @@@ | 
| 27745 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 456 | Scan.repeat (Scan.unless scan_resync (Scan.one not_eof)) | 
| 
31899d977a89
improved recover: also resync on symbol/comment/verbatim delimiters;
 wenzelm parents: 
27732diff
changeset | 457 | >> (fn ss => malformed :: ss @ [end_malformed]); | 
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 458 | |
| 14678 | 459 | in | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 460 | |
| 27835 
ff8b8513965a
Symbol.source/OuterLex.source: more explicit do_recover argument;
 wenzelm parents: 
27766diff
changeset | 461 | fun source {do_recover} src =
 | 
| 23682 
cf4773532006
nested source: explicit interactive flag for recover avoids duplicate errors;
 wenzelm parents: 
23676diff
changeset | 462 | Source.source stopper (Scan.bulk scan) | 
| 
cf4773532006
nested source: explicit interactive flag for recover avoids duplicate errors;
 wenzelm parents: 
23676diff
changeset | 463 | (if do_recover then SOME (false, K recover) else NONE) src; | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 464 | |
| 14678 | 465 | end; | 
| 466 | ||
| 14562 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 wenzelm parents: 
14561diff
changeset | 467 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 468 | (* explode *) | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 469 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 470 | local | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 471 | |
| 14562 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 wenzelm parents: 
14561diff
changeset | 472 | fun no_explode [] = true | 
| 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 wenzelm parents: 
14561diff
changeset | 473 |   | no_explode ("\\" :: "<" :: _) = false
 | 
| 17756 | 474 |   | no_explode ("\^M" :: _) = false
 | 
| 14562 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 wenzelm parents: 
14561diff
changeset | 475 | | no_explode (_ :: cs) = no_explode cs; | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 476 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 477 | in | 
| 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 478 | |
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 479 | fun sym_explode str = | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 480 | let val chs = explode str in | 
| 14562 
980da32f4617
proper handling of lines terminated by CRLF or CR;
 wenzelm parents: 
14561diff
changeset | 481 | if no_explode chs then chs | 
| 27835 
ff8b8513965a
Symbol.source/OuterLex.source: more explicit do_recover argument;
 wenzelm parents: 
27766diff
changeset | 482 |     else Source.exhaust (source {do_recover = false} (Source.of_list chs))
 | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 483 | end; | 
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 484 | |
| 23676 
ea9b7e9c2301
scan: changed treatment of malformed symbols, passed to next stage;
 wenzelm parents: 
23618diff
changeset | 485 | end; | 
| 14994 | 486 | |
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 487 | |
| 14977 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 488 | (* escape *) | 
| 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 489 | |
| 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 490 | val escape = implode o map (fn s => if is_char s then s else "\\" ^ s) o sym_explode; | 
| 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 491 | |
| 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 492 | |
| 14678 | 493 | (* blanks *) | 
| 494 | ||
| 495 | fun strip_blanks s = | |
| 496 | sym_explode s | |
| 497 | |> Library.take_prefix is_blank |> #2 | |
| 498 | |> Library.take_suffix is_blank |> #1 | |
| 499 | |> implode; | |
| 500 | ||
| 501 | ||
| 502 | (* bump string -- treat as base 26 or base 1 numbers *) | |
| 503 | ||
| 15979 | 504 | fun symbolic_end (_ :: "\\<^isub>" :: _) = true | 
| 505 | | symbolic_end (_ :: "\\<^isup>" :: _) = true | |
| 14908 | 506 | | symbolic_end (s :: _) = is_symbolic s | 
| 507 | | symbolic_end [] = false; | |
| 14678 | 508 | |
| 509 | fun bump_init str = | |
| 14908 | 510 | if symbolic_end (rev (sym_explode str)) then str ^ "'" | 
| 14678 | 511 | else str ^ "a"; | 
| 12904 | 512 | |
| 513 | fun bump_string str = | |
| 514 | let | |
| 515 | fun bump [] = ["a"] | |
| 516 |       | bump ("z" :: ss) = "a" :: bump ss
 | |
| 517 | | bump (s :: ss) = | |
| 14678 | 518 | if is_char s andalso ord "a" <= ord s andalso ord s < ord "z" | 
| 12904 | 519 | then chr (ord s + 1) :: ss | 
| 520 | else "a" :: s :: ss; | |
| 14678 | 521 | |
| 522 | val (ss, qs) = apfst rev (Library.take_suffix is_quasi (sym_explode str)); | |
| 14908 | 523 | val ss' = if symbolic_end ss then "'" :: ss else bump ss; | 
| 14678 | 524 | in implode (rev ss' @ qs) end; | 
| 525 | ||
| 12904 | 526 | |
| 6272 | 527 | |
| 29324 | 528 | (** symbol output **) | 
| 14977 
77d88064991a
added escape, export encode_raw, default mode now trivial, tuned;
 wenzelm parents: 
14961diff
changeset | 529 | |
| 29324 | 530 | (* length *) | 
| 6272 | 531 | |
| 14678 | 532 | fun sym_len s = | 
| 24593 
1547ea587f5a
added some int constraints (ML_Parse.fix_ints not active here);
 wenzelm parents: 
24580diff
changeset | 533 | if not (is_printable s) then (0: int) | 
| 14678 | 534 | else if String.isPrefix "\\<long" s then 2 | 
| 535 | else if String.isPrefix "\\<Long" s then 2 | |
| 536 | else if s = "\\<spacespace>" then 2 | |
| 537 | else 1; | |
| 538 | ||
| 19473 | 539 | fun sym_length ss = fold (fn s => fn n => sym_len s + n) ss 0; | 
| 14678 | 540 | |
| 29324 | 541 | |
| 542 | (* print mode *) | |
| 543 | ||
| 544 | val xsymbolsN = "xsymbols"; | |
| 545 | ||
| 546 | fun output s = (s, sym_length (sym_explode s)); | |
| 547 | ||
| 548 | ||
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 549 | (*final declarations of this structure!*) | 
| 29324 | 550 | val explode = sym_explode; | 
| 6272 | 551 | val length = sym_length; | 
| 6116 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 552 | |
| 
8ba2f25610f7
files scan.ML, source.ML, symbol.ML, pretty.ML moved to Pure/General;
 wenzelm parents: diff
changeset | 553 | end; |