author | wenzelm |
Sat, 24 Jul 2021 15:38:41 +0200 | |
changeset 74056 | fb8d5c0133c9 |
parent 63936 | b87784e19a77 |
child 74170 | 09d4175f473e |
permissions | -rw-r--r-- |
63936 | 1 |
(* Title: Pure/General/symbol_explode.ML |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
2 |
Author: Makarius |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
3 |
|
63936 | 4 |
String explode operation for Isabelle symbols (see also symbol.ML). |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
5 |
*) |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
6 |
|
63936 | 7 |
structure Symbol: sig val explode: string -> string list end = |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
8 |
struct |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
9 |
|
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
10 |
fun is_ascii_letter c = #"A" <= c andalso c <= #"Z" orelse #"a" <= c andalso c <= #"z"; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
11 |
fun is_ascii_letdig c = |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
12 |
is_ascii_letter c orelse #"0" <= c andalso c <= #"9" orelse c = #"_" orelse c = #"'"; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
13 |
|
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
14 |
fun is_utf8 c = c >= #"\128"; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
15 |
fun is_utf8_trailer c = #"\128" <= c andalso c < #"\192"; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
16 |
fun is_utf8_control c = #"\128" <= c andalso c < #"\160"; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
17 |
|
63936 | 18 |
fun explode string = |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
19 |
let |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
20 |
fun char i = String.sub (string, i); |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
21 |
fun string_range i j = String.substring (string, i, j - i); |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
22 |
|
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
23 |
val n = size string; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
24 |
fun test pred i = i < n andalso pred (char i); |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
25 |
fun many pred i = if test pred i then many pred (i + 1) else i; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
26 |
fun maybe pred i = if test pred i then i + 1 else i; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
27 |
fun maybe_char c = maybe (fn c' => c = c'); |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
28 |
fun maybe_ascii_id i = if test is_ascii_letter i then many is_ascii_letdig (i + 1) else i; |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
29 |
|
63936 | 30 |
fun scan i = |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
31 |
if i < n then |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
32 |
let val ch = char i in |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
33 |
(*encoded newline*) |
63936 | 34 |
if ch = #"\^M" then "\n" :: scan (maybe_char #"\n" (i + 1)) |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
35 |
(*pseudo utf8: encoded ascii control*) |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
36 |
else if ch = #"\192" andalso |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
37 |
test is_utf8_control (i + 1) andalso not (test is_utf8 (i + 2)) |
63936 | 38 |
then chr (Char.ord (char (i + 1)) - 128) :: scan (i + 2) |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
39 |
(*utf8*) |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
40 |
else if is_utf8 ch then |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
41 |
let val j = many is_utf8_trailer (i + 1) |
63936 | 42 |
in string_range i j :: scan j end |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
43 |
(*named symbol*) |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
44 |
else if ch = #"\\" andalso test (fn c => c = #"<") (i + 1) then |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
45 |
let val j = (i + 2) |> maybe_char #"^" |> maybe_ascii_id |> maybe_char #">" |
63936 | 46 |
in string_range i j :: scan j end |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
47 |
(*single character*) |
63936 | 48 |
else String.str ch :: scan (i + 1) |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
49 |
end |
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
50 |
else []; |
63936 | 51 |
in scan 0 end; |
63932
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
52 |
|
6040db6b929d
more tight implementation of symbol explode operation (without support for raw symbols);
wenzelm
parents:
diff
changeset
|
53 |
end; |