--- a/src/Pure/General/symbol.ML Thu Jun 24 16:27:40 2010 +0100
+++ b/src/Pure/General/symbol.ML Thu Jun 24 21:57:18 2010 +0200
@@ -15,9 +15,9 @@
val space: symbol
val spaces: int -> string
val is_char: symbol -> bool
+ val is_utf8: symbol -> bool
val is_symbolic: symbol -> bool
val is_printable: symbol -> bool
- val is_utf8_trailer: symbol -> bool
val eof: symbol
val is_eof: symbol -> bool
val not_eof: symbol -> bool
@@ -42,7 +42,7 @@
val is_raw: symbol -> bool
val decode_raw: symbol -> string
val encode_raw: string -> string
- datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
+ datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
val decode: symbol -> sym
datatype kind = Letter | Digit | Quasi | Blank | Other
val kind: symbol -> kind
@@ -108,14 +108,14 @@
fun is_char s = size s = 1;
+fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
+
fun is_symbolic s =
String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
fun is_printable s =
if is_char s then ord space <= ord s andalso ord s <= ord "~"
- else not (String.isPrefix "\\<^" s);
-
-fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
+ else is_utf8 s orelse not (String.isPrefix "\\<^" s);
(* input source control *)
@@ -224,10 +224,12 @@
(* symbol variants *)
-datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;
+datatype sym =
+ Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
fun decode s =
if is_char s then Char s
+ else if is_utf8 s then UTF8 s
else if is_raw s then Raw (decode_raw s)
else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
@@ -426,7 +428,9 @@
local
-fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s;
+fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\";
+
+fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
val scan_encoded_newline =
$$ "\^M" -- $$ "\n" >> K "\n" ||
@@ -439,6 +443,7 @@
val scan =
Scan.one is_plain ||
+ Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode ||
scan_encoded_newline ||
($$ "\\" ^^ $$ "<" ^^
!! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
@@ -471,7 +476,7 @@
fun no_explode [] = true
| no_explode ("\\" :: "<" :: _) = false
| no_explode ("\^M" :: _) = false
- | no_explode (_ :: cs) = no_explode cs;
+ | no_explode (c :: cs) = is_ascii c andalso no_explode cs;
in