src/Pure/General/symbol.ML
changeset 37533 d775bd70f571
parent 34095 c2f176a38448
child 37534 62eb9d03b221
--- a/src/Pure/General/symbol.ML	Thu Jun 24 16:27:40 2010 +0100
+++ b/src/Pure/General/symbol.ML	Thu Jun 24 21:57:18 2010 +0200
@@ -15,9 +15,9 @@
   val space: symbol
   val spaces: int -> string
   val is_char: symbol -> bool
+  val is_utf8: symbol -> bool
   val is_symbolic: symbol -> bool
   val is_printable: symbol -> bool
-  val is_utf8_trailer: symbol -> bool
   val eof: symbol
   val is_eof: symbol -> bool
   val not_eof: symbol -> bool
@@ -42,7 +42,7 @@
   val is_raw: symbol -> bool
   val decode_raw: symbol -> string
   val encode_raw: string -> string
-  datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
+  datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
   val decode: symbol -> sym
   datatype kind = Letter | Digit | Quasi | Blank | Other
   val kind: symbol -> kind
@@ -108,14 +108,14 @@
 
 fun is_char s = size s = 1;
 
+fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
+
 fun is_symbolic s =
   String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
 
 fun is_printable s =
   if is_char s then ord space <= ord s andalso ord s <= ord "~"
-  else not (String.isPrefix "\\<^" s);
-
-fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
+  else is_utf8 s orelse not (String.isPrefix "\\<^" s);
 
 
 (* input source control *)
@@ -224,10 +224,12 @@
 
 (* symbol variants *)
 
-datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;
+datatype sym =
+  Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
 
 fun decode s =
   if is_char s then Char s
+  else if is_utf8 s then UTF8 s
   else if is_raw s then Raw (decode_raw s)
   else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
   else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
@@ -426,7 +428,9 @@
 
 local
 
-fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s;
+fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\";
+
+fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
 
 val scan_encoded_newline =
   $$ "\^M" -- $$ "\n" >> K "\n" ||
@@ -439,6 +443,7 @@
 
 val scan =
   Scan.one is_plain ||
+  Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode ||
   scan_encoded_newline ||
   ($$ "\\" ^^ $$ "<" ^^
     !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
@@ -471,7 +476,7 @@
 fun no_explode [] = true
   | no_explode ("\\" :: "<" :: _) = false
   | no_explode ("\^M" :: _) = false
-  | no_explode (_ :: cs) = no_explode cs;
+  | no_explode (c :: cs) = is_ascii c andalso no_explode cs;
 
 in