merged
authorwenzelm
Fri, 25 Jun 2010 14:05:49 +0200
changeset 37554 6c7399bc0d10
parent 37553 08fc6b026b01 (current diff)
parent 37536 c62aa9281101 (diff)
child 37555 d57d0f581d38
child 37559 9118ce1d1750
merged
--- a/NEWS	Fri Jun 25 12:15:49 2010 +0200
+++ b/NEWS	Fri Jun 25 14:05:49 2010 +0200
@@ -4,6 +4,17 @@
 New in this Isabelle version
 ----------------------------
 
+*** General ***
+
+* Explicit treatment of UTF8 sequences as Isabelle symbols, such that
+a Unicode character is treated as a single symbol, not a sequence of
+non-ASCII bytes as before.  Since Isabelle/ML string literals may
+contain symbols without further backslash escapes, Unicode can now be
+used here as well.  Recall that Symbol.explode in ML provides a
+consistent view on symbols, while raw explode (or String.explode)
+merely give a byte-oriented representation.
+
+
 *** HOL ***
 
 * Some previously unqualified names have been qualified:
--- a/doc-src/IsarImplementation/Thy/Prelim.thy	Fri Jun 25 12:15:49 2010 +0200
+++ b/doc-src/IsarImplementation/Thy/Prelim.thy	Fri Jun 25 14:05:49 2010 +0200
@@ -527,8 +527,10 @@
 
   \begin{enumerate}
 
-  \item a single ASCII character ``@{text "c"}'' or raw byte in the
-  range of 128\dots 255, for example ``\verb,a,'',
+  \item a single ASCII character ``@{text "c"}'', for example
+  ``\verb,a,'',
+
+  \item a codepoint according to UTF8 (non-ASCII byte sequence),
 
   \item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'',
   for example ``\verb,\,\verb,<alpha>,'',
@@ -555,19 +557,17 @@
   ``\verb,\,\verb,<alpha>,'' is classified as a letter, which means it
   may occur within regular Isabelle identifiers.
 
-  Since the character set underlying Isabelle symbols is 7-bit ASCII
-  and 8-bit characters are passed through transparently, Isabelle can
-  also process Unicode/UCS data in UTF-8 encoding.\footnote{When
-  counting precise source positions internally, bytes in the range of
-  128\dots 191 are ignored.  In UTF-8 encoding, this interval covers
-  the additional trailer bytes, so Isabelle happens to count Unicode
-  characters here, not bytes in memory.  In ISO-Latin encoding, the
-  ignored range merely includes some extra punctuation characters that
-  even have replacements within the standard collection of Isabelle
-  symbols; the accented letters range is counted properly.} Unicode
-  provides its own collection of mathematical symbols, but within the
-  core Isabelle/ML world there is no link to the standard collection
-  of Isabelle regular symbols.
+  The character set underlying Isabelle symbols is 7-bit ASCII, but
+  8-bit character sequences are passed-through unchanged.  Unicode/UCS
+  data in UTF-8 encoding is processed in a non-strict fashion, such
+  that well-formed code sequences are recognized
+  accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+  in some special punctuation characters that even have replacements
+  within the standard collection of Isabelle symbols.  Text consisting
+  of ASCII plus accented letters can be processed in either encoding.}
+  Unicode provides its own collection of mathematical symbols, but
+  within the core Isabelle/ML world there is no link to the standard
+  collection of Isabelle regular symbols.
 
   \medskip Output of Isabelle symbols depends on the print mode
   (\secref{print-mode}).  For example, the standard {\LaTeX} setup of
@@ -612,8 +612,8 @@
 
   \item @{ML_type "Symbol.sym"} is a concrete datatype that represents
   the different kinds of symbols explicitly, with constructors @{ML
-  "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.Ctrl"}, @{ML
-  "Symbol.Raw"}.
+  "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.UTF8"}, @{ML
+  "Symbol.Ctrl"}, @{ML "Symbol.Raw"}.
 
   \item @{ML "Symbol.decode"} converts the string representation of a
   symbol into the datatype version.
--- a/doc-src/IsarImplementation/Thy/document/Prelim.tex	Fri Jun 25 12:15:49 2010 +0200
+++ b/doc-src/IsarImplementation/Thy/document/Prelim.tex	Fri Jun 25 14:05:49 2010 +0200
@@ -648,8 +648,10 @@
 
   \begin{enumerate}
 
-  \item a single ASCII character ``\isa{c}'' or raw byte in the
-  range of 128\dots 255, for example ``\verb,a,'',
+  \item a single ASCII character ``\isa{c}'', for example
+  ``\verb,a,'',
+
+  \item a codepoint according to UTF8 (non-ASCII byte sequence),
 
   \item a regular symbol ``\verb,\,\verb,<,\isa{ident}\verb,>,'',
   for example ``\verb,\,\verb,<alpha>,'',
@@ -673,19 +675,17 @@
   ``\verb,\,\verb,<alpha>,'' is classified as a letter, which means it
   may occur within regular Isabelle identifiers.
 
-  Since the character set underlying Isabelle symbols is 7-bit ASCII
-  and 8-bit characters are passed through transparently, Isabelle can
-  also process Unicode/UCS data in UTF-8 encoding.\footnote{When
-  counting precise source positions internally, bytes in the range of
-  128\dots 191 are ignored.  In UTF-8 encoding, this interval covers
-  the additional trailer bytes, so Isabelle happens to count Unicode
-  characters here, not bytes in memory.  In ISO-Latin encoding, the
-  ignored range merely includes some extra punctuation characters that
-  even have replacements within the standard collection of Isabelle
-  symbols; the accented letters range is counted properly.} Unicode
-  provides its own collection of mathematical symbols, but within the
-  core Isabelle/ML world there is no link to the standard collection
-  of Isabelle regular symbols.
+  The character set underlying Isabelle symbols is 7-bit ASCII, but
+  8-bit character sequences are passed-through unchanged.  Unicode/UCS
+  data in UTF-8 encoding is processed in a non-strict fashion, such
+  that well-formed code sequences are recognized
+  accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+  in some special punctuation characters that even have replacements
+  within the standard collection of Isabelle symbols.  Text consisting
+  of ASCII plus accented letters can be processed in either encoding.}
+  Unicode provides its own collection of mathematical symbols, but
+  within the core Isabelle/ML world there is no link to the standard
+  collection of Isabelle regular symbols.
 
   \medskip Output of Isabelle symbols depends on the print mode
   (\secref{print-mode}).  For example, the standard {\LaTeX} setup of
@@ -733,7 +733,7 @@
   \cite{isabelle-isar-ref}.
 
   \item \verb|Symbol.sym| is a concrete datatype that represents
-  the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|.
+  the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.UTF8|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|.
 
   \item \verb|Symbol.decode| converts the string representation of a
   symbol into the datatype version.
--- a/src/Pure/General/position.ML	Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/General/position.ML	Fri Jun 25 14:05:49 2010 +0200
@@ -67,8 +67,8 @@
 fun advance_count "\n" (i: int, j: int, k: int) =
       (if_valid i (i + 1), if_valid j 1, if_valid k (k + 1))
   | advance_count s (i, j, k) =
-      if Symbol.is_regular s andalso not (Symbol.is_utf8_trailer s)
-      then (i, if_valid j (j + 1), if_valid k (k + 1)) else (i, j, k);
+      if Symbol.is_regular s then (i, if_valid j (j + 1), if_valid k (k + 1))
+      else (i, j, k);
 
 fun invalid_count (i, j, k) =
   not (valid i orelse valid j orelse valid k);
--- a/src/Pure/General/symbol.ML	Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/General/symbol.ML	Fri Jun 25 14:05:49 2010 +0200
@@ -15,9 +15,9 @@
   val space: symbol
   val spaces: int -> string
   val is_char: symbol -> bool
+  val is_utf8: symbol -> bool
   val is_symbolic: symbol -> bool
   val is_printable: symbol -> bool
-  val is_utf8_trailer: symbol -> bool
   val eof: symbol
   val is_eof: symbol -> bool
   val not_eof: symbol -> bool
@@ -42,7 +42,7 @@
   val is_raw: symbol -> bool
   val decode_raw: symbol -> string
   val encode_raw: string -> string
-  datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
+  datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
   val decode: symbol -> sym
   datatype kind = Letter | Digit | Quasi | Blank | Other
   val kind: symbol -> kind
@@ -108,14 +108,14 @@
 
 fun is_char s = size s = 1;
 
+fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
+
 fun is_symbolic s =
   String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
 
 fun is_printable s =
   if is_char s then ord space <= ord s andalso ord s <= ord "~"
-  else not (String.isPrefix "\\<^" s);
-
-fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
+  else is_utf8 s orelse not (String.isPrefix "\\<^" s);
 
 
 (* input source control *)
@@ -224,10 +224,12 @@
 
 (* symbol variants *)
 
-datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;
+datatype sym =
+  Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
 
 fun decode s =
   if is_char s then Char s
+  else if is_utf8 s then UTF8 s
   else if is_raw s then Raw (decode_raw s)
   else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
   else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
@@ -426,7 +428,9 @@
 
 local
 
-fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s;
+fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\";
+
+fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
 
 val scan_encoded_newline =
   $$ "\^M" -- $$ "\n" >> K "\n" ||
@@ -439,6 +443,7 @@
 
 val scan =
   Scan.one is_plain ||
+  Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode ||
   scan_encoded_newline ||
   ($$ "\\" ^^ $$ "<" ^^
     !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
@@ -471,7 +476,7 @@
 fun no_explode [] = true
   | no_explode ("\\" :: "<" :: _) = false
   | no_explode ("\^M" :: _) = false
-  | no_explode (_ :: cs) = no_explode cs;
+  | no_explode (c :: cs) = is_ascii c andalso no_explode cs;
 
 in
 
@@ -486,7 +491,11 @@
 
 (* escape *)
 
-val esc = fn s => if is_char s then s else "\\" ^ s;
+val esc = fn s =>
+  if is_char s then s
+  else if is_utf8 s then translate_string (fn c => "\\" ^ string_of_int (ord c)) s
+  else "\\" ^ s;
+
 val escape = implode o map esc o sym_explode;
 
 
--- a/src/Pure/ML/ml_syntax.ML	Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/ML/ml_syntax.ML	Fri Jun 25 14:05:49 2010 +0200
@@ -26,6 +26,7 @@
   val print_sort: sort -> string
   val print_typ: typ -> string
   val print_term: term -> string
+  val pretty_string: string -> Pretty.T
 end;
 
 structure ML_Syntax: ML_SYNTAX =
@@ -92,4 +93,15 @@
       "Abs (" ^ print_string s ^ ", " ^ print_typ T ^ ", " ^ print_term t ^ ")"
   | print_term (t1 $ t2) = atomic (print_term t1) ^ " $ " ^ atomic (print_term t2);
 
+
+(* toplevel pretty printing *)
+
+fun pretty_string raw_str =
+  let
+    val str =
+      implode (map (fn XML.Elem _ => "<markup>" | XML.Text s => s) (YXML.parse_body raw_str))
+        handle Fail _ => raw_str;
+    val syms = Symbol.explode str handle ERROR _ => explode str;
+  in Pretty.str (quote (implode (map print_char syms))) end;
+
 end;
--- a/src/Pure/Thy/latex.ML	Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/Thy/latex.ML	Fri Jun 25 14:05:49 2010 +0200
@@ -77,6 +77,7 @@
 fun output_known_sym (known_sym, known_ctrl) sym =
   (case Symbol.decode sym of
     Symbol.Char s => output_chr s
+  | Symbol.UTF8 s => s
   | Symbol.Sym s => if known_sym s then enclose "{\\isasym" "}" s else output_chrs sym
   | Symbol.Ctrl s => if known_ctrl s then enclose "\\isactrl" " " s else output_chrs sym
   | Symbol.Raw s => s);
--- a/src/Pure/pure_setup.ML	Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/pure_setup.ML	Fri Jun 25 14:05:49 2010 +0200
@@ -18,6 +18,7 @@
 
 (* ML toplevel pretty printing *)
 
+toplevel_pp ["String", "string"] "ML_Syntax.pretty_string";
 toplevel_pp ["Pretty", "T"] "(fn _: Pretty.T => Pretty.str \"<pretty>\")";
 toplevel_pp ["Task_Queue", "task"] "Pretty.str o Task_Queue.str_of_task";
 toplevel_pp ["Task_Queue", "group"] "Pretty.str o Task_Queue.str_of_group";