--- a/NEWS Fri Jun 25 12:15:49 2010 +0200
+++ b/NEWS Fri Jun 25 14:05:49 2010 +0200
@@ -4,6 +4,17 @@
New in this Isabelle version
----------------------------
+*** General ***
+
+* Explicit treatment of UTF8 sequences as Isabelle symbols, such that
+a Unicode character is treated as a single symbol, not a sequence of
+non-ASCII bytes as before. Since Isabelle/ML string literals may
+contain symbols without further backslash escapes, Unicode can now be
+used here as well. Recall that Symbol.explode in ML provides a
+consistent view on symbols, while raw explode (or String.explode)
+merely give a byte-oriented representation.
+
+
*** HOL ***
* Some previously unqualified names have been qualified:
--- a/doc-src/IsarImplementation/Thy/Prelim.thy Fri Jun 25 12:15:49 2010 +0200
+++ b/doc-src/IsarImplementation/Thy/Prelim.thy Fri Jun 25 14:05:49 2010 +0200
@@ -527,8 +527,10 @@
\begin{enumerate}
- \item a single ASCII character ``@{text "c"}'' or raw byte in the
- range of 128\dots 255, for example ``\verb,a,'',
+ \item a single ASCII character ``@{text "c"}'', for example
+ ``\verb,a,'',
+
+ \item a codepoint according to UTF8 (non-ASCII byte sequence),
\item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'',
for example ``\verb,\,\verb,<alpha>,'',
@@ -555,19 +557,17 @@
``\verb,\,\verb,<alpha>,'' is classified as a letter, which means it
may occur within regular Isabelle identifiers.
- Since the character set underlying Isabelle symbols is 7-bit ASCII
- and 8-bit characters are passed through transparently, Isabelle can
- also process Unicode/UCS data in UTF-8 encoding.\footnote{When
- counting precise source positions internally, bytes in the range of
- 128\dots 191 are ignored. In UTF-8 encoding, this interval covers
- the additional trailer bytes, so Isabelle happens to count Unicode
- characters here, not bytes in memory. In ISO-Latin encoding, the
- ignored range merely includes some extra punctuation characters that
- even have replacements within the standard collection of Isabelle
- symbols; the accented letters range is counted properly.} Unicode
- provides its own collection of mathematical symbols, but within the
- core Isabelle/ML world there is no link to the standard collection
- of Isabelle regular symbols.
+ The character set underlying Isabelle symbols is 7-bit ASCII, but
+ 8-bit character sequences are passed-through unchanged. Unicode/UCS
+ data in UTF-8 encoding is processed in a non-strict fashion, such
+ that well-formed code sequences are recognized
+ accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+ in some special punctuation characters that even have replacements
+ within the standard collection of Isabelle symbols. Text consisting
+ of ASCII plus accented letters can be processed in either encoding.}
+ Unicode provides its own collection of mathematical symbols, but
+ within the core Isabelle/ML world there is no link to the standard
+ collection of Isabelle regular symbols.
\medskip Output of Isabelle symbols depends on the print mode
(\secref{print-mode}). For example, the standard {\LaTeX} setup of
@@ -612,8 +612,8 @@
\item @{ML_type "Symbol.sym"} is a concrete datatype that represents
the different kinds of symbols explicitly, with constructors @{ML
- "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.Ctrl"}, @{ML
- "Symbol.Raw"}.
+ "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.UTF8"}, @{ML
+ "Symbol.Ctrl"}, @{ML "Symbol.Raw"}.
\item @{ML "Symbol.decode"} converts the string representation of a
symbol into the datatype version.
--- a/doc-src/IsarImplementation/Thy/document/Prelim.tex Fri Jun 25 12:15:49 2010 +0200
+++ b/doc-src/IsarImplementation/Thy/document/Prelim.tex Fri Jun 25 14:05:49 2010 +0200
@@ -648,8 +648,10 @@
\begin{enumerate}
- \item a single ASCII character ``\isa{c}'' or raw byte in the
- range of 128\dots 255, for example ``\verb,a,'',
+ \item a single ASCII character ``\isa{c}'', for example
+ ``\verb,a,'',
+
+ \item a codepoint according to UTF8 (non-ASCII byte sequence),
\item a regular symbol ``\verb,\,\verb,<,\isa{ident}\verb,>,'',
for example ``\verb,\,\verb,<alpha>,'',
@@ -673,19 +675,17 @@
``\verb,\,\verb,<alpha>,'' is classified as a letter, which means it
may occur within regular Isabelle identifiers.
- Since the character set underlying Isabelle symbols is 7-bit ASCII
- and 8-bit characters are passed through transparently, Isabelle can
- also process Unicode/UCS data in UTF-8 encoding.\footnote{When
- counting precise source positions internally, bytes in the range of
- 128\dots 191 are ignored. In UTF-8 encoding, this interval covers
- the additional trailer bytes, so Isabelle happens to count Unicode
- characters here, not bytes in memory. In ISO-Latin encoding, the
- ignored range merely includes some extra punctuation characters that
- even have replacements within the standard collection of Isabelle
- symbols; the accented letters range is counted properly.} Unicode
- provides its own collection of mathematical symbols, but within the
- core Isabelle/ML world there is no link to the standard collection
- of Isabelle regular symbols.
+ The character set underlying Isabelle symbols is 7-bit ASCII, but
+ 8-bit character sequences are passed-through unchanged. Unicode/UCS
+ data in UTF-8 encoding is processed in a non-strict fashion, such
+ that well-formed code sequences are recognized
+ accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+ in some special punctuation characters that even have replacements
+ within the standard collection of Isabelle symbols. Text consisting
+ of ASCII plus accented letters can be processed in either encoding.}
+ Unicode provides its own collection of mathematical symbols, but
+ within the core Isabelle/ML world there is no link to the standard
+ collection of Isabelle regular symbols.
\medskip Output of Isabelle symbols depends on the print mode
(\secref{print-mode}). For example, the standard {\LaTeX} setup of
@@ -733,7 +733,7 @@
\cite{isabelle-isar-ref}.
\item \verb|Symbol.sym| is a concrete datatype that represents
- the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|.
+ the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.UTF8|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|.
\item \verb|Symbol.decode| converts the string representation of a
symbol into the datatype version.
--- a/src/Pure/General/position.ML Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/General/position.ML Fri Jun 25 14:05:49 2010 +0200
@@ -67,8 +67,8 @@
fun advance_count "\n" (i: int, j: int, k: int) =
(if_valid i (i + 1), if_valid j 1, if_valid k (k + 1))
| advance_count s (i, j, k) =
- if Symbol.is_regular s andalso not (Symbol.is_utf8_trailer s)
- then (i, if_valid j (j + 1), if_valid k (k + 1)) else (i, j, k);
+ if Symbol.is_regular s then (i, if_valid j (j + 1), if_valid k (k + 1))
+ else (i, j, k);
fun invalid_count (i, j, k) =
not (valid i orelse valid j orelse valid k);
--- a/src/Pure/General/symbol.ML Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/General/symbol.ML Fri Jun 25 14:05:49 2010 +0200
@@ -15,9 +15,9 @@
val space: symbol
val spaces: int -> string
val is_char: symbol -> bool
+ val is_utf8: symbol -> bool
val is_symbolic: symbol -> bool
val is_printable: symbol -> bool
- val is_utf8_trailer: symbol -> bool
val eof: symbol
val is_eof: symbol -> bool
val not_eof: symbol -> bool
@@ -42,7 +42,7 @@
val is_raw: symbol -> bool
val decode_raw: symbol -> string
val encode_raw: string -> string
- datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
+ datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
val decode: symbol -> sym
datatype kind = Letter | Digit | Quasi | Blank | Other
val kind: symbol -> kind
@@ -108,14 +108,14 @@
fun is_char s = size s = 1;
+fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
+
fun is_symbolic s =
String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
fun is_printable s =
if is_char s then ord space <= ord s andalso ord s <= ord "~"
- else not (String.isPrefix "\\<^" s);
-
-fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
+ else is_utf8 s orelse not (String.isPrefix "\\<^" s);
(* input source control *)
@@ -224,10 +224,12 @@
(* symbol variants *)
-datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;
+datatype sym =
+ Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
fun decode s =
if is_char s then Char s
+ else if is_utf8 s then UTF8 s
else if is_raw s then Raw (decode_raw s)
else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
@@ -426,7 +428,9 @@
local
-fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s;
+fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\";
+
+fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
val scan_encoded_newline =
$$ "\^M" -- $$ "\n" >> K "\n" ||
@@ -439,6 +443,7 @@
val scan =
Scan.one is_plain ||
+ Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode ||
scan_encoded_newline ||
($$ "\\" ^^ $$ "<" ^^
!! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
@@ -471,7 +476,7 @@
fun no_explode [] = true
| no_explode ("\\" :: "<" :: _) = false
| no_explode ("\^M" :: _) = false
- | no_explode (_ :: cs) = no_explode cs;
+ | no_explode (c :: cs) = is_ascii c andalso no_explode cs;
in
@@ -486,7 +491,11 @@
(* escape *)
-val esc = fn s => if is_char s then s else "\\" ^ s;
+val esc = fn s =>
+ if is_char s then s
+ else if is_utf8 s then translate_string (fn c => "\\" ^ string_of_int (ord c)) s
+ else "\\" ^ s;
+
val escape = implode o map esc o sym_explode;
--- a/src/Pure/ML/ml_syntax.ML Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/ML/ml_syntax.ML Fri Jun 25 14:05:49 2010 +0200
@@ -26,6 +26,7 @@
val print_sort: sort -> string
val print_typ: typ -> string
val print_term: term -> string
+ val pretty_string: string -> Pretty.T
end;
structure ML_Syntax: ML_SYNTAX =
@@ -92,4 +93,15 @@
"Abs (" ^ print_string s ^ ", " ^ print_typ T ^ ", " ^ print_term t ^ ")"
| print_term (t1 $ t2) = atomic (print_term t1) ^ " $ " ^ atomic (print_term t2);
+
+(* toplevel pretty printing *)
+
+fun pretty_string raw_str =
+ let
+ val str =
+ implode (map (fn XML.Elem _ => "<markup>" | XML.Text s => s) (YXML.parse_body raw_str))
+ handle Fail _ => raw_str;
+ val syms = Symbol.explode str handle ERROR _ => explode str;
+ in Pretty.str (quote (implode (map print_char syms))) end;
+
end;
--- a/src/Pure/Thy/latex.ML Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/Thy/latex.ML Fri Jun 25 14:05:49 2010 +0200
@@ -77,6 +77,7 @@
fun output_known_sym (known_sym, known_ctrl) sym =
(case Symbol.decode sym of
Symbol.Char s => output_chr s
+ | Symbol.UTF8 s => s
| Symbol.Sym s => if known_sym s then enclose "{\\isasym" "}" s else output_chrs sym
| Symbol.Ctrl s => if known_ctrl s then enclose "\\isactrl" " " s else output_chrs sym
| Symbol.Raw s => s);
--- a/src/Pure/pure_setup.ML Fri Jun 25 12:15:49 2010 +0200
+++ b/src/Pure/pure_setup.ML Fri Jun 25 14:05:49 2010 +0200
@@ -18,6 +18,7 @@
(* ML toplevel pretty printing *)
+toplevel_pp ["String", "string"] "ML_Syntax.pretty_string";
toplevel_pp ["Pretty", "T"] "(fn _: Pretty.T => Pretty.str \"<pretty>\")";
toplevel_pp ["Task_Queue", "task"] "Pretty.str o Task_Queue.str_of_task";
toplevel_pp ["Task_Queue", "group"] "Pretty.str o Task_Queue.str_of_group";