# HG changeset patch # User wenzelm # Date 1277467549 -7200 # Node ID 6c7399bc0d10577829e8b4a5144530b3ddf8541c # Parent 08fc6b026b018c5eb43993da4999858afcd7a366# Parent c62aa9281101f8924a57489ddc228fc4f9eed9c1 merged diff -r 08fc6b026b01 -r 6c7399bc0d10 NEWS --- a/NEWS Fri Jun 25 12:15:49 2010 +0200 +++ b/NEWS Fri Jun 25 14:05:49 2010 +0200 @@ -4,6 +4,17 @@ New in this Isabelle version ---------------------------- +*** General *** + +* Explicit treatment of UTF8 sequences as Isabelle symbols, such that +a Unicode character is treated as a single symbol, not a sequence of +non-ASCII bytes as before. Since Isabelle/ML string literals may +contain symbols without further backslash escapes, Unicode can now be +used here as well. Recall that Symbol.explode in ML provides a +consistent view on symbols, while raw explode (or String.explode) +merely give a byte-oriented representation. + + *** HOL *** * Some previously unqualified names have been qualified: diff -r 08fc6b026b01 -r 6c7399bc0d10 doc-src/IsarImplementation/Thy/Prelim.thy --- a/doc-src/IsarImplementation/Thy/Prelim.thy Fri Jun 25 12:15:49 2010 +0200 +++ b/doc-src/IsarImplementation/Thy/Prelim.thy Fri Jun 25 14:05:49 2010 +0200 @@ -527,8 +527,10 @@ \begin{enumerate} - \item a single ASCII character ``@{text "c"}'' or raw byte in the - range of 128\dots 255, for example ``\verb,a,'', + \item a single ASCII character ``@{text "c"}'', for example + ``\verb,a,'', + + \item a codepoint according to UTF8 (non-ASCII byte sequence), \item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'', for example ``\verb,\,\verb,,'', @@ -555,19 +557,17 @@ ``\verb,\,\verb,,'' is classified as a letter, which means it may occur within regular Isabelle identifiers. - Since the character set underlying Isabelle symbols is 7-bit ASCII - and 8-bit characters are passed through transparently, Isabelle can - also process Unicode/UCS data in UTF-8 encoding.\footnote{When - counting precise source positions internally, bytes in the range of - 128\dots 191 are ignored. In UTF-8 encoding, this interval covers - the additional trailer bytes, so Isabelle happens to count Unicode - characters here, not bytes in memory. In ISO-Latin encoding, the - ignored range merely includes some extra punctuation characters that - even have replacements within the standard collection of Isabelle - symbols; the accented letters range is counted properly.} Unicode - provides its own collection of mathematical symbols, but within the - core Isabelle/ML world there is no link to the standard collection - of Isabelle regular symbols. + The character set underlying Isabelle symbols is 7-bit ASCII, but + 8-bit character sequences are passed-through unchanged. Unicode/UCS + data in UTF-8 encoding is processed in a non-strict fashion, such + that well-formed code sequences are recognized + accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only + in some special punctuation characters that even have replacements + within the standard collection of Isabelle symbols. Text consisting + of ASCII plus accented letters can be processed in either encoding.} + Unicode provides its own collection of mathematical symbols, but + within the core Isabelle/ML world there is no link to the standard + collection of Isabelle regular symbols. \medskip Output of Isabelle symbols depends on the print mode (\secref{print-mode}). For example, the standard {\LaTeX} setup of @@ -612,8 +612,8 @@ \item @{ML_type "Symbol.sym"} is a concrete datatype that represents the different kinds of symbols explicitly, with constructors @{ML - "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.Ctrl"}, @{ML - "Symbol.Raw"}. + "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.UTF8"}, @{ML + "Symbol.Ctrl"}, @{ML "Symbol.Raw"}. \item @{ML "Symbol.decode"} converts the string representation of a symbol into the datatype version. diff -r 08fc6b026b01 -r 6c7399bc0d10 doc-src/IsarImplementation/Thy/document/Prelim.tex --- a/doc-src/IsarImplementation/Thy/document/Prelim.tex Fri Jun 25 12:15:49 2010 +0200 +++ b/doc-src/IsarImplementation/Thy/document/Prelim.tex Fri Jun 25 14:05:49 2010 +0200 @@ -648,8 +648,10 @@ \begin{enumerate} - \item a single ASCII character ``\isa{c}'' or raw byte in the - range of 128\dots 255, for example ``\verb,a,'', + \item a single ASCII character ``\isa{c}'', for example + ``\verb,a,'', + + \item a codepoint according to UTF8 (non-ASCII byte sequence), \item a regular symbol ``\verb,\,\verb,<,\isa{ident}\verb,>,'', for example ``\verb,\,\verb,,'', @@ -673,19 +675,17 @@ ``\verb,\,\verb,,'' is classified as a letter, which means it may occur within regular Isabelle identifiers. - Since the character set underlying Isabelle symbols is 7-bit ASCII - and 8-bit characters are passed through transparently, Isabelle can - also process Unicode/UCS data in UTF-8 encoding.\footnote{When - counting precise source positions internally, bytes in the range of - 128\dots 191 are ignored. In UTF-8 encoding, this interval covers - the additional trailer bytes, so Isabelle happens to count Unicode - characters here, not bytes in memory. In ISO-Latin encoding, the - ignored range merely includes some extra punctuation characters that - even have replacements within the standard collection of Isabelle - symbols; the accented letters range is counted properly.} Unicode - provides its own collection of mathematical symbols, but within the - core Isabelle/ML world there is no link to the standard collection - of Isabelle regular symbols. + The character set underlying Isabelle symbols is 7-bit ASCII, but + 8-bit character sequences are passed-through unchanged. Unicode/UCS + data in UTF-8 encoding is processed in a non-strict fashion, such + that well-formed code sequences are recognized + accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only + in some special punctuation characters that even have replacements + within the standard collection of Isabelle symbols. Text consisting + of ASCII plus accented letters can be processed in either encoding.} + Unicode provides its own collection of mathematical symbols, but + within the core Isabelle/ML world there is no link to the standard + collection of Isabelle regular symbols. \medskip Output of Isabelle symbols depends on the print mode (\secref{print-mode}). For example, the standard {\LaTeX} setup of @@ -733,7 +733,7 @@ \cite{isabelle-isar-ref}. \item \verb|Symbol.sym| is a concrete datatype that represents - the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|. + the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.UTF8|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|. \item \verb|Symbol.decode| converts the string representation of a symbol into the datatype version. diff -r 08fc6b026b01 -r 6c7399bc0d10 src/Pure/General/position.ML --- a/src/Pure/General/position.ML Fri Jun 25 12:15:49 2010 +0200 +++ b/src/Pure/General/position.ML Fri Jun 25 14:05:49 2010 +0200 @@ -67,8 +67,8 @@ fun advance_count "\n" (i: int, j: int, k: int) = (if_valid i (i + 1), if_valid j 1, if_valid k (k + 1)) | advance_count s (i, j, k) = - if Symbol.is_regular s andalso not (Symbol.is_utf8_trailer s) - then (i, if_valid j (j + 1), if_valid k (k + 1)) else (i, j, k); + if Symbol.is_regular s then (i, if_valid j (j + 1), if_valid k (k + 1)) + else (i, j, k); fun invalid_count (i, j, k) = not (valid i orelse valid j orelse valid k); diff -r 08fc6b026b01 -r 6c7399bc0d10 src/Pure/General/symbol.ML --- a/src/Pure/General/symbol.ML Fri Jun 25 12:15:49 2010 +0200 +++ b/src/Pure/General/symbol.ML Fri Jun 25 14:05:49 2010 +0200 @@ -15,9 +15,9 @@ val space: symbol val spaces: int -> string val is_char: symbol -> bool + val is_utf8: symbol -> bool val is_symbolic: symbol -> bool val is_printable: symbol -> bool - val is_utf8_trailer: symbol -> bool val eof: symbol val is_eof: symbol -> bool val not_eof: symbol -> bool @@ -42,7 +42,7 @@ val is_raw: symbol -> bool val decode_raw: symbol -> string val encode_raw: string -> string - datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string + datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string val decode: symbol -> sym datatype kind = Letter | Digit | Quasi | Blank | Other val kind: symbol -> kind @@ -108,14 +108,14 @@ fun is_char s = size s = 1; +fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s; + fun is_symbolic s = String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s); fun is_printable s = if is_char s then ord space <= ord s andalso ord s <= ord "~" - else not (String.isPrefix "\\<^" s); - -fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192; + else is_utf8 s orelse not (String.isPrefix "\\<^" s); (* input source control *) @@ -224,10 +224,12 @@ (* symbol variants *) -datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string; +datatype sym = + Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string; fun decode s = if is_char s then Char s + else if is_utf8 s then UTF8 s else if is_raw s then Raw (decode_raw s) else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4)) else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3)) @@ -426,7 +428,9 @@ local -fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s; +fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\"; + +fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192; val scan_encoded_newline = $$ "\^M" -- $$ "\n" >> K "\n" || @@ -439,6 +443,7 @@ val scan = Scan.one is_plain || + Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode || scan_encoded_newline || ($$ "\\" ^^ $$ "<" ^^ !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs))) @@ -471,7 +476,7 @@ fun no_explode [] = true | no_explode ("\\" :: "<" :: _) = false | no_explode ("\^M" :: _) = false - | no_explode (_ :: cs) = no_explode cs; + | no_explode (c :: cs) = is_ascii c andalso no_explode cs; in @@ -486,7 +491,11 @@ (* escape *) -val esc = fn s => if is_char s then s else "\\" ^ s; +val esc = fn s => + if is_char s then s + else if is_utf8 s then translate_string (fn c => "\\" ^ string_of_int (ord c)) s + else "\\" ^ s; + val escape = implode o map esc o sym_explode; diff -r 08fc6b026b01 -r 6c7399bc0d10 src/Pure/ML/ml_syntax.ML --- a/src/Pure/ML/ml_syntax.ML Fri Jun 25 12:15:49 2010 +0200 +++ b/src/Pure/ML/ml_syntax.ML Fri Jun 25 14:05:49 2010 +0200 @@ -26,6 +26,7 @@ val print_sort: sort -> string val print_typ: typ -> string val print_term: term -> string + val pretty_string: string -> Pretty.T end; structure ML_Syntax: ML_SYNTAX = @@ -92,4 +93,15 @@ "Abs (" ^ print_string s ^ ", " ^ print_typ T ^ ", " ^ print_term t ^ ")" | print_term (t1 $ t2) = atomic (print_term t1) ^ " $ " ^ atomic (print_term t2); + +(* toplevel pretty printing *) + +fun pretty_string raw_str = + let + val str = + implode (map (fn XML.Elem _ => "" | XML.Text s => s) (YXML.parse_body raw_str)) + handle Fail _ => raw_str; + val syms = Symbol.explode str handle ERROR _ => explode str; + in Pretty.str (quote (implode (map print_char syms))) end; + end; diff -r 08fc6b026b01 -r 6c7399bc0d10 src/Pure/Thy/latex.ML --- a/src/Pure/Thy/latex.ML Fri Jun 25 12:15:49 2010 +0200 +++ b/src/Pure/Thy/latex.ML Fri Jun 25 14:05:49 2010 +0200 @@ -77,6 +77,7 @@ fun output_known_sym (known_sym, known_ctrl) sym = (case Symbol.decode sym of Symbol.Char s => output_chr s + | Symbol.UTF8 s => s | Symbol.Sym s => if known_sym s then enclose "{\\isasym" "}" s else output_chrs sym | Symbol.Ctrl s => if known_ctrl s then enclose "\\isactrl" " " s else output_chrs sym | Symbol.Raw s => s); diff -r 08fc6b026b01 -r 6c7399bc0d10 src/Pure/pure_setup.ML --- a/src/Pure/pure_setup.ML Fri Jun 25 12:15:49 2010 +0200 +++ b/src/Pure/pure_setup.ML Fri Jun 25 14:05:49 2010 +0200 @@ -18,6 +18,7 @@ (* ML toplevel pretty printing *) +toplevel_pp ["String", "string"] "ML_Syntax.pretty_string"; toplevel_pp ["Pretty", "T"] "(fn _: Pretty.T => Pretty.str \"\")"; toplevel_pp ["Task_Queue", "task"] "Pretty.str o Task_Queue.str_of_task"; toplevel_pp ["Task_Queue", "group"] "Pretty.str o Task_Queue.str_of_group";