--- a/NEWS Fri Jun 25 23:35:14 2010 +0200
+++ b/NEWS Mon Jun 28 08:55:46 2010 +0200
@@ -4,6 +4,17 @@
New in this Isabelle version
----------------------------
+*** General ***
+
+* Explicit treatment of UTF8 sequences as Isabelle symbols, such that
+a Unicode character is treated as a single symbol, not a sequence of
+non-ASCII bytes as before. Since Isabelle/ML string literals may
+contain symbols without further backslash escapes, Unicode can now be
+used here as well. Recall that Symbol.explode in ML provides a
+consistent view on symbols, while raw explode (or String.explode)
+merely give a byte-oriented representation.
+
+
*** HOL ***
* Some previously unqualified names have been qualified:
--- a/doc-src/IsarImplementation/Thy/Prelim.thy Fri Jun 25 23:35:14 2010 +0200
+++ b/doc-src/IsarImplementation/Thy/Prelim.thy Mon Jun 28 08:55:46 2010 +0200
@@ -527,8 +527,10 @@
\begin{enumerate}
- \item a single ASCII character ``@{text "c"}'' or raw byte in the
- range of 128\dots 255, for example ``\verb,a,'',
+ \item a single ASCII character ``@{text "c"}'', for example
+ ``\verb,a,'',
+
+ \item a codepoint according to UTF8 (non-ASCII byte sequence),
\item a regular symbol ``\verb,\,\verb,<,@{text "ident"}\verb,>,'',
for example ``\verb,\,\verb,<alpha>,'',
@@ -555,19 +557,17 @@
``\verb,\,\verb,<alpha>,'' is classified as a letter, which means it
may occur within regular Isabelle identifiers.
- Since the character set underlying Isabelle symbols is 7-bit ASCII
- and 8-bit characters are passed through transparently, Isabelle can
- also process Unicode/UCS data in UTF-8 encoding.\footnote{When
- counting precise source positions internally, bytes in the range of
- 128\dots 191 are ignored. In UTF-8 encoding, this interval covers
- the additional trailer bytes, so Isabelle happens to count Unicode
- characters here, not bytes in memory. In ISO-Latin encoding, the
- ignored range merely includes some extra punctuation characters that
- even have replacements within the standard collection of Isabelle
- symbols; the accented letters range is counted properly.} Unicode
- provides its own collection of mathematical symbols, but within the
- core Isabelle/ML world there is no link to the standard collection
- of Isabelle regular symbols.
+ The character set underlying Isabelle symbols is 7-bit ASCII, but
+ 8-bit character sequences are passed-through unchanged. Unicode/UCS
+ data in UTF-8 encoding is processed in a non-strict fashion, such
+ that well-formed code sequences are recognized
+ accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+ in some special punctuation characters that even have replacements
+ within the standard collection of Isabelle symbols. Text consisting
+ of ASCII plus accented letters can be processed in either encoding.}
+ Unicode provides its own collection of mathematical symbols, but
+ within the core Isabelle/ML world there is no link to the standard
+ collection of Isabelle regular symbols.
\medskip Output of Isabelle symbols depends on the print mode
(\secref{print-mode}). For example, the standard {\LaTeX} setup of
@@ -612,8 +612,8 @@
\item @{ML_type "Symbol.sym"} is a concrete datatype that represents
the different kinds of symbols explicitly, with constructors @{ML
- "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.Ctrl"}, @{ML
- "Symbol.Raw"}.
+ "Symbol.Char"}, @{ML "Symbol.Sym"}, @{ML "Symbol.UTF8"}, @{ML
+ "Symbol.Ctrl"}, @{ML "Symbol.Raw"}.
\item @{ML "Symbol.decode"} converts the string representation of a
symbol into the datatype version.
--- a/doc-src/IsarImplementation/Thy/document/Prelim.tex Fri Jun 25 23:35:14 2010 +0200
+++ b/doc-src/IsarImplementation/Thy/document/Prelim.tex Mon Jun 28 08:55:46 2010 +0200
@@ -648,8 +648,10 @@
\begin{enumerate}
- \item a single ASCII character ``\isa{c}'' or raw byte in the
- range of 128\dots 255, for example ``\verb,a,'',
+ \item a single ASCII character ``\isa{c}'', for example
+ ``\verb,a,'',
+
+ \item a codepoint according to UTF8 (non-ASCII byte sequence),
\item a regular symbol ``\verb,\,\verb,<,\isa{ident}\verb,>,'',
for example ``\verb,\,\verb,<alpha>,'',
@@ -673,19 +675,17 @@
``\verb,\,\verb,<alpha>,'' is classified as a letter, which means it
may occur within regular Isabelle identifiers.
- Since the character set underlying Isabelle symbols is 7-bit ASCII
- and 8-bit characters are passed through transparently, Isabelle can
- also process Unicode/UCS data in UTF-8 encoding.\footnote{When
- counting precise source positions internally, bytes in the range of
- 128\dots 191 are ignored. In UTF-8 encoding, this interval covers
- the additional trailer bytes, so Isabelle happens to count Unicode
- characters here, not bytes in memory. In ISO-Latin encoding, the
- ignored range merely includes some extra punctuation characters that
- even have replacements within the standard collection of Isabelle
- symbols; the accented letters range is counted properly.} Unicode
- provides its own collection of mathematical symbols, but within the
- core Isabelle/ML world there is no link to the standard collection
- of Isabelle regular symbols.
+ The character set underlying Isabelle symbols is 7-bit ASCII, but
+ 8-bit character sequences are passed-through unchanged. Unicode/UCS
+ data in UTF-8 encoding is processed in a non-strict fashion, such
+ that well-formed code sequences are recognized
+ accordingly.\footnote{Note that ISO-Latin-1 differs from UTF-8 only
+ in some special punctuation characters that even have replacements
+ within the standard collection of Isabelle symbols. Text consisting
+ of ASCII plus accented letters can be processed in either encoding.}
+ Unicode provides its own collection of mathematical symbols, but
+ within the core Isabelle/ML world there is no link to the standard
+ collection of Isabelle regular symbols.
\medskip Output of Isabelle symbols depends on the print mode
(\secref{print-mode}). For example, the standard {\LaTeX} setup of
@@ -733,7 +733,7 @@
\cite{isabelle-isar-ref}.
\item \verb|Symbol.sym| is a concrete datatype that represents
- the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|.
+ the different kinds of symbols explicitly, with constructors \verb|Symbol.Char|, \verb|Symbol.Sym|, \verb|Symbol.UTF8|, \verb|Symbol.Ctrl|, \verb|Symbol.Raw|.
\item \verb|Symbol.decode| converts the string representation of a
symbol into the datatype version.
--- a/src/HOL/Tools/Predicate_Compile/predicate_compile_core.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/HOL/Tools/Predicate_Compile/predicate_compile_core.ML Mon Jun 28 08:55:46 2010 +0200
@@ -2535,8 +2535,8 @@
THEN (rtac pred_intro_rule
(* How to handle equality correctly? *)
THEN_ALL_NEW (K (print_tac options "state before assumption matching")
- THEN' (fn i => REPEAT (atac i ORELSE (CHANGED (asm_full_simp_tac split_ss i)
- THEN print_tac options "state after pre-simplification:")))
+ THEN' (atac ORELSE' ((CHANGED o asm_full_simp_tac split_ss) THEN' (TRY o atac)))
+ THEN' (K (print_tac options "state after pre-simplification:"))
THEN' (K (print_tac options "state after assumption matching:")))) 1)
| prove_prems2 out_ts ((p, deriv) :: ps) =
let
--- a/src/HOL/Tools/Quotient/quotient_def.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/HOL/Tools/Quotient/quotient_def.ML Mon Jun 28 08:55:46 2010 +0200
@@ -12,7 +12,7 @@
val quotdef_cmd: (binding option * mixfix) * (Attrib.binding * (string * string)) ->
local_theory -> Quotient_Info.qconsts_info * local_theory
- val lift_raw_const: typ list -> string * term -> local_theory ->
+ val lift_raw_const: typ list -> (string * term * mixfix) -> local_theory ->
Quotient_Info.qconsts_info * local_theory
end;
@@ -88,13 +88,13 @@
end
(* a wrapper for automatically lifting a raw constant *)
-fun lift_raw_const qtys (qconst_name, rconst) ctxt =
+fun lift_raw_const qtys (qconst_name, rconst, mx) ctxt =
let
val rty = fastype_of rconst
val qty = derive_qtyp qtys rty ctxt
+ val lhs = Free (qconst_name, qty)
in
- quotient_def ((NONE, NoSyn), (Attrib.empty_binding,
- (Free (qconst_name, qty), rconst))) ctxt
+ quotient_def ((NONE, mx), (Attrib.empty_binding, (lhs, rconst))) ctxt
end
(* parser and command *)
--- a/src/HOL/Tools/Quotient/quotient_tacs.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/HOL/Tools/Quotient/quotient_tacs.ML Mon Jun 28 08:55:46 2010 +0200
@@ -649,13 +649,21 @@
so we do it both in the original theorem *)
val thm' = Drule.eta_contraction_rule thm
val ((_, [thm'']), ctxt') = Variable.import false [thm'] ctxt
- val goal = (quotient_lift_all qtys ctxt' o prop_of) thm''
+ val goal = derive_qtrm qtys (prop_of thm'') ctxt'
in
Goal.prove ctxt' [] [] goal (K (lift_tac ctxt' [thm'] 1))
|> singleton (ProofContext.export ctxt' ctxt)
end;
-(* An Attribute which automatically constructs the qthm *)
-val lifted_attrib = Thm.rule_attribute (fn ctxt => lifted [] (Context.proof_of ctxt))
+(* An attribute which automatically constructs the qthm
+ using all quotients defined so far.
+*)
+val lifted_attrib = Thm.rule_attribute (fn context =>
+ let
+ val ctxt = Context.proof_of context
+ val qtys = map #qtyp (Quotient_Info.quotdata_dest ctxt)
+ in
+ lifted qtys ctxt
+ end)
end; (* structure *)
--- a/src/HOL/Tools/Quotient/quotient_term.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/HOL/Tools/Quotient/quotient_term.ML Mon Jun 28 08:55:46 2010 +0200
@@ -27,7 +27,7 @@
val inj_repabs_trm_chk: Proof.context -> term * term -> term
val derive_qtyp: typ list -> typ -> Proof.context -> typ
- val quotient_lift_all: typ list -> Proof.context -> term -> term
+ val derive_qtrm: typ list -> term -> Proof.context -> term
end;
structure Quotient_Term: QUOTIENT_TERM =
@@ -692,94 +692,85 @@
(* subst_tys takes a list of (rty, qty) substitution pairs
and replaces all occurences of rty in the given type
- by appropriate qty, with substitution *)
-fun subst_ty thy ty (rty, qty) r =
- if r <> NONE then r else
- case try (Sign.typ_match thy (rty, ty)) Vartab.empty of
- SOME inst => SOME (Envir.subst_type inst qty)
- | NONE => NONE
-fun subst_tys thy substs ty =
- case fold (subst_ty thy ty) substs NONE of
- SOME ty => ty
- | NONE =>
- (case ty of
- Type (s, tys) => Type (s, map (subst_tys thy substs) tys)
- | x => x)
+ by an appropriate qty
+*)
+fun subst_rtyp ctxt ty_subst rty =
+ case rty of
+ Type (s, rtys) =>
+ let
+ val thy = ProofContext.theory_of ctxt
+ val rty' = Type (s, map (subst_rtyp ctxt ty_subst) rtys)
+
+ fun matches [] = rty'
+ | matches ((rty, qty)::tail) =
+ case try (Sign.typ_match thy (rty, rty')) Vartab.empty of
+ NONE => matches tail
+ | SOME inst => Envir.subst_type inst qty
+ in
+ matches ty_subst
+ end
+ | _ => rty
+
+(* subst_trms takes a list of (rconst, qconst) substitution pairs,
+ as well as (rty, qty) substitution pairs, and replaces all
+ occurences of rconst and rty by appropriate instances of
+ qconst and qty
+*)
+fun subst_rtrm ctxt ty_subst trm_subst rtrm =
+ case rtrm of
+ t1 $ t2 => (subst_rtrm ctxt ty_subst trm_subst t1) $ (subst_rtrm ctxt ty_subst trm_subst t2)
+ | Abs (x, ty, t) => Abs (x, subst_rtyp ctxt ty_subst ty, subst_rtrm ctxt ty_subst trm_subst t)
+ | Free(n, ty) => Free(n, subst_rtyp ctxt ty_subst ty)
+ | Var(n, ty) => Var(n, subst_rtyp ctxt ty_subst ty)
+ | Bound i => Bound i
+ | Const (a, ty) =>
+ let
+ val thy = ProofContext.theory_of ctxt
-(* subst_trms takes a list of (rtrm, qtrm) substitution pairs
- and if the given term matches any of the raw terms it
- returns the appropriate qtrm instantiated. If none of
- them matched it returns NONE. *)
-fun subst_trm thy t (rtrm, qtrm) s =
- if s <> NONE then s else
- case try (Pattern.match thy (rtrm, t)) (Vartab.empty, Vartab.empty) of
- SOME inst => SOME (Envir.subst_term inst qtrm)
- | NONE => NONE;
-fun subst_trms thy substs t = fold (subst_trm thy t) substs NONE
+ fun matches [] = Const (a, subst_rtyp ctxt ty_subst ty)
+ | matches ((rconst, qconst)::tail) =
+ case try (Pattern.match thy (rconst, rtrm)) (Vartab.empty, Vartab.empty) of
+ NONE => matches tail
+ | SOME inst => Envir.subst_term inst qconst
+ in
+ matches trm_subst
+ end
+
+(* generates type substitutions out of the
+ types involved in a quotient
+*)
+fun get_ty_subst qtys ctxt =
+ Quotient_Info.quotdata_dest ctxt
+ |> map (fn x => (#rtyp x, #qtyp x))
+ |> filter (fn (_, qty) => member (op =) qtys qty)
-(* prepares type and term substitution pairs to be used by above
- functions that let replace all raw constructs by appropriate
- lifted counterparts. *)
-fun get_ty_trm_substs qtys ctxt =
-let
- val thy = ProofContext.theory_of ctxt
- val quot_infos = Quotient_Info.quotdata_dest ctxt
- val const_infos = Quotient_Info.qconsts_dest ctxt
- val all_ty_substs = map (fn ri => (#rtyp ri, #qtyp ri)) quot_infos
- val ty_substs =
- if qtys = [] then all_ty_substs else
- filter (fn (_, qty) => member (op =) qtys qty) all_ty_substs
- val const_substs = map (fn ci => (#rconst ci, #qconst ci)) const_infos
- fun rel_eq rel = HOLogic.eq_const (subst_tys thy ty_substs (domain_type (fastype_of rel)))
- val rel_substs = map (fn ri => (#equiv_rel ri, rel_eq (#equiv_rel ri))) quot_infos
- fun valid_trm_subst (rt, qt) = (subst_tys thy ty_substs (fastype_of rt) = fastype_of qt)
- val all_trm_substs = const_substs @ rel_substs
+(* generates term substitutions out of the qconst
+ definitions and relations in a quotient
+*)
+fun get_trm_subst qtys ctxt =
+let
+ val subst_rtyp' = subst_rtyp ctxt (get_ty_subst qtys ctxt)
+ fun proper (rt, qt) = (subst_rtyp' (fastype_of rt) = fastype_of qt)
+
+ val const_substs =
+ Quotient_Info.qconsts_dest ctxt
+ |> map (fn x => (#rconst x, #qconst x))
+
+ val rel_substs =
+ Quotient_Info.quotdata_dest ctxt
+ |> map (fn x => (#equiv_rel x, HOLogic.eq_const (#qtyp x)))
in
- (ty_substs, filter valid_trm_subst all_trm_substs)
+ filter proper (const_substs @ rel_substs)
end
+(* derives a qtyp and qtrm out of a rtyp and rtrm,
+ respectively
+*)
fun derive_qtyp qtys rty ctxt =
-let
- val thy = ProofContext.theory_of ctxt
- val (ty_substs, _) = get_ty_trm_substs qtys ctxt
-in
- subst_tys thy ty_substs rty
-end
-
-(*
-Takes a term and
-
-* replaces raw constants by the quotient constants
-
-* replaces equivalence relations by equalities
-
-* replaces raw types by the quotient types
-
-*)
+ subst_rtyp ctxt (get_ty_subst qtys ctxt) rty
-fun quotient_lift_all qtys ctxt t =
-let
- val thy = ProofContext.theory_of ctxt
- val (ty_substs, substs) = get_ty_trm_substs qtys ctxt
- fun lift_aux t =
- case subst_trms thy substs t of
- SOME x => x
- | NONE =>
- (case t of
- a $ b => lift_aux a $ lift_aux b
- | Abs(a, ty, s) =>
- let
- val (y, s') = Term.dest_abs (a, ty, s)
- val nty = subst_tys thy ty_substs ty
- in
- Abs(y, nty, abstract_over (Free (y, nty), lift_aux s'))
- end
- | Free(n, ty) => Free(n, subst_tys thy ty_substs ty)
- | Var(n, ty) => Var(n, subst_tys thy ty_substs ty)
- | Bound i => Bound i
- | Const(s, ty) => Const(s, subst_tys thy ty_substs ty))
-in
- lift_aux t
-end
+fun derive_qtrm qtys rtrm ctxt =
+ subst_rtrm ctxt (get_ty_subst qtys ctxt) (get_trm_subst qtys ctxt) rtrm
+
end; (* structure *)
--- a/src/Pure/General/position.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/Pure/General/position.ML Mon Jun 28 08:55:46 2010 +0200
@@ -67,8 +67,8 @@
fun advance_count "\n" (i: int, j: int, k: int) =
(if_valid i (i + 1), if_valid j 1, if_valid k (k + 1))
| advance_count s (i, j, k) =
- if Symbol.is_regular s andalso not (Symbol.is_utf8_trailer s)
- then (i, if_valid j (j + 1), if_valid k (k + 1)) else (i, j, k);
+ if Symbol.is_regular s then (i, if_valid j (j + 1), if_valid k (k + 1))
+ else (i, j, k);
fun invalid_count (i, j, k) =
not (valid i orelse valid j orelse valid k);
--- a/src/Pure/General/symbol.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/Pure/General/symbol.ML Mon Jun 28 08:55:46 2010 +0200
@@ -15,9 +15,9 @@
val space: symbol
val spaces: int -> string
val is_char: symbol -> bool
+ val is_utf8: symbol -> bool
val is_symbolic: symbol -> bool
val is_printable: symbol -> bool
- val is_utf8_trailer: symbol -> bool
val eof: symbol
val is_eof: symbol -> bool
val not_eof: symbol -> bool
@@ -42,7 +42,7 @@
val is_raw: symbol -> bool
val decode_raw: symbol -> string
val encode_raw: string -> string
- datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string
+ datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string
val decode: symbol -> sym
datatype kind = Letter | Digit | Quasi | Blank | Other
val kind: symbol -> kind
@@ -108,14 +108,14 @@
fun is_char s = size s = 1;
+fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s;
+
fun is_symbolic s =
String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s);
fun is_printable s =
if is_char s then ord space <= ord s andalso ord s <= ord "~"
- else not (String.isPrefix "\\<^" s);
-
-fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
+ else is_utf8 s orelse not (String.isPrefix "\\<^" s);
(* input source control *)
@@ -224,10 +224,12 @@
(* symbol variants *)
-datatype sym = Char of string | Sym of string | Ctrl of string | Raw of string;
+datatype sym =
+ Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string;
fun decode s =
if is_char s then Char s
+ else if is_utf8 s then UTF8 s
else if is_raw s then Raw (decode_raw s)
else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4))
else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3))
@@ -426,7 +428,9 @@
local
-fun is_plain s = s <> "\^M" andalso s <> "\\" andalso not_eof s;
+fun is_plain s = is_ascii s andalso s <> "\^M" andalso s <> "\\";
+
+fun is_utf8_trailer s = is_char s andalso 128 <= ord s andalso ord s < 192;
val scan_encoded_newline =
$$ "\^M" -- $$ "\n" >> K "\n" ||
@@ -439,6 +443,7 @@
val scan =
Scan.one is_plain ||
+ Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode ||
scan_encoded_newline ||
($$ "\\" ^^ $$ "<" ^^
!! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs)))
@@ -471,7 +476,7 @@
fun no_explode [] = true
| no_explode ("\\" :: "<" :: _) = false
| no_explode ("\^M" :: _) = false
- | no_explode (_ :: cs) = no_explode cs;
+ | no_explode (c :: cs) = is_ascii c andalso no_explode cs;
in
@@ -486,7 +491,11 @@
(* escape *)
-val esc = fn s => if is_char s then s else "\\" ^ s;
+val esc = fn s =>
+ if is_char s then s
+ else if is_utf8 s then translate_string (fn c => "\\" ^ string_of_int (ord c)) s
+ else "\\" ^ s;
+
val escape = implode o map esc o sym_explode;
--- a/src/Pure/ML/ml_syntax.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/Pure/ML/ml_syntax.ML Mon Jun 28 08:55:46 2010 +0200
@@ -26,6 +26,7 @@
val print_sort: sort -> string
val print_typ: typ -> string
val print_term: term -> string
+ val pretty_string: string -> Pretty.T
end;
structure ML_Syntax: ML_SYNTAX =
@@ -92,4 +93,15 @@
"Abs (" ^ print_string s ^ ", " ^ print_typ T ^ ", " ^ print_term t ^ ")"
| print_term (t1 $ t2) = atomic (print_term t1) ^ " $ " ^ atomic (print_term t2);
+
+(* toplevel pretty printing *)
+
+fun pretty_string raw_str =
+ let
+ val str =
+ implode (map (fn XML.Elem _ => "<markup>" | XML.Text s => s) (YXML.parse_body raw_str))
+ handle Fail _ => raw_str;
+ val syms = Symbol.explode str handle ERROR _ => explode str;
+ in Pretty.str (quote (implode (map print_char syms))) end;
+
end;
--- a/src/Pure/Thy/latex.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/Pure/Thy/latex.ML Mon Jun 28 08:55:46 2010 +0200
@@ -77,6 +77,7 @@
fun output_known_sym (known_sym, known_ctrl) sym =
(case Symbol.decode sym of
Symbol.Char s => output_chr s
+ | Symbol.UTF8 s => s
| Symbol.Sym s => if known_sym s then enclose "{\\isasym" "}" s else output_chrs sym
| Symbol.Ctrl s => if known_ctrl s then enclose "\\isactrl" " " s else output_chrs sym
| Symbol.Raw s => s);
--- a/src/Pure/pure_setup.ML Fri Jun 25 23:35:14 2010 +0200
+++ b/src/Pure/pure_setup.ML Mon Jun 28 08:55:46 2010 +0200
@@ -18,6 +18,7 @@
(* ML toplevel pretty printing *)
+toplevel_pp ["String", "string"] "ML_Syntax.pretty_string";
toplevel_pp ["Pretty", "T"] "(fn _: Pretty.T => Pretty.str \"<pretty>\")";
toplevel_pp ["Task_Queue", "task"] "Pretty.str o Task_Queue.str_of_task";
toplevel_pp ["Task_Queue", "group"] "Pretty.str o Task_Queue.str_of_group";