more robust treatment of conflicts with existing Unicode text;
authorwenzelm
Sat, 30 Dec 2017 20:04:05 +0100
changeset 67304 3cf05d7cf174
parent 67303 a77c0dd8bb7c
child 67305 ecb74607063f
more robust treatment of conflicts with existing Unicode text;
NEWS
src/Doc/JEdit/JEdit.thy
src/Pure/General/symbol.scala
src/Tools/jEdit/src-base/isabelle_encoding.scala
--- a/NEWS	Sat Dec 30 14:15:44 2017 +0100
+++ b/NEWS	Sat Dec 30 20:04:05 2017 +0100
@@ -81,6 +81,12 @@
 * Action "isabelle.draft" is similar to "isabelle.preview", but shows a
 plain-text document draft.
 
+* When loading text files, the Isabelle symbols encoding UTF-8-Isabelle
+is only used if there is no conflict with existing Unicode sequences in
+the file. Otherwise, the fallback encoding is plain UTF-8 and Isabelle
+symbols remain in literal \<symbol> form. This avoids accidental loss of
+Unicode content when saving the file.
+
 
 *** Isabelle/VSCode Prover IDE ***
 
--- a/src/Doc/JEdit/JEdit.thy	Sat Dec 30 14:15:44 2017 +0100
+++ b/src/Doc/JEdit/JEdit.thy	Sat Dec 30 20:04:05 2017 +0100
@@ -504,16 +504,26 @@
 \<close>
 
 paragraph \<open>Encoding.\<close>
+
 text \<open>Technically, the Unicode interpretation of Isabelle symbols is an
   \<^emph>\<open>encoding\<close> called \<^verbatim>\<open>UTF-8-Isabelle\<close> in jEdit (\<^emph>\<open>not\<close> in the underlying
   JVM). It is provided by the Isabelle Base plugin and enabled by default for
-  all source files in Isabelle/jEdit. Sometimes such defaults are reset
-  accidentally, or malformed UTF-8 sequences in the text force jEdit to fall
-  back on a different encoding like \<^verbatim>\<open>ISO-8859-15\<close>. In that case, verbatim
-  ``\<^verbatim>\<open>\<alpha>\<close>'' will be shown in the text buffer instead of its Unicode rendering
-  ``\<open>\<alpha>\<close>''. The jEdit menu operation \<^emph>\<open>File~/ Reload with Encoding~/
-  UTF-8-Isabelle\<close> helps to resolve such problems (after repairing malformed
-  parts of the text). \<close>
+  all source files in Isabelle/jEdit.
+
+  Sometimes such defaults are reset accidentally, or malformed UTF-8 sequences
+  in the text force jEdit to fall back on a different encoding like
+  \<^verbatim>\<open>ISO-8859-15\<close>. In that case, verbatim ``\<^verbatim>\<open>\<alpha>\<close>'' will be shown in the text
+  buffer instead of its Unicode rendering ``\<open>\<alpha>\<close>''. The jEdit menu operation
+  \<^emph>\<open>File~/ Reload with Encoding~/ UTF-8-Isabelle\<close> helps to resolve such
+  problems (after repairing malformed parts of the text).
+
+  If the loaded text already contains Unicode sequences that are in conflict
+  with the Isabelle symbol encoding, the fallback-encoding UTF-8 is used and
+  Isabelle symbols remain in literal \<^verbatim>\<open>\<symbol>\<close> form. The jEdit menu
+  operation \<^emph>\<open>Utilities~/ Buffer Options~/ Character encoding\<close> allows to
+  enforce the UTF-8-Isabelle, but this will also change original Unicode
+  text into Isabelle symbols when saving the file!
+\<close>
 
 paragraph \<open>Font.\<close>
 text \<open>Correct rendering via Unicode requires a font that contains glyphs for
--- a/src/Pure/General/symbol.scala	Sat Dec 30 14:15:44 2017 +0100
+++ b/src/Pure/General/symbol.scala	Sat Dec 30 20:04:05 2017 +0100
@@ -504,6 +504,7 @@
   def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs
   def codes: List[(Symbol, Int)] = symbols.codes
 
+  lazy val is_code: Int => Boolean = codes.map(_._2).toSet
   def decode(text: String): String = symbols.decode(text)
   def encode(text: String): String = symbols.encode(text)
 
--- a/src/Tools/jEdit/src-base/isabelle_encoding.scala	Sat Dec 30 14:15:44 2017 +0100
+++ b/src/Tools/jEdit/src-base/isabelle_encoding.scala	Sat Dec 30 20:04:05 2017 +0100
@@ -11,7 +11,7 @@
 
 import org.gjt.sp.jedit.io.Encoding
 
-import java.nio.charset.{Charset, CodingErrorAction}
+import java.nio.charset.{Charset, CodingErrorAction, CharacterCodingException}
 import java.io.{InputStream, OutputStream, Reader, Writer, InputStreamReader, OutputStreamWriter,
   CharArrayReader, ByteArrayOutputStream}
 
@@ -22,20 +22,25 @@
 {
   private val BUFSIZE = 32768
 
-  private def text_reader(in: InputStream, codec: Codec): Reader =
+  private def text_reader(in: InputStream, codec: Codec, strict: Boolean): Reader =
   {
-    val source = new BufferedSource(in)(codec)
-    new CharArrayReader(Symbol.decode(source.mkString).toArray)
+    val source = (new BufferedSource(in)(codec)).mkString
+
+    if (strict && Codepoint.iterator(source).exists(Symbol.is_code))
+      throw new CharacterCodingException()
+
+    new CharArrayReader(Symbol.decode(source).toArray)
   }
 
-  override def getTextReader(in: InputStream): Reader = text_reader(in, UTF8.codec())
+  override def getTextReader(in: InputStream): Reader =
+    text_reader(in, UTF8.codec(), true)
 
   override def getPermissiveTextReader(in: InputStream): Reader =
   {
     val codec = UTF8.codec()
     codec.onMalformedInput(CodingErrorAction.REPLACE)
     codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
-    text_reader(in, codec)
+    text_reader(in, codec, false)
   }
 
   override def getTextWriter(out: OutputStream): Writer =