# HG changeset patch # User wenzelm # Date 1514660645 -3600 # Node ID 3cf05d7cf174590dc914ba4c89a77879e3ef4bf3 # Parent a77c0dd8bb7c6fe0b80bb9b651d89829119def6e more robust treatment of conflicts with existing Unicode text; diff -r a77c0dd8bb7c -r 3cf05d7cf174 NEWS --- a/NEWS Sat Dec 30 14:15:44 2017 +0100 +++ b/NEWS Sat Dec 30 20:04:05 2017 +0100 @@ -81,6 +81,12 @@ * Action "isabelle.draft" is similar to "isabelle.preview", but shows a plain-text document draft. +* When loading text files, the Isabelle symbols encoding UTF-8-Isabelle +is only used if there is no conflict with existing Unicode sequences in +the file. Otherwise, the fallback encoding is plain UTF-8 and Isabelle +symbols remain in literal \ form. This avoids accidental loss of +Unicode content when saving the file. + *** Isabelle/VSCode Prover IDE *** diff -r a77c0dd8bb7c -r 3cf05d7cf174 src/Doc/JEdit/JEdit.thy --- a/src/Doc/JEdit/JEdit.thy Sat Dec 30 14:15:44 2017 +0100 +++ b/src/Doc/JEdit/JEdit.thy Sat Dec 30 20:04:05 2017 +0100 @@ -504,16 +504,26 @@ \ paragraph \Encoding.\ + text \Technically, the Unicode interpretation of Isabelle symbols is an \<^emph>\encoding\ called \<^verbatim>\UTF-8-Isabelle\ in jEdit (\<^emph>\not\ in the underlying JVM). It is provided by the Isabelle Base plugin and enabled by default for - all source files in Isabelle/jEdit. Sometimes such defaults are reset - accidentally, or malformed UTF-8 sequences in the text force jEdit to fall - back on a different encoding like \<^verbatim>\ISO-8859-15\. In that case, verbatim - ``\<^verbatim>\\\'' will be shown in the text buffer instead of its Unicode rendering - ``\\\''. The jEdit menu operation \<^emph>\File~/ Reload with Encoding~/ - UTF-8-Isabelle\ helps to resolve such problems (after repairing malformed - parts of the text). \ + all source files in Isabelle/jEdit. + + Sometimes such defaults are reset accidentally, or malformed UTF-8 sequences + in the text force jEdit to fall back on a different encoding like + \<^verbatim>\ISO-8859-15\. In that case, verbatim ``\<^verbatim>\\\'' will be shown in the text + buffer instead of its Unicode rendering ``\\\''. The jEdit menu operation + \<^emph>\File~/ Reload with Encoding~/ UTF-8-Isabelle\ helps to resolve such + problems (after repairing malformed parts of the text). + + If the loaded text already contains Unicode sequences that are in conflict + with the Isabelle symbol encoding, the fallback-encoding UTF-8 is used and + Isabelle symbols remain in literal \<^verbatim>\\\ form. The jEdit menu + operation \<^emph>\Utilities~/ Buffer Options~/ Character encoding\ allows to + enforce the UTF-8-Isabelle, but this will also change original Unicode + text into Isabelle symbols when saving the file! +\ paragraph \Font.\ text \Correct rendering via Unicode requires a font that contains glyphs for diff -r a77c0dd8bb7c -r 3cf05d7cf174 src/Pure/General/symbol.scala --- a/src/Pure/General/symbol.scala Sat Dec 30 14:15:44 2017 +0100 +++ b/src/Pure/General/symbol.scala Sat Dec 30 20:04:05 2017 +0100 @@ -504,6 +504,7 @@ def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs def codes: List[(Symbol, Int)] = symbols.codes + lazy val is_code: Int => Boolean = codes.map(_._2).toSet def decode(text: String): String = symbols.decode(text) def encode(text: String): String = symbols.encode(text) diff -r a77c0dd8bb7c -r 3cf05d7cf174 src/Tools/jEdit/src-base/isabelle_encoding.scala --- a/src/Tools/jEdit/src-base/isabelle_encoding.scala Sat Dec 30 14:15:44 2017 +0100 +++ b/src/Tools/jEdit/src-base/isabelle_encoding.scala Sat Dec 30 20:04:05 2017 +0100 @@ -11,7 +11,7 @@ import org.gjt.sp.jedit.io.Encoding -import java.nio.charset.{Charset, CodingErrorAction} +import java.nio.charset.{Charset, CodingErrorAction, CharacterCodingException} import java.io.{InputStream, OutputStream, Reader, Writer, InputStreamReader, OutputStreamWriter, CharArrayReader, ByteArrayOutputStream} @@ -22,20 +22,25 @@ { private val BUFSIZE = 32768 - private def text_reader(in: InputStream, codec: Codec): Reader = + private def text_reader(in: InputStream, codec: Codec, strict: Boolean): Reader = { - val source = new BufferedSource(in)(codec) - new CharArrayReader(Symbol.decode(source.mkString).toArray) + val source = (new BufferedSource(in)(codec)).mkString + + if (strict && Codepoint.iterator(source).exists(Symbol.is_code)) + throw new CharacterCodingException() + + new CharArrayReader(Symbol.decode(source).toArray) } - override def getTextReader(in: InputStream): Reader = text_reader(in, UTF8.codec()) + override def getTextReader(in: InputStream): Reader = + text_reader(in, UTF8.codec(), true) override def getPermissiveTextReader(in: InputStream): Reader = { val codec = UTF8.codec() codec.onMalformedInput(CodingErrorAction.REPLACE) codec.onUnmappableCharacter(CodingErrorAction.REPLACE) - text_reader(in, codec) + text_reader(in, codec, false) } override def getTextWriter(out: OutputStream): Writer =