more robust treatment of conflicts with existing Unicode text;
authorwenzelm
Sat Dec 30 20:04:05 2017 +0100 (17 months ago)
changeset 673043cf05d7cf174
parent 67303 a77c0dd8bb7c
child 67305 ecb74607063f
more robust treatment of conflicts with existing Unicode text;
NEWS
src/Doc/JEdit/JEdit.thy
src/Pure/General/symbol.scala
src/Tools/jEdit/src-base/isabelle_encoding.scala
     1.1 --- a/NEWS	Sat Dec 30 14:15:44 2017 +0100
     1.2 +++ b/NEWS	Sat Dec 30 20:04:05 2017 +0100
     1.3 @@ -81,6 +81,12 @@
     1.4  * Action "isabelle.draft" is similar to "isabelle.preview", but shows a
     1.5  plain-text document draft.
     1.6  
     1.7 +* When loading text files, the Isabelle symbols encoding UTF-8-Isabelle
     1.8 +is only used if there is no conflict with existing Unicode sequences in
     1.9 +the file. Otherwise, the fallback encoding is plain UTF-8 and Isabelle
    1.10 +symbols remain in literal \<symbol> form. This avoids accidental loss of
    1.11 +Unicode content when saving the file.
    1.12 +
    1.13  
    1.14  *** Isabelle/VSCode Prover IDE ***
    1.15  
     2.1 --- a/src/Doc/JEdit/JEdit.thy	Sat Dec 30 14:15:44 2017 +0100
     2.2 +++ b/src/Doc/JEdit/JEdit.thy	Sat Dec 30 20:04:05 2017 +0100
     2.3 @@ -504,16 +504,26 @@
     2.4  \<close>
     2.5  
     2.6  paragraph \<open>Encoding.\<close>
     2.7 +
     2.8  text \<open>Technically, the Unicode interpretation of Isabelle symbols is an
     2.9    \<^emph>\<open>encoding\<close> called \<^verbatim>\<open>UTF-8-Isabelle\<close> in jEdit (\<^emph>\<open>not\<close> in the underlying
    2.10    JVM). It is provided by the Isabelle Base plugin and enabled by default for
    2.11 -  all source files in Isabelle/jEdit. Sometimes such defaults are reset
    2.12 -  accidentally, or malformed UTF-8 sequences in the text force jEdit to fall
    2.13 -  back on a different encoding like \<^verbatim>\<open>ISO-8859-15\<close>. In that case, verbatim
    2.14 -  ``\<^verbatim>\<open>\<alpha>\<close>'' will be shown in the text buffer instead of its Unicode rendering
    2.15 -  ``\<open>\<alpha>\<close>''. The jEdit menu operation \<^emph>\<open>File~/ Reload with Encoding~/
    2.16 -  UTF-8-Isabelle\<close> helps to resolve such problems (after repairing malformed
    2.17 -  parts of the text). \<close>
    2.18 +  all source files in Isabelle/jEdit.
    2.19 +
    2.20 +  Sometimes such defaults are reset accidentally, or malformed UTF-8 sequences
    2.21 +  in the text force jEdit to fall back on a different encoding like
    2.22 +  \<^verbatim>\<open>ISO-8859-15\<close>. In that case, verbatim ``\<^verbatim>\<open>\<alpha>\<close>'' will be shown in the text
    2.23 +  buffer instead of its Unicode rendering ``\<open>\<alpha>\<close>''. The jEdit menu operation
    2.24 +  \<^emph>\<open>File~/ Reload with Encoding~/ UTF-8-Isabelle\<close> helps to resolve such
    2.25 +  problems (after repairing malformed parts of the text).
    2.26 +
    2.27 +  If the loaded text already contains Unicode sequences that are in conflict
    2.28 +  with the Isabelle symbol encoding, the fallback-encoding UTF-8 is used and
    2.29 +  Isabelle symbols remain in literal \<^verbatim>\<open>\<symbol>\<close> form. The jEdit menu
    2.30 +  operation \<^emph>\<open>Utilities~/ Buffer Options~/ Character encoding\<close> allows to
    2.31 +  enforce the UTF-8-Isabelle, but this will also change original Unicode
    2.32 +  text into Isabelle symbols when saving the file!
    2.33 +\<close>
    2.34  
    2.35  paragraph \<open>Font.\<close>
    2.36  text \<open>Correct rendering via Unicode requires a font that contains glyphs for
     3.1 --- a/src/Pure/General/symbol.scala	Sat Dec 30 14:15:44 2017 +0100
     3.2 +++ b/src/Pure/General/symbol.scala	Sat Dec 30 20:04:05 2017 +0100
     3.3 @@ -504,6 +504,7 @@
     3.4    def abbrevs: Multi_Map[Symbol, String] = symbols.abbrevs
     3.5    def codes: List[(Symbol, Int)] = symbols.codes
     3.6  
     3.7 +  lazy val is_code: Int => Boolean = codes.map(_._2).toSet
     3.8    def decode(text: String): String = symbols.decode(text)
     3.9    def encode(text: String): String = symbols.encode(text)
    3.10  
     4.1 --- a/src/Tools/jEdit/src-base/isabelle_encoding.scala	Sat Dec 30 14:15:44 2017 +0100
     4.2 +++ b/src/Tools/jEdit/src-base/isabelle_encoding.scala	Sat Dec 30 20:04:05 2017 +0100
     4.3 @@ -11,7 +11,7 @@
     4.4  
     4.5  import org.gjt.sp.jedit.io.Encoding
     4.6  
     4.7 -import java.nio.charset.{Charset, CodingErrorAction}
     4.8 +import java.nio.charset.{Charset, CodingErrorAction, CharacterCodingException}
     4.9  import java.io.{InputStream, OutputStream, Reader, Writer, InputStreamReader, OutputStreamWriter,
    4.10    CharArrayReader, ByteArrayOutputStream}
    4.11  
    4.12 @@ -22,20 +22,25 @@
    4.13  {
    4.14    private val BUFSIZE = 32768
    4.15  
    4.16 -  private def text_reader(in: InputStream, codec: Codec): Reader =
    4.17 +  private def text_reader(in: InputStream, codec: Codec, strict: Boolean): Reader =
    4.18    {
    4.19 -    val source = new BufferedSource(in)(codec)
    4.20 -    new CharArrayReader(Symbol.decode(source.mkString).toArray)
    4.21 +    val source = (new BufferedSource(in)(codec)).mkString
    4.22 +
    4.23 +    if (strict && Codepoint.iterator(source).exists(Symbol.is_code))
    4.24 +      throw new CharacterCodingException()
    4.25 +
    4.26 +    new CharArrayReader(Symbol.decode(source).toArray)
    4.27    }
    4.28  
    4.29 -  override def getTextReader(in: InputStream): Reader = text_reader(in, UTF8.codec())
    4.30 +  override def getTextReader(in: InputStream): Reader =
    4.31 +    text_reader(in, UTF8.codec(), true)
    4.32  
    4.33    override def getPermissiveTextReader(in: InputStream): Reader =
    4.34    {
    4.35      val codec = UTF8.codec()
    4.36      codec.onMalformedInput(CodingErrorAction.REPLACE)
    4.37      codec.onUnmappableCharacter(CodingErrorAction.REPLACE)
    4.38 -    text_reader(in, codec)
    4.39 +    text_reader(in, codec, false)
    4.40    }
    4.41  
    4.42    override def getTextWriter(out: OutputStream): Writer =