# HG changeset patch # User wenzelm # Date 1353871024 -3600 # Node ID 00d8ad713e32ab86c38917c5f1944eff3da01636 # Parent ec0f2f8dbeb9cbd60fb0c13eaad1039cbaf4fb76 explicit module UTF8; diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/General/file.scala --- a/src/Pure/General/file.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/General/file.scala Sun Nov 25 20:17:04 2012 +0100 @@ -53,8 +53,7 @@ buf } - def read(file: JFile): String = - new String(read_bytes(file), Standard_System.charset) + def read(file: JFile): String = new String(read_bytes(file), UTF8.charset) def read(path: Path): String = read(path.file) @@ -68,7 +67,7 @@ } def read(stream: InputStream): String = - read(new BufferedReader(new InputStreamReader(stream, Standard_System.charset))) + read(new BufferedReader(new InputStreamReader(stream, UTF8.charset))) /* try_read */ @@ -90,7 +89,7 @@ { val stream1 = new FileOutputStream(file) val stream2 = if (gzip) new GZIPOutputStream(new BufferedOutputStream(stream1)) else stream1 - val writer = new BufferedWriter(new OutputStreamWriter(stream2, Standard_System.charset)) + val writer = new BufferedWriter(new OutputStreamWriter(stream2, UTF8.charset)) try { writer.append(text) } finally { writer.close } } diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/General/sha1.scala --- a/src/Pure/General/sha1.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/General/sha1.scala Sun Nov 25 20:17:04 2012 +0100 @@ -56,6 +56,6 @@ make_result(digest) } - def digest(string: String): Digest = digest(Standard_System.string_bytes(string)) + def digest(string: String): Digest = digest(UTF8.string_bytes(string)) } diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/System/build.scala --- a/src/Pure/System/build.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/System/build.scala Sun Nov 25 20:17:04 2012 +0100 @@ -522,7 +522,7 @@ private def read_stamps(path: Path): Option[(String, String, String)] = if (path.is_file) { val stream = new GZIPInputStream (new BufferedInputStream(new FileInputStream(path.file))) - val reader = new BufferedReader(new InputStreamReader(stream, Standard_System.charset)) + val reader = new BufferedReader(new InputStreamReader(stream, UTF8.charset)) val (s, h1, h2) = try { (reader.readLine, reader.readLine, reader.readLine) } finally { reader.close } diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/System/isabelle_charset.scala --- a/src/Pure/System/isabelle_charset.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/System/isabelle_charset.scala Sun Nov 25 20:17:04 2012 +0100 @@ -22,14 +22,11 @@ class Isabelle_Charset extends Charset(Isabelle_Charset.name, null) { override def contains(cs: Charset): Boolean = - cs.name.equalsIgnoreCase(Standard_System.charset_name) || - Standard_System.charset.contains(cs) + cs.name.equalsIgnoreCase(UTF8.charset_name) || UTF8.charset.contains(cs) - override def newDecoder(): CharsetDecoder = - Standard_System.charset.newDecoder + override def newDecoder(): CharsetDecoder = UTF8.charset.newDecoder - override def newEncoder(): CharsetEncoder = - Standard_System.charset.newEncoder + override def newEncoder(): CharsetEncoder = UTF8.charset.newEncoder } diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/System/isabelle_process.scala --- a/src/Pure/System/isabelle_process.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/System/isabelle_process.scala Sun Nov 25 20:17:04 2012 +0100 @@ -292,9 +292,8 @@ //{{{ receive { case Input_Chunks(chunks) => - stream.write(Standard_System.string_bytes( - chunks.map(_.length).mkString("", ",", "\n"))); - chunks.foreach(stream.write(_)); + stream.write(UTF8.string_bytes(chunks.map(_.length).mkString("", ",", "\n"))) + chunks.foreach(stream.write(_)) stream.flush case Close => stream.close @@ -350,8 +349,8 @@ if (i != n) throw new Protocol_Error("bad message chunk content") if (do_decode) - YXML.parse_body_failsafe(Standard_System.decode_chars(decode, buf, 0, n)) - else List(XML.Text(Standard_System.decode_chars(s => s, buf, 0, n).toString)) + YXML.parse_body_failsafe(UTF8.decode_chars(decode, buf, 0, n)) + else List(XML.Text(UTF8.decode_chars(s => s, buf, 0, n).toString)) //}}} } @@ -390,12 +389,12 @@ def input_raw(text: String): Unit = standard_input._2 ! Input_Text(text) def input_bytes(name: String, args: Array[Byte]*): Unit = - command_input._2 ! Input_Chunks(Standard_System.string_bytes(name) :: args.toList) + command_input._2 ! Input_Chunks(UTF8.string_bytes(name) :: args.toList) def input(name: String, args: String*) { receiver(new Input(name, args.toList)) - input_bytes(name, args.map(Standard_System.string_bytes): _*) + input_bytes(name, args.map(UTF8.string_bytes): _*) } def options(opts: Options): Unit = diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/System/isabelle_system.scala --- a/src/Pure/System/isabelle_system.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/System/isabelle_system.scala Sun Nov 25 20:17:04 2012 +0100 @@ -168,13 +168,13 @@ // channels val stdin: BufferedWriter = - new BufferedWriter(new OutputStreamWriter(proc.getOutputStream, Standard_System.charset)) + new BufferedWriter(new OutputStreamWriter(proc.getOutputStream, UTF8.charset)) val stdout: BufferedReader = - new BufferedReader(new InputStreamReader(proc.getInputStream, Standard_System.charset)) + new BufferedReader(new InputStreamReader(proc.getInputStream, UTF8.charset)) val stderr: BufferedReader = - new BufferedReader(new InputStreamReader(proc.getErrorStream, Standard_System.charset)) + new BufferedReader(new InputStreamReader(proc.getErrorStream, UTF8.charset)) // signals diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/System/standard_system.scala --- a/src/Pure/System/standard_system.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/System/standard_system.scala Sun Nov 25 20:17:04 2012 +0100 @@ -12,90 +12,12 @@ import java.util.Locale import java.net.URL import java.io.{File => JFile} -import java.nio.charset.Charset -import scala.io.Codec import scala.util.matching.Regex object Standard_System { - /* UTF-8 charset */ - - val charset_name: String = "UTF-8" - val charset: Charset = Charset.forName(charset_name) - def codec(): Codec = Codec(charset) - - def string_bytes(s: String): Array[Byte] = s.getBytes(charset) - - - /* permissive UTF-8 decoding */ - - // see also http://en.wikipedia.org/wiki/UTF-8#Description - // overlong encodings enable byte-stuffing - - def decode_permissive_utf8(text: CharSequence): String = - { - val buf = new java.lang.StringBuilder(text.length) - var code = -1 - var rest = 0 - def flush() - { - if (code != -1) { - if (rest == 0 && Character.isValidCodePoint(code)) - buf.appendCodePoint(code) - else buf.append('\uFFFD') - code = -1 - rest = 0 - } - } - def init(x: Int, n: Int) - { - flush() - code = x - rest = n - } - def push(x: Int) - { - if (rest <= 0) init(x, -1) - else { - code <<= 6 - code += x - rest -= 1 - } - } - for (i <- 0 until text.length) { - val c = text.charAt(i) - if (c < 128) { flush(); buf.append(c) } - else if ((c & 0xC0) == 0x80) push(c & 0x3F) - else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) - else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) - else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) - } - flush() - buf.toString - } - - private class Decode_Chars(decode: String => String, - buffer: Array[Byte], start: Int, end: Int) extends CharSequence - { - def length: Int = end - start - def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] - def subSequence(i: Int, j: Int): CharSequence = - new Decode_Chars(decode, buffer, start + i, start + j) - - // toString with adhoc decoding: abuse of CharSequence interface - override def toString: String = decode(decode_permissive_utf8(this)) - } - - def decode_chars(decode: String => String, - buffer: Array[Byte], start: Int, end: Int): CharSequence = - { - require(0 <= start && start <= end && end <= buffer.length) - new Decode_Chars(decode, buffer, start, end) - } - - /* shell processes */ def raw_execute(cwd: JFile, env: Map[String, String], redirect: Boolean, args: String*): Process = diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/System/utf8.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Pure/System/utf8.scala Sun Nov 25 20:17:04 2012 +0100 @@ -0,0 +1,92 @@ +/* Title: Pure/System/utf8.scala + Module: PIDE + Author: Makarius + +Variations on UTF-8. +*/ + +package isabelle + + +import java.nio.charset.Charset +import scala.io.Codec + + +object UTF8 +{ + /* charset */ + + val charset_name: String = "UTF-8" + val charset: Charset = Charset.forName(charset_name) + def codec(): Codec = Codec(charset) + + def string_bytes(s: String): Array[Byte] = s.getBytes(charset) + + + /* permissive UTF-8 decoding */ + + // see also http://en.wikipedia.org/wiki/UTF-8#Description + // overlong encodings enable byte-stuffing + + def decode_permissive(text: CharSequence): String = + { + val buf = new java.lang.StringBuilder(text.length) + var code = -1 + var rest = 0 + def flush() + { + if (code != -1) { + if (rest == 0 && Character.isValidCodePoint(code)) + buf.appendCodePoint(code) + else buf.append('\uFFFD') + code = -1 + rest = 0 + } + } + def init(x: Int, n: Int) + { + flush() + code = x + rest = n + } + def push(x: Int) + { + if (rest <= 0) init(x, -1) + else { + code <<= 6 + code += x + rest -= 1 + } + } + for (i <- 0 until text.length) { + val c = text.charAt(i) + if (c < 128) { flush(); buf.append(c) } + else if ((c & 0xC0) == 0x80) push(c & 0x3F) + else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) + else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) + else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) + } + flush() + buf.toString + } + + private class Decode_Chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int) extends CharSequence + { + def length: Int = end - start + def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] + def subSequence(i: Int, j: Int): CharSequence = + new Decode_Chars(decode, buffer, start + i, start + j) + + // toString with adhoc decoding: abuse of CharSequence interface + override def toString: String = decode(decode_permissive(this)) + } + + def decode_chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int): CharSequence = + { + require(0 <= start && start <= end && end <= buffer.length) + new Decode_Chars(decode, buffer, start, end) + } +} + diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Pure/build-jars --- a/src/Pure/build-jars Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Pure/build-jars Sun Nov 25 20:17:04 2012 +0100 @@ -56,6 +56,7 @@ System/standard_system.scala System/swing_thread.scala System/system_channel.scala + System/utf8.scala Thy/completion.scala Thy/html.scala Thy/thy_header.scala diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Tools/jEdit/src/isabelle_encoding.scala --- a/src/Tools/jEdit/src/isabelle_encoding.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Tools/jEdit/src/isabelle_encoding.scala Sun Nov 25 20:17:04 2012 +0100 @@ -37,12 +37,11 @@ new CharArrayReader(Symbol.decode(source.mkString).toArray) } - override def getTextReader(in: InputStream): Reader = - text_reader(in, Standard_System.codec()) + override def getTextReader(in: InputStream): Reader = text_reader(in, UTF8.codec()) override def getPermissiveTextReader(in: InputStream): Reader = { - val codec = Standard_System.codec() + val codec = UTF8.codec() codec.onMalformedInput(CodingErrorAction.REPLACE) codec.onUnmappableCharacter(CodingErrorAction.REPLACE) text_reader(in, codec) @@ -53,12 +52,12 @@ val buffer = new ByteArrayOutputStream(BUFSIZE) { override def flush() { - val text = Symbol.encode(toString(Standard_System.charset_name)) - out.write(text.getBytes(Standard_System.charset)) + val text = Symbol.encode(toString(UTF8.charset_name)) + out.write(text.getBytes(UTF8.charset)) out.flush() } override def close() { out.close() } } - new OutputStreamWriter(buffer, Standard_System.charset.newEncoder()) + new OutputStreamWriter(buffer, UTF8.charset.newEncoder()) } } diff -r ec0f2f8dbeb9 -r 00d8ad713e32 src/Tools/jEdit/src/scala_console.scala --- a/src/Tools/jEdit/src/scala_console.scala Sun Nov 25 19:55:42 2012 +0100 +++ b/src/Tools/jEdit/src/scala_console.scala Sun Nov 25 20:17:04 2012 +0100 @@ -64,7 +64,7 @@ val buf = new StringBuilder override def flush() { - val str = Standard_System.decode_permissive_utf8(buf.toString) + val str = UTF8.decode_permissive(buf.toString) buf.clear if (global_out == null) System.out.print(str) else Swing_Thread.now { global_out.writeAttrs(null, str) }