diff -r 83f012ce2567 -r bad5de3f9554 src/Pure/General/utf8.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Pure/General/utf8.scala Wed Dec 21 11:21:46 2016 +0100 @@ -0,0 +1,99 @@ +/* Title: Pure/General/utf8.scala + Author: Makarius + +Variations on UTF-8. +*/ + +package isabelle + + +import java.nio.charset.Charset +import scala.io.Codec + + +object UTF8 +{ + /* charset */ + + val charset_name: String = "UTF-8" + val charset: Charset = Charset.forName(charset_name) + def codec(): Codec = Codec(charset) + + def bytes(s: String): Array[Byte] = s.getBytes(charset) + + object Length extends Codepoint.Length + { + override def codepoint_length(c: Int): Int = + if (c < 0x80) 1 + else if (c < 0x800) 2 + else if (c < 0x10000) 3 + else 4 + } + + + /* permissive UTF-8 decoding */ + + // see also http://en.wikipedia.org/wiki/UTF-8#Description + // overlong encodings enable byte-stuffing of low-ASCII + + def decode_permissive(text: CharSequence): String = + { + val buf = new java.lang.StringBuilder(text.length) + var code = -1 + var rest = 0 + def flush() + { + if (code != -1) { + if (rest == 0 && Character.isValidCodePoint(code)) + buf.appendCodePoint(code) + else buf.append('\uFFFD') + code = -1 + rest = 0 + } + } + def init(x: Int, n: Int) + { + flush() + code = x + rest = n + } + def push(x: Int) + { + if (rest <= 0) init(x, -1) + else { + code <<= 6 + code += x + rest -= 1 + } + } + for (i <- 0 until text.length) { + val c = text.charAt(i) + if (c < 128) { flush(); buf.append(c) } + else if ((c & 0xC0) == 0x80) push(c & 0x3F) + else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) + else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) + else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) + } + flush() + buf.toString + } + + private class Decode_Chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int) extends CharSequence + { + def length: Int = end - start + def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] + def subSequence(i: Int, j: Int): CharSequence = + new Decode_Chars(decode, buffer, start + i, start + j) + + // toString with adhoc decoding: abuse of CharSequence interface + override def toString: String = decode(decode_permissive(this)) + } + + def decode_chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int): CharSequence = + { + require(0 <= start && start <= end && end <= buffer.length) + new Decode_Chars(decode, buffer, start, end) + } +}