src/Pure/General/utf8.scala
changeset 64639 bad5de3f9554
parent 64617 01e50039edc9
child 65196 e8760a98db78
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/General/utf8.scala	Wed Dec 21 11:21:46 2016 +0100
@@ -0,0 +1,99 @@
+/*  Title:      Pure/General/utf8.scala
+    Author:     Makarius
+
+Variations on UTF-8.
+*/
+
+package isabelle
+
+
+import java.nio.charset.Charset
+import scala.io.Codec
+
+
+object UTF8
+{
+  /* charset */
+
+  val charset_name: String = "UTF-8"
+  val charset: Charset = Charset.forName(charset_name)
+  def codec(): Codec = Codec(charset)
+
+  def bytes(s: String): Array[Byte] = s.getBytes(charset)
+
+  object Length extends Codepoint.Length
+  {
+    override def codepoint_length(c: Int): Int =
+      if (c < 0x80) 1
+      else if (c < 0x800) 2
+      else if (c < 0x10000) 3
+      else 4
+  }
+
+
+  /* permissive UTF-8 decoding */
+
+  // see also http://en.wikipedia.org/wiki/UTF-8#Description
+  // overlong encodings enable byte-stuffing of low-ASCII
+
+  def decode_permissive(text: CharSequence): String =
+  {
+    val buf = new java.lang.StringBuilder(text.length)
+    var code = -1
+    var rest = 0
+    def flush()
+    {
+      if (code != -1) {
+        if (rest == 0 && Character.isValidCodePoint(code))
+          buf.appendCodePoint(code)
+        else buf.append('\uFFFD')
+        code = -1
+        rest = 0
+      }
+    }
+    def init(x: Int, n: Int)
+    {
+      flush()
+      code = x
+      rest = n
+    }
+    def push(x: Int)
+    {
+      if (rest <= 0) init(x, -1)
+      else {
+        code <<= 6
+        code += x
+        rest -= 1
+      }
+    }
+    for (i <- 0 until text.length) {
+      val c = text.charAt(i)
+      if (c < 128) { flush(); buf.append(c) }
+      else if ((c & 0xC0) == 0x80) push(c & 0x3F)
+      else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
+      else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
+      else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
+    }
+    flush()
+    buf.toString
+  }
+
+  private class Decode_Chars(decode: String => String,
+    buffer: Array[Byte], start: Int, end: Int) extends CharSequence
+  {
+    def length: Int = end - start
+    def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
+    def subSequence(i: Int, j: Int): CharSequence =
+      new Decode_Chars(decode, buffer, start + i, start + j)
+
+    // toString with adhoc decoding: abuse of CharSequence interface
+    override def toString: String = decode(decode_permissive(this))
+  }
+
+  def decode_chars(decode: String => String,
+    buffer: Array[Byte], start: Int, end: Int): CharSequence =
+  {
+    require(0 <= start && start <= end && end <= buffer.length)
+    new Decode_Chars(decode, buffer, start, end)
+  }
+}