| 
64639
 | 
     1  | 
/*  Title:      Pure/General/utf8.scala
  | 
| 
50203
 | 
     2  | 
    Author:     Makarius
  | 
| 
 | 
     3  | 
  | 
| 
 | 
     4  | 
Variations on UTF-8.
  | 
| 
 | 
     5  | 
*/
  | 
| 
 | 
     6  | 
  | 
| 
 | 
     7  | 
package isabelle
  | 
| 
 | 
     8  | 
  | 
| 
 | 
     9  | 
  | 
| 
 | 
    10  | 
import java.nio.charset.Charset
  | 
| 
 | 
    11  | 
import scala.io.Codec
  | 
| 
 | 
    12  | 
  | 
| 
 | 
    13  | 
  | 
| 
 | 
    14  | 
object UTF8
  | 
| 
 | 
    15  | 
{
 | 
| 
 | 
    16  | 
  /* charset */
  | 
| 
 | 
    17  | 
  | 
| 
 | 
    18  | 
  val charset_name: String = "UTF-8"
  | 
| 
 | 
    19  | 
  val charset: Charset = Charset.forName(charset_name)
  | 
| 
 | 
    20  | 
  def codec(): Codec = Codec(charset)
  | 
| 
 | 
    21  | 
  | 
| 
62527
 | 
    22  | 
  def bytes(s: String): Array[Byte] = s.getBytes(charset)
  | 
| 
 | 
    23  | 
  | 
| 
50203
 | 
    24  | 
  | 
| 
 | 
    25  | 
  /* permissive UTF-8 decoding */
  | 
| 
 | 
    26  | 
  | 
| 
68224
 | 
    27  | 
  // see also https://en.wikipedia.org/wiki/UTF-8#Description
  | 
| 
54444
 | 
    28  | 
  // overlong encodings enable byte-stuffing of low-ASCII
  | 
| 
50203
 | 
    29  | 
  | 
| 
 | 
    30  | 
  def decode_permissive(text: CharSequence): String =
  | 
| 
 | 
    31  | 
  {
 | 
| 
 | 
    32  | 
    val buf = new java.lang.StringBuilder(text.length)
  | 
| 
 | 
    33  | 
    var code = -1
  | 
| 
 | 
    34  | 
    var rest = 0
  | 
| 
 | 
    35  | 
    def flush()
  | 
| 
 | 
    36  | 
    {
 | 
| 
 | 
    37  | 
      if (code != -1) {
 | 
| 
 | 
    38  | 
        if (rest == 0 && Character.isValidCodePoint(code))
  | 
| 
 | 
    39  | 
          buf.appendCodePoint(code)
  | 
| 
 | 
    40  | 
        else buf.append('\uFFFD')
 | 
| 
 | 
    41  | 
        code = -1
  | 
| 
 | 
    42  | 
        rest = 0
  | 
| 
 | 
    43  | 
      }
  | 
| 
 | 
    44  | 
    }
  | 
| 
 | 
    45  | 
    def init(x: Int, n: Int)
  | 
| 
 | 
    46  | 
    {
 | 
| 
 | 
    47  | 
      flush()
  | 
| 
 | 
    48  | 
      code = x
  | 
| 
 | 
    49  | 
      rest = n
  | 
| 
 | 
    50  | 
    }
  | 
| 
 | 
    51  | 
    def push(x: Int)
  | 
| 
 | 
    52  | 
    {
 | 
| 
 | 
    53  | 
      if (rest <= 0) init(x, -1)
  | 
| 
 | 
    54  | 
      else {
 | 
| 
 | 
    55  | 
        code <<= 6
  | 
| 
 | 
    56  | 
        code += x
  | 
| 
 | 
    57  | 
        rest -= 1
  | 
| 
 | 
    58  | 
      }
  | 
| 
 | 
    59  | 
    }
  | 
| 
 | 
    60  | 
    for (i <- 0 until text.length) {
 | 
| 
 | 
    61  | 
      val c = text.charAt(i)
  | 
| 
 | 
    62  | 
      if (c < 128) { flush(); buf.append(c) }
 | 
| 
 | 
    63  | 
      else if ((c & 0xC0) == 0x80) push(c & 0x3F)
  | 
| 
 | 
    64  | 
      else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
  | 
| 
 | 
    65  | 
      else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
  | 
| 
 | 
    66  | 
      else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
  | 
| 
 | 
    67  | 
    }
  | 
| 
 | 
    68  | 
    flush()
  | 
| 
 | 
    69  | 
    buf.toString
  | 
| 
 | 
    70  | 
  }
  | 
| 
 | 
    71  | 
  | 
| 
 | 
    72  | 
  private class Decode_Chars(decode: String => String,
  | 
| 
 | 
    73  | 
    buffer: Array[Byte], start: Int, end: Int) extends CharSequence
  | 
| 
 | 
    74  | 
  {
 | 
| 
 | 
    75  | 
    def length: Int = end - start
  | 
| 
 | 
    76  | 
    def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
  | 
| 
 | 
    77  | 
    def subSequence(i: Int, j: Int): CharSequence =
  | 
| 
 | 
    78  | 
      new Decode_Chars(decode, buffer, start + i, start + j)
  | 
| 
 | 
    79  | 
  | 
| 
 | 
    80  | 
    // toString with adhoc decoding: abuse of CharSequence interface
  | 
| 
 | 
    81  | 
    override def toString: String = decode(decode_permissive(this))
  | 
| 
 | 
    82  | 
  }
  | 
| 
 | 
    83  | 
  | 
| 
 | 
    84  | 
  def decode_chars(decode: String => String,
  | 
| 
 | 
    85  | 
    buffer: Array[Byte], start: Int, end: Int): CharSequence =
  | 
| 
 | 
    86  | 
  {
 | 
| 
 | 
    87  | 
    require(0 <= start && start <= end && end <= buffer.length)
  | 
| 
 | 
    88  | 
    new Decode_Chars(decode, buffer, start, end)
  | 
| 
 | 
    89  | 
  }
  | 
| 
 | 
    90  | 
}
  |