src/Pure/System/utf8.scala
changeset 64669 ce441970956f
parent 64638 235df39ade87
parent 64668 39a6c88c059b
child 64670 f77b946d18aa
equal deleted inserted replaced
64638:235df39ade87 64669:ce441970956f
     1 /*  Title:      Pure/System/utf8.scala
       
     2     Author:     Makarius
       
     3 
       
     4 Variations on UTF-8.
       
     5 */
       
     6 
       
     7 package isabelle
       
     8 
       
     9 
       
    10 import java.nio.charset.Charset
       
    11 import scala.io.Codec
       
    12 
       
    13 
       
    14 object UTF8
       
    15 {
       
    16   /* charset */
       
    17 
       
    18   val charset_name: String = "UTF-8"
       
    19   val charset: Charset = Charset.forName(charset_name)
       
    20   def codec(): Codec = Codec(charset)
       
    21 
       
    22   def bytes(s: String): Array[Byte] = s.getBytes(charset)
       
    23 
       
    24   object Length extends Codepoint.Length
       
    25   {
       
    26     override def codepoint_length(c: Int): Int =
       
    27       if (c < 0x80) 1
       
    28       else if (c < 0x800) 2
       
    29       else if (c < 0x10000) 3
       
    30       else 4
       
    31   }
       
    32 
       
    33 
       
    34   /* permissive UTF-8 decoding */
       
    35 
       
    36   // see also http://en.wikipedia.org/wiki/UTF-8#Description
       
    37   // overlong encodings enable byte-stuffing of low-ASCII
       
    38 
       
    39   def decode_permissive(text: CharSequence): String =
       
    40   {
       
    41     val buf = new java.lang.StringBuilder(text.length)
       
    42     var code = -1
       
    43     var rest = 0
       
    44     def flush()
       
    45     {
       
    46       if (code != -1) {
       
    47         if (rest == 0 && Character.isValidCodePoint(code))
       
    48           buf.appendCodePoint(code)
       
    49         else buf.append('\uFFFD')
       
    50         code = -1
       
    51         rest = 0
       
    52       }
       
    53     }
       
    54     def init(x: Int, n: Int)
       
    55     {
       
    56       flush()
       
    57       code = x
       
    58       rest = n
       
    59     }
       
    60     def push(x: Int)
       
    61     {
       
    62       if (rest <= 0) init(x, -1)
       
    63       else {
       
    64         code <<= 6
       
    65         code += x
       
    66         rest -= 1
       
    67       }
       
    68     }
       
    69     for (i <- 0 until text.length) {
       
    70       val c = text.charAt(i)
       
    71       if (c < 128) { flush(); buf.append(c) }
       
    72       else if ((c & 0xC0) == 0x80) push(c & 0x3F)
       
    73       else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
       
    74       else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
       
    75       else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
       
    76     }
       
    77     flush()
       
    78     buf.toString
       
    79   }
       
    80 
       
    81   private class Decode_Chars(decode: String => String,
       
    82     buffer: Array[Byte], start: Int, end: Int) extends CharSequence
       
    83   {
       
    84     def length: Int = end - start
       
    85     def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
       
    86     def subSequence(i: Int, j: Int): CharSequence =
       
    87       new Decode_Chars(decode, buffer, start + i, start + j)
       
    88 
       
    89     // toString with adhoc decoding: abuse of CharSequence interface
       
    90     override def toString: String = decode(decode_permissive(this))
       
    91   }
       
    92 
       
    93   def decode_chars(decode: String => String,
       
    94     buffer: Array[Byte], start: Int, end: Int): CharSequence =
       
    95   {
       
    96     require(0 <= start && start <= end && end <= buffer.length)
       
    97     new Decode_Chars(decode, buffer, start, end)
       
    98   }
       
    99 }