| 64639 |      1 | /*  Title:      Pure/General/utf8.scala
 | 
| 50203 |      2 |     Author:     Makarius
 | 
|  |      3 | 
 | 
|  |      4 | Variations on UTF-8.
 | 
|  |      5 | */
 | 
|  |      6 | 
 | 
|  |      7 | package isabelle
 | 
|  |      8 | 
 | 
|  |      9 | 
 | 
|  |     10 | import java.nio.charset.Charset
 | 
|  |     11 | import scala.io.Codec
 | 
|  |     12 | 
 | 
|  |     13 | 
 | 
|  |     14 | object UTF8
 | 
|  |     15 | {
 | 
|  |     16 |   /* charset */
 | 
|  |     17 | 
 | 
|  |     18 |   val charset_name: String = "UTF-8"
 | 
|  |     19 |   val charset: Charset = Charset.forName(charset_name)
 | 
|  |     20 |   def codec(): Codec = Codec(charset)
 | 
|  |     21 | 
 | 
| 62527 |     22 |   def bytes(s: String): Array[Byte] = s.getBytes(charset)
 | 
|  |     23 | 
 | 
| 50203 |     24 | 
 | 
|  |     25 |   /* permissive UTF-8 decoding */
 | 
|  |     26 | 
 | 
| 68224 |     27 |   // see also https://en.wikipedia.org/wiki/UTF-8#Description
 | 
| 54444 |     28 |   // overlong encodings enable byte-stuffing of low-ASCII
 | 
| 50203 |     29 | 
 | 
|  |     30 |   def decode_permissive(text: CharSequence): String =
 | 
|  |     31 |   {
 | 
|  |     32 |     val buf = new java.lang.StringBuilder(text.length)
 | 
|  |     33 |     var code = -1
 | 
|  |     34 |     var rest = 0
 | 
|  |     35 |     def flush()
 | 
|  |     36 |     {
 | 
|  |     37 |       if (code != -1) {
 | 
|  |     38 |         if (rest == 0 && Character.isValidCodePoint(code))
 | 
|  |     39 |           buf.appendCodePoint(code)
 | 
|  |     40 |         else buf.append('\uFFFD')
 | 
|  |     41 |         code = -1
 | 
|  |     42 |         rest = 0
 | 
|  |     43 |       }
 | 
|  |     44 |     }
 | 
|  |     45 |     def init(x: Int, n: Int)
 | 
|  |     46 |     {
 | 
|  |     47 |       flush()
 | 
|  |     48 |       code = x
 | 
|  |     49 |       rest = n
 | 
|  |     50 |     }
 | 
|  |     51 |     def push(x: Int)
 | 
|  |     52 |     {
 | 
|  |     53 |       if (rest <= 0) init(x, -1)
 | 
|  |     54 |       else {
 | 
|  |     55 |         code <<= 6
 | 
|  |     56 |         code += x
 | 
|  |     57 |         rest -= 1
 | 
|  |     58 |       }
 | 
|  |     59 |     }
 | 
|  |     60 |     for (i <- 0 until text.length) {
 | 
|  |     61 |       val c = text.charAt(i)
 | 
|  |     62 |       if (c < 128) { flush(); buf.append(c) }
 | 
|  |     63 |       else if ((c & 0xC0) == 0x80) push(c & 0x3F)
 | 
|  |     64 |       else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
 | 
|  |     65 |       else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
 | 
|  |     66 |       else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
 | 
|  |     67 |     }
 | 
|  |     68 |     flush()
 | 
|  |     69 |     buf.toString
 | 
|  |     70 |   }
 | 
|  |     71 | 
 | 
|  |     72 |   private class Decode_Chars(decode: String => String,
 | 
|  |     73 |     buffer: Array[Byte], start: Int, end: Int) extends CharSequence
 | 
|  |     74 |   {
 | 
|  |     75 |     def length: Int = end - start
 | 
|  |     76 |     def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
 | 
|  |     77 |     def subSequence(i: Int, j: Int): CharSequence =
 | 
|  |     78 |       new Decode_Chars(decode, buffer, start + i, start + j)
 | 
|  |     79 | 
 | 
|  |     80 |     // toString with adhoc decoding: abuse of CharSequence interface
 | 
|  |     81 |     override def toString: String = decode(decode_permissive(this))
 | 
|  |     82 |   }
 | 
|  |     83 | 
 | 
|  |     84 |   def decode_chars(decode: String => String,
 | 
|  |     85 |     buffer: Array[Byte], start: Int, end: Int): CharSequence =
 | 
|  |     86 |   {
 | 
|  |     87 |     require(0 <= start && start <= end && end <= buffer.length)
 | 
|  |     88 |     new Decode_Chars(decode, buffer, start, end)
 | 
|  |     89 |   }
 | 
|  |     90 | }
 |