src/Pure/System/utf8.scala
author wenzelm
Mon Jul 15 10:25:35 2013 +0200 (2013-07-15 ago)
changeset 52655 3b2b1ef13979
parent 50203 00d8ad713e32
child 54440 2c4940d2edf7
permissions -rw-r--r--
more careful termination of removed execs, leaving running execs undisturbed;
     1 /*  Title:      Pure/System/utf8.scala
     2     Module:     PIDE
     3     Author:     Makarius
     4 
     5 Variations on UTF-8.
     6 */
     7 
     8 package isabelle
     9 
    10 
    11 import java.nio.charset.Charset
    12 import scala.io.Codec
    13 
    14 
    15 object UTF8
    16 {
    17   /* charset */
    18 
    19   val charset_name: String = "UTF-8"
    20   val charset: Charset = Charset.forName(charset_name)
    21   def codec(): Codec = Codec(charset)
    22 
    23   def string_bytes(s: String): Array[Byte] = s.getBytes(charset)
    24 
    25 
    26   /* permissive UTF-8 decoding */
    27 
    28   // see also http://en.wikipedia.org/wiki/UTF-8#Description
    29   // overlong encodings enable byte-stuffing
    30 
    31   def decode_permissive(text: CharSequence): String =
    32   {
    33     val buf = new java.lang.StringBuilder(text.length)
    34     var code = -1
    35     var rest = 0
    36     def flush()
    37     {
    38       if (code != -1) {
    39         if (rest == 0 && Character.isValidCodePoint(code))
    40           buf.appendCodePoint(code)
    41         else buf.append('\uFFFD')
    42         code = -1
    43         rest = 0
    44       }
    45     }
    46     def init(x: Int, n: Int)
    47     {
    48       flush()
    49       code = x
    50       rest = n
    51     }
    52     def push(x: Int)
    53     {
    54       if (rest <= 0) init(x, -1)
    55       else {
    56         code <<= 6
    57         code += x
    58         rest -= 1
    59       }
    60     }
    61     for (i <- 0 until text.length) {
    62       val c = text.charAt(i)
    63       if (c < 128) { flush(); buf.append(c) }
    64       else if ((c & 0xC0) == 0x80) push(c & 0x3F)
    65       else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
    66       else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
    67       else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
    68     }
    69     flush()
    70     buf.toString
    71   }
    72 
    73   private class Decode_Chars(decode: String => String,
    74     buffer: Array[Byte], start: Int, end: Int) extends CharSequence
    75   {
    76     def length: Int = end - start
    77     def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
    78     def subSequence(i: Int, j: Int): CharSequence =
    79       new Decode_Chars(decode, buffer, start + i, start + j)
    80 
    81     // toString with adhoc decoding: abuse of CharSequence interface
    82     override def toString: String = decode(decode_permissive(this))
    83   }
    84 
    85   def decode_chars(decode: String => String,
    86     buffer: Array[Byte], start: Int, end: Int): CharSequence =
    87   {
    88     require(0 <= start && start <= end && end <= buffer.length)
    89     new Decode_Chars(decode, buffer, start, end)
    90   }
    91 }
    92