# HG changeset patch # User wenzelm # Date 1482315706 -3600 # Node ID bad5de3f9554e168cd2c0ad1315f62785aab3c1b # Parent 83f012ce256791a0ee559db4989d1a4701dbc0a4 clarified directories; diff -r 83f012ce2567 -r bad5de3f9554 src/Pure/General/utf8.scala --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/Pure/General/utf8.scala Wed Dec 21 11:21:46 2016 +0100 @@ -0,0 +1,99 @@ +/* Title: Pure/General/utf8.scala + Author: Makarius + +Variations on UTF-8. +*/ + +package isabelle + + +import java.nio.charset.Charset +import scala.io.Codec + + +object UTF8 +{ + /* charset */ + + val charset_name: String = "UTF-8" + val charset: Charset = Charset.forName(charset_name) + def codec(): Codec = Codec(charset) + + def bytes(s: String): Array[Byte] = s.getBytes(charset) + + object Length extends Codepoint.Length + { + override def codepoint_length(c: Int): Int = + if (c < 0x80) 1 + else if (c < 0x800) 2 + else if (c < 0x10000) 3 + else 4 + } + + + /* permissive UTF-8 decoding */ + + // see also http://en.wikipedia.org/wiki/UTF-8#Description + // overlong encodings enable byte-stuffing of low-ASCII + + def decode_permissive(text: CharSequence): String = + { + val buf = new java.lang.StringBuilder(text.length) + var code = -1 + var rest = 0 + def flush() + { + if (code != -1) { + if (rest == 0 && Character.isValidCodePoint(code)) + buf.appendCodePoint(code) + else buf.append('\uFFFD') + code = -1 + rest = 0 + } + } + def init(x: Int, n: Int) + { + flush() + code = x + rest = n + } + def push(x: Int) + { + if (rest <= 0) init(x, -1) + else { + code <<= 6 + code += x + rest -= 1 + } + } + for (i <- 0 until text.length) { + val c = text.charAt(i) + if (c < 128) { flush(); buf.append(c) } + else if ((c & 0xC0) == 0x80) push(c & 0x3F) + else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) + else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) + else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) + } + flush() + buf.toString + } + + private class Decode_Chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int) extends CharSequence + { + def length: Int = end - start + def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] + def subSequence(i: Int, j: Int): CharSequence = + new Decode_Chars(decode, buffer, start + i, start + j) + + // toString with adhoc decoding: abuse of CharSequence interface + override def toString: String = decode(decode_permissive(this)) + } + + def decode_chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int): CharSequence = + { + require(0 <= start && start <= end && end <= buffer.length) + new Decode_Chars(decode, buffer, start, end) + } +} diff -r 83f012ce2567 -r bad5de3f9554 src/Pure/System/utf8.scala --- a/src/Pure/System/utf8.scala Tue Dec 20 22:32:04 2016 +0100 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000 @@ -1,99 +0,0 @@ -/* Title: Pure/System/utf8.scala - Author: Makarius - -Variations on UTF-8. -*/ - -package isabelle - - -import java.nio.charset.Charset -import scala.io.Codec - - -object UTF8 -{ - /* charset */ - - val charset_name: String = "UTF-8" - val charset: Charset = Charset.forName(charset_name) - def codec(): Codec = Codec(charset) - - def bytes(s: String): Array[Byte] = s.getBytes(charset) - - object Length extends Codepoint.Length - { - override def codepoint_length(c: Int): Int = - if (c < 0x80) 1 - else if (c < 0x800) 2 - else if (c < 0x10000) 3 - else 4 - } - - - /* permissive UTF-8 decoding */ - - // see also http://en.wikipedia.org/wiki/UTF-8#Description - // overlong encodings enable byte-stuffing of low-ASCII - - def decode_permissive(text: CharSequence): String = - { - val buf = new java.lang.StringBuilder(text.length) - var code = -1 - var rest = 0 - def flush() - { - if (code != -1) { - if (rest == 0 && Character.isValidCodePoint(code)) - buf.appendCodePoint(code) - else buf.append('\uFFFD') - code = -1 - rest = 0 - } - } - def init(x: Int, n: Int) - { - flush() - code = x - rest = n - } - def push(x: Int) - { - if (rest <= 0) init(x, -1) - else { - code <<= 6 - code += x - rest -= 1 - } - } - for (i <- 0 until text.length) { - val c = text.charAt(i) - if (c < 128) { flush(); buf.append(c) } - else if ((c & 0xC0) == 0x80) push(c & 0x3F) - else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) - else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) - else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) - } - flush() - buf.toString - } - - private class Decode_Chars(decode: String => String, - buffer: Array[Byte], start: Int, end: Int) extends CharSequence - { - def length: Int = end - start - def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] - def subSequence(i: Int, j: Int): CharSequence = - new Decode_Chars(decode, buffer, start + i, start + j) - - // toString with adhoc decoding: abuse of CharSequence interface - override def toString: String = decode(decode_permissive(this)) - } - - def decode_chars(decode: String => String, - buffer: Array[Byte], start: Int, end: Int): CharSequence = - { - require(0 <= start && start <= end && end <= buffer.length) - new Decode_Chars(decode, buffer, start, end) - } -} diff -r 83f012ce2567 -r bad5de3f9554 src/Pure/build-jars --- a/src/Pure/build-jars Tue Dec 20 22:32:04 2016 +0100 +++ b/src/Pure/build-jars Wed Dec 21 11:21:46 2016 +0100 @@ -70,6 +70,7 @@ General/timing.scala General/untyped.scala General/url.scala + General/utf8.scala General/value.scala General/word.scala General/xz.scala @@ -118,7 +119,6 @@ System/process_result.scala System/progress.scala System/system_channel.scala - System/utf8.scala Thy/html.scala Thy/present.scala Thy/sessions.scala