# HG changeset patch # User wenzelm # Date 1261951756 -3600 # Node ID b6960fc09ef3795240841f59a0917b6edd9c032c # Parent dfcf667bbfedd442aaadd26211c3316161517163 factored-out Library.decode_permissive_utf8; diff -r dfcf667bbfed -r b6960fc09ef3 src/Pure/General/yxml.scala --- a/src/Pure/General/yxml.scala Sun Dec 27 22:36:47 2009 +0100 +++ b/src/Pure/General/yxml.scala Sun Dec 27 23:09:16 2009 +0100 @@ -51,48 +51,7 @@ new Decode_Chars(decode, buffer, start + i, start + j) // toString with adhoc decoding: abuse of CharSequence interface - // see also http://en.wikipedia.org/wiki/UTF-8#Description - override def toString: String = - { - val buf = new java.lang.StringBuilder(length) - var code = -1 - var rest = 0 - def flush() - { - if (code != -1) { - if (rest == 0 && Character.isValidCodePoint(code)) - buf.appendCodePoint(code) - else buf.append('\uFFFD') - code = -1 - rest = 0 - } - } - def init(x: Int, n: Int) - { - flush() - code = x - rest = n - } - def push(x: Int) - { - if (rest <= 0) init(x, -1) - else { - code <<= 6 - code += x - rest -= 1 - } - } - for (i <- 0 until length) { - val c = charAt(i) - if (c < 128) { flush(); buf.append(c) } - else if ((c & 0xC0) == 0x80) push(c & 0x3F) - else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) - else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) - else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) - } - flush() - decode(buf.toString) - } + override def toString: String = decode(Library.decode_permissive_utf8(this)) } def decode_chars(decode: String => String, diff -r dfcf667bbfed -r b6960fc09ef3 src/Pure/library.scala --- a/src/Pure/library.scala Sun Dec 27 22:36:47 2009 +0100 +++ b/src/Pure/library.scala Sun Dec 27 23:09:16 2009 +0100 @@ -36,6 +36,52 @@ } + /* permissive UTF-8 decoding */ + + // see also http://en.wikipedia.org/wiki/UTF-8#Description + def decode_permissive_utf8(text: CharSequence): String = + { + val buf = new java.lang.StringBuilder(text.length) + var code = -1 + var rest = 0 + def flush() + { + if (code != -1) { + if (rest == 0 && Character.isValidCodePoint(code)) + buf.appendCodePoint(code) + else buf.append('\uFFFD') + code = -1 + rest = 0 + } + } + def init(x: Int, n: Int) + { + flush() + code = x + rest = n + } + def push(x: Int) + { + if (rest <= 0) init(x, -1) + else { + code <<= 6 + code += x + rest -= 1 + } + } + for (i <- 0 until text.length) { + val c = text.charAt(i) + if (c < 128) { flush(); buf.append(c) } + else if ((c & 0xC0) == 0x80) push(c & 0x3F) + else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) + else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) + else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) + } + flush() + buf.toString + } + + /* timing */ def timeit[A](e: => A) =