diff -r 2b9cdf23c188 -r 2541de190d92 src/Pure/General/yxml.scala --- a/src/Pure/General/yxml.scala Thu Dec 17 19:30:12 2009 +0100 +++ b/src/Pure/General/yxml.scala Thu Dec 17 20:09:19 2009 +0100 @@ -19,7 +19,8 @@ /* iterate over chunks (resembles space_explode in ML) */ - private def chunks(sep: Char, source: CharSequence) = new Iterator[CharSequence] { + private def chunks(sep: Char, source: CharSequence) = new Iterator[CharSequence] + { private val end = source.length private var state = if (end == 0) None else get_chunk(-1) private def get_chunk(i: Int) = @@ -39,6 +40,68 @@ } + /* decoding pseudo UTF-8 */ + + private class Decode_Chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int) extends CharSequence + { + def length: Int = end - start + def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char] + def subSequence(i: Int, j: Int): CharSequence = + new Decode_Chars(decode, buffer, start + i, start + j) + + // toString with adhoc decoding: abuse of CharSequence interface + // see also http://en.wikipedia.org/wiki/UTF-8#Description + override def toString: String = + { + val buf = new java.lang.StringBuilder(length) + var code = -1 + var rest = 0 + def flush() + { + if (code != -1) { + if (rest == 0) buf.appendCodePoint(code) + else buf.append('\uFFFD') + code = -1 + rest = 0 + } + } + def init(x: Int, n: Int) + { + flush() + code = x + rest = n + } + def push(x: Int) + { + if (rest <= 0) init(x, -1) + else { + code <<= 6 + code += x + rest -= 1 + } + } + for (i <- 0 until length) { + val c = charAt(i) + if (c < 128) { flush(); buf.append(c) } + else if ((c & 0xC0) == 0x80) push(c & 0x3F) + else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1) + else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2) + else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3) + } + flush() + decode(buf.toString) + } + } + + def decode_chars(decode: String => String, + buffer: Array[Byte], start: Int, end: Int): CharSequence = + { + require(0 <= start && start <= end && end <= buffer.length) + new Decode_Chars(decode, buffer, start, end) + } + + /* parsing */ private def err(msg: String) = error("Malformed YXML: " + msg) @@ -80,11 +143,12 @@ /* parse chunks */ stack = List((("", Nil), Nil)) - for (chunk <- chunks(X, source) if chunk != "") { - if (chunk == Y_string) pop() + for (chunk <- chunks(X, source) if chunk.length != 0) { + if (chunk.length == 1 && chunk.charAt(0) == Y) pop() else { chunks(Y, chunk).toList match { - case "" :: name :: atts => push(name.toString.intern, atts.map(parse_attrib)) + case ch :: name :: atts if ch.length == 0 => + push(name.toString.intern, atts.map(parse_attrib)) case txts => for (txt <- txts) add(XML.Text(txt.toString)) } }