src/Pure/General/yxml.scala
changeset 34099 2541de190d92
parent 32450 375db037f4d2
child 34118 ee9f87e11400
--- a/src/Pure/General/yxml.scala	Thu Dec 17 19:30:12 2009 +0100
+++ b/src/Pure/General/yxml.scala	Thu Dec 17 20:09:19 2009 +0100
@@ -19,7 +19,8 @@
 
   /* iterate over chunks (resembles space_explode in ML) */
 
-  private def chunks(sep: Char, source: CharSequence) = new Iterator[CharSequence] {
+  private def chunks(sep: Char, source: CharSequence) = new Iterator[CharSequence]
+  {
     private val end = source.length
     private var state = if (end == 0) None else get_chunk(-1)
     private def get_chunk(i: Int) =
@@ -39,6 +40,68 @@
   }
 
 
+  /* decoding pseudo UTF-8 */
+
+  private class Decode_Chars(decode: String => String,
+    buffer: Array[Byte], start: Int, end: Int) extends CharSequence
+  {
+    def length: Int = end - start
+    def charAt(i: Int): Char = (buffer(start + i).asInstanceOf[Int] & 0xFF).asInstanceOf[Char]
+    def subSequence(i: Int, j: Int): CharSequence =
+      new Decode_Chars(decode, buffer, start + i, start + j)
+
+    // toString with adhoc decoding: abuse of CharSequence interface
+    // see also http://en.wikipedia.org/wiki/UTF-8#Description
+    override def toString: String =
+    {
+      val buf = new java.lang.StringBuilder(length)
+      var code = -1
+      var rest = 0
+      def flush()
+      {
+        if (code != -1) {
+          if (rest == 0) buf.appendCodePoint(code)
+          else buf.append('\uFFFD')
+          code = -1
+          rest = 0
+        }
+      }
+      def init(x: Int, n: Int)
+      {
+        flush()
+        code = x
+        rest = n
+      }
+      def push(x: Int)
+      {
+        if (rest <= 0) init(x, -1)
+        else {
+          code <<= 6
+          code += x
+          rest -= 1
+        }
+      }
+      for (i <- 0 until length) {
+        val c = charAt(i)
+        if (c < 128) { flush(); buf.append(c) }
+        else if ((c & 0xC0) == 0x80) push(c & 0x3F)
+        else if ((c & 0xE0) == 0xC0) init(c & 0x1F, 1)
+        else if ((c & 0xF0) == 0xE0) init(c & 0x0F, 2)
+        else if ((c & 0xF8) == 0xF0) init(c & 0x07, 3)
+      }
+      flush()
+      decode(buf.toString)
+    }
+  }
+
+  def decode_chars(decode: String => String,
+    buffer: Array[Byte], start: Int, end: Int): CharSequence =
+  {
+    require(0 <= start && start <= end && end <= buffer.length)
+    new Decode_Chars(decode, buffer, start, end)
+  }
+
+
   /* parsing */
 
   private def err(msg: String) = error("Malformed YXML: " + msg)
@@ -80,11 +143,12 @@
     /* parse chunks */
 
     stack = List((("", Nil), Nil))
-    for (chunk <- chunks(X, source) if chunk != "") {
-      if (chunk == Y_string) pop()
+    for (chunk <- chunks(X, source) if chunk.length != 0) {
+      if (chunk.length == 1 && chunk.charAt(0) == Y) pop()
       else {
         chunks(Y, chunk).toList match {
-          case "" :: name :: atts => push(name.toString.intern, atts.map(parse_attrib))
+          case ch :: name :: atts if ch.length == 0 =>
+            push(name.toString.intern, atts.map(parse_attrib))
           case txts => for (txt <- txts) add(XML.Text(txt.toString))
         }
       }