src/Pure/General/xml.scala
author wenzelm
Tue Jul 12 10:44:30 2011 +0200 (2011-07-12 ago)
changeset 43767 e0219ef7f84c
parent 43747 74a9e9c8d5e8
child 43768 d52ab827d62b
permissions -rw-r--r--
tuned XML modules;
wenzelm@27931
     1
/*  Title:      Pure/General/xml.scala
wenzelm@27931
     2
    Author:     Makarius
wenzelm@27931
     3
wenzelm@27947
     4
Simple XML tree values.
wenzelm@27931
     5
*/
wenzelm@27931
     6
wenzelm@27931
     7
package isabelle
wenzelm@27931
     8
wenzelm@43520
     9
import java.lang.System
wenzelm@34108
    10
import java.util.WeakHashMap
wenzelm@34108
    11
import java.lang.ref.WeakReference
wenzelm@34108
    12
import javax.xml.parsers.DocumentBuilderFactory
wenzelm@34108
    13
wenzelm@38446
    14
import scala.actors.Actor._
wenzelm@38446
    15
wenzelm@27947
    16
wenzelm@29203
    17
object XML
wenzelm@29203
    18
{
wenzelm@43767
    19
  /** XML trees **/
wenzelm@43767
    20
wenzelm@27947
    21
  /* datatype representation */
wenzelm@27947
    22
wenzelm@27931
    23
  type Attributes = List[(String, String)]
wenzelm@27931
    24
wenzelm@38268
    25
  /** Common supertype of XML tree nodes; `toString` renders the node via string_of_tree. */
  sealed abstract class Tree { override def toString = string_of_tree(this) }
wenzelm@38230
    26
  /** Element node: markup (name and properties) together with its child trees. */
  case class Elem(markup: Markup, body: List[Tree]) extends Tree
wenzelm@29204
    27
  /** Text node; the content is escaped only when rendered by string_of_body. */
  case class Text(content: String) extends Tree
wenzelm@29203
    28
wenzelm@38230
    29
  /** Element with the given name, no properties, and the given body. */
  def elem(name: String, body: List[Tree]): Elem = Elem(Markup(name, Nil), body)

  /** Element with the given name, no properties, and an empty body. */
  def elem(name: String): Elem = elem(name, Nil)
wenzelm@33999
    31
wenzelm@38267
    32
  /** A sequence of sibling trees, e.g. the children of an element. */
  type Body = List[Tree]
wenzelm@38267
    33
wenzelm@29203
    34
wenzelm@29203
    35
  /* string representation */
wenzelm@29203
    36
wenzelm@38268
    37
  /** Render a body as XML source text.
    *
    * Text nodes and attribute values have `<`, `>`, `&`, `"` and `'` replaced
    * by the corresponding entity references.  Elements with an empty body are
    * written in the self-closing form `<name .../>`.
    */
  def string_of_body(body: Body): String =
  {
    val out = new StringBuilder

    // append one character, replacing XML special characters by entities
    def escape(c: Char): Unit =
      c match {
        case '<' => out ++= "&lt;"
        case '>' => out ++= "&gt;"
        case '&' => out ++= "&amp;"
        case '"' => out ++= "&quot;"
        case '\'' => out ++= "&apos;"
        case _ => out += c
      }

    // a null string cannot be traversed, so it is handed to the builder as-is
    def text(txt: String): Unit =
      if (txt == null) out ++= txt
      else txt.iterator.foreach(escape)

    def attribute(p: (String, String)): Unit =
    {
      out ++= " "
      out ++= p._1
      out ++= "=\""
      text(p._2)
      out ++= "\""
    }

    // element name followed by its attributes (shared by both tag forms)
    def tag_interior(markup: Markup): Unit =
    {
      out ++= markup.name
      markup.properties.foreach(attribute)
    }

    def tree(t: Tree): Unit =
      t match {
        case Text(txt) => text(txt)
        case Elem(markup, children) =>
          out ++= "<"
          tag_interior(markup)
          if (children.isEmpty) out ++= "/>"
          else {
            out ++= ">"
            children.foreach(tree)
            out ++= "</"
            out ++= markup.name
            out ++= ">"
          }
      }

    body.foreach(tree)
    out.toString
  }
wenzelm@29203
    69
wenzelm@38268
    70
  /** Render a single tree by treating it as a one-element body. */
  def string_of_tree(tree: XML.Tree): String = string_of_body(tree :: Nil)
wenzelm@27941
    71
wenzelm@27941
    72
wenzelm@38484
    73
  /* text content */
wenzelm@27941
    74
wenzelm@38484
    75
  /** Text pieces of a tree as a stream: a text node yields its content,
    * an element yields the pieces of its body.
    */
  def content_stream(tree: Tree): Stream[String] =
    tree match {
      case Text(content) => Stream(content)
      case Elem(_, body) => content_stream(body)
    }

  /** Text pieces of all trees of a body, concatenated in list order. */
  def content_stream(body: Body): Stream[String] =
  {
    val trees: Stream[Tree] = body.toStream
    trees.flatten((t: Tree) => content_stream(t))
  }
wenzelm@27941
    82
wenzelm@38484
    83
  /** Text pieces of a tree, as an iterator over content_stream(tree). */
  def content(tree: Tree): Iterator[String] = content_stream(tree).iterator
wenzelm@43747
    84
  /** Text pieces of a body, as an iterator over content_stream(body). */
  def content(body: Body): Iterator[String] = content_stream(body).iterator
wenzelm@27941
    85
wenzelm@27947
    86
wenzelm@38446
    87
  /* pipe-lined cache for partial sharing */
wenzelm@34108
    88
wenzelm@43745
    89
  /** Cache for sharing equal strings, markup and trees.
    *
    * All table access happens inside a single actor: the public methods send
    * a request message carrying the value and a continuation `f`, and the
    * actor applies `f` to the cached representative.
    *
    * @param initial_size  initial capacity of the underlying weak hash table
    * @param max_string    strings longer than this are returned without being stored
    */
  class Cache(initial_size: Int = 131071, max_string: Int = 100)
  {
    private val cache_actor = actor
    {
      // each key maps to a weak reference to itself, so entries do not keep values alive
      val table = new WeakHashMap[Any, WeakReference[Any]](initial_size)

      // stored representative equal to x, unless absent or already cleared
      def lookup[A](x: A): Option[A] =
        table.get(x) match {
          case null => None
          case ref => Option(ref.asInstanceOf[WeakReference[A]].get)
        }

      // register x as its own representative and return it
      def store[A](x: A): A =
      {
        val ref = new WeakReference[Any](x)
        table.put(x, ref)
        x
      }

      // fresh copy of the characters of s
      // NOTE(review): presumably avoids retaining a larger shared backing array -- confirm
      def trim_bytes(s: String): String = new String(s.toCharArray)

      def cache_string(x: String): String =
        lookup(x).getOrElse {
          val z = trim_bytes(x)
          if (z.length <= max_string) store(z) else z
        }

      // property names are interned, property values go through cache_string
      def cache_props(x: List[(String, String)]): List[(String, String)] =
        if (x.isEmpty) x
        else
          lookup(x).getOrElse(
            store(x.map(p => {
              val name = trim_bytes(p._1).intern
              val value = cache_string(p._2)
              (name, value)
            })))

      def cache_markup(x: Markup): Markup =
        lookup(x).getOrElse {
          val Markup(name, props) = x
          store(Markup(cache_string(name), cache_props(props)))
        }

      def cache_tree(x: XML.Tree): XML.Tree =
        lookup(x).getOrElse(
          x match {
            case XML.Text(text) => store(XML.Text(cache_string(text)))
            case XML.Elem(markup, body) =>
              store(XML.Elem(cache_markup(markup), cache_body(body)))
          })

      // the rebuilt list itself is not stored, only its elements are
      def cache_body(x: XML.Body): XML.Body =
        if (x.isEmpty) x
        else lookup(x).getOrElse(x.map(cache_tree(_)))

      // main loop
      loop {
        react {
          case Cache_String(x, f) => f(cache_string(x))
          case Cache_Markup(x, f) => f(cache_markup(x))
          case Cache_Tree(x, f) => f(cache_tree(x))
          case Cache_Body(x, f) => f(cache_body(x))
          case bad => System.err.println("XML.cache_actor: ignoring bad input " + bad)
        }
      }
    }

    // request messages: value to be cached plus continuation for the result
    private case class Cache_String(x: String, f: String => Unit)
    private case class Cache_Markup(x: Markup, f: Markup => Unit)
    private case class Cache_Tree(x: XML.Tree, f: XML.Tree => Unit)
    private case class Cache_Body(x: XML.Body, f: XML.Body => Unit)

    // main methods: post a request to the actor; f is applied by the actor later
    def cache_string(x: String)(f: String => Unit): Unit = { cache_actor ! Cache_String(x, f) }
    def cache_markup(x: Markup)(f: Markup => Unit): Unit = { cache_actor ! Cache_Markup(x, f) }
    def cache_tree(x: XML.Tree)(f: XML.Tree => Unit): Unit = { cache_actor ! Cache_Tree(x, f) }
    def cache_body(x: XML.Body)(f: XML.Body => Unit): Unit = { cache_actor ! Cache_Body(x, f) }
  }
wenzelm@34108
   177
wenzelm@34108
   178
wenzelm@43767
   179
wenzelm@43767
   180
  /** document object model (W3C DOM) **/
wenzelm@27948
   181
wenzelm@34871
   182
  /** XML tree attached to a DOM node as user data under the key
    * `Markup.Data.name`, if there is one.  Absent user data, or user data
    * that is not an XML.Tree, yields None.
    */
  def get_data(node: org.w3c.dom.Node): Option[XML.Tree] =
    Option(node.getUserData(Markup.Data.name)) collect { case tree: XML.Tree => tree }
wenzelm@34047
   187
wenzelm@34871
   188
  /** Convert an XML tree into a DOM node owned by `doc`.
    *
    * A `Markup.Data` element with exactly two children `(data, t)` is not
    * turned into a DOM element: `t` is converted instead and `data` is
    * attached to the resulting node as user data.  Any other element named
    * like `Markup.Data` is rejected as malformed.
    */
  def document_node(doc: org.w3c.dom.Document, tree: Tree): org.w3c.dom.Node =
  {
    def build(tr: Tree): org.w3c.dom.Node =
      tr match {
        case Text(txt) => doc.createTextNode(txt)
        case Elem(Markup.Data, List(data, t)) =>
          val result = build(t)
          result.setUserData(Markup.Data.name, data, null)
          result
        case Elem(Markup(name, atts), ts) =>
          if (name == Markup.Data.name)
            error("Malformed data element: " + tr.toString)
          val element = doc.createElement(name)
          for ((a, x) <- atts) element.setAttribute(a, x)
          ts.foreach(t => element.appendChild(build(t)))
          element
      }
    build(tree)
  }
wenzelm@43767
   206
wenzelm@43767
   207
wenzelm@43767
   208
wenzelm@43767
   209
  /** XML as data representation language **/
wenzelm@43767
   210
wenzelm@43767
   211
  /** Thrown by Decode when atom text `s` is not a valid long/int/bool/unit value. */
  class XML_Atom(s: String) extends Exception(s)
wenzelm@43767
   212
  /** Thrown by Decode when a body does not have the expected structure (no message is set). */
  class XML_Body(body: XML.Body) extends Exception
wenzelm@43767
   213
wenzelm@43767
   214
  /** Encoders from values to XML bodies; the counterpart of Decode. */
  object Encode
  {
    type T[A] = A => XML.Body


    /* basic values */

    private def long_atom(i: Long): String = java.lang.Long.toString(i)

    private def int_atom(i: Int): String = java.lang.Integer.toString(i)

    private def bool_atom(b: Boolean): String =
      b match {
        case true => "1"
        case false => "0"
      }

    private def unit_atom(u: Unit): String = ""


    /* structural nodes */

    // anonymous wrapper element named ":" without properties
    private def node(ts: XML.Body): XML.Tree = XML.Elem(Markup(":", Nil), ts)

    // wrapper element whose name is the decimal variant tag
    private def tagged(tag: Int, ts: XML.Body): XML.Tree =
      XML.Elem(Markup(int_atom(tag), Nil), ts)


    /* representation of standard types */

    // properties become the attributes of a single empty ":" element
    val properties: T[List[(String, String)]] =
      (props => XML.Elem(Markup(":", props), Nil) :: Nil)

    // the empty string is represented by the empty body
    val string: T[String] = (s => if (!s.isEmpty) XML.Text(s) :: Nil else Nil)

    val long: T[Long] = (x => string(long_atom(x)))

    val int: T[Int] = (x => string(int_atom(x)))

    val bool: T[Boolean] = (x => string(bool_atom(x)))

    val unit: T[Unit] = (x => string(unit_atom(x)))

    def pair[A, B](f: T[A], g: T[B]): T[(A, B)] =
      (x: (A, B)) => {
        val first = node(f(x._1))
        val second = node(g(x._2))
        List(first, second)
      }

    def triple[A, B, C](f: T[A], g: T[B], h: T[C]): T[(A, B, C)] =
      (x: (A, B, C)) => {
        val first = node(f(x._1))
        val second = node(g(x._2))
        val third = node(h(x._3))
        List(first, second, third)
      }

    // one ":" node per list element
    def list[A](f: T[A]): T[List[A]] =
      (xs => for (x <- xs) yield node(f(x)))

    // None is the empty body, Some(x) a single ":" node
    def option[A](f: T[A]): T[Option[A]] =
      (opt: Option[A]) =>
        opt match {
          case Some(x) => node(f(x)) :: Nil
          case None => Nil
        }

    // The first encoder defined at x is used; its list position becomes the tag.
    // Throws NoSuchElementException if no encoder is defined at x.
    def variant[A](fs: List[PartialFunction[A, XML.Body]]): T[A] =
      (x: A) => {
        val (encode, tag) = fs.zipWithIndex.find(p => p._1.isDefinedAt(x)).get
        tagged(tag, encode(x)) :: Nil
      }
  }
wenzelm@43767
   275
wenzelm@43767
   276
  /** Decoders from XML bodies back to values; the counterpart of Encode.
    *
    * Malformed input is reported uniformly: XML_Atom for atom text that is
    * not a valid basic value, XML_Body for a body of unexpected structure.
    */
  object Decode
  {
    type T[A] = XML.Body => A


    /* basic values */

    private def long_atom(s: String): Long =
      try { java.lang.Long.parseLong(s) }
      catch { case _: NumberFormatException => throw new XML_Atom(s) }

    private def int_atom(s: String): Int =
      try { Integer.parseInt(s) }
      catch { case _: NumberFormatException => throw new XML_Atom(s) }

    private def bool_atom(s: String): Boolean =
      if (s == "1") true
      else if (s == "0") false
      else throw new XML_Atom(s)

    private def unit_atom(s: String): Unit =
      if (s == "") () else throw new XML_Atom(s)


    /* structural nodes */

    // unwrap an anonymous ":" element without properties
    private def node(t: XML.Tree): XML.Body =
      t match {
        case XML.Elem(Markup(":", Nil), ts) => ts
        case _ => throw new XML_Body(List(t))
      }

    // unwrap a property-less element whose name is a decimal variant tag
    private def tagged(t: XML.Tree): (Int, XML.Body) =
      t match {
        case XML.Elem(Markup(s, Nil), ts) => (int_atom(s), ts)
        case _ => throw new XML_Body(List(t))
      }


    /* representation of standard types */

    val properties: T[List[(String, String)]] =
    {
      case List(XML.Elem(Markup(":", props), Nil)) => props
      case ts => throw new XML_Body(ts)
    }

    // the empty body stands for the empty string
    val string: T[String] =
    {
      case Nil => ""
      case List(XML.Text(s)) => s
      case ts => throw new XML_Body(ts)
    }

    val long: T[Long] = (x => long_atom(string(x)))

    val int: T[Int] = (x => int_atom(string(x)))

    val bool: T[Boolean] = (x => bool_atom(string(x)))

    val unit: T[Unit] = (x => unit_atom(string(x)))

    def pair[A, B](f: T[A], g: T[B]): T[(A, B)] =
    {
      case List(t1, t2) => (f(node(t1)), g(node(t2)))
      case ts => throw new XML_Body(ts)
    }

    def triple[A, B, C](f: T[A], g: T[B], h: T[C]): T[(A, B, C)] =
    {
      case List(t1, t2, t3) => (f(node(t1)), g(node(t2)), h(node(t3)))
      case ts => throw new XML_Body(ts)
    }

    def list[A](f: T[A]): T[List[A]] =
      (ts => ts.map(t => f(node(t))))

    def option[A](f: T[A]): T[Option[A]] =
    {
      case Nil => None
      case List(t) => Some(f(node(t)))
      case ts => throw new XML_Body(ts)
    }

    /** Decode a tagged variant: the tag selects the decoder by list position.
      *
      * Throws XML_Body if the body is not a single tagged element or if the
      * tag does not denote a position in `fs`; throws XML_Atom if the element
      * name is not an integer.
      */
    def variant[A](fs: List[T[A]]): T[A] =
    {
      case List(t) =>
        val (tag, ts) = tagged(t)
        // The tag comes from the input: an unknown or negative tag is malformed
        // XML and must not surface as IndexOutOfBoundsException from the list.
        if (0 <= tag && tag < fs.length) fs(tag)(ts)
        else throw new XML_Body(List(t))
      case ts => throw new XML_Body(ts)
    }
  }
wenzelm@27931
   368
}