src/Pure/PIDE/xml.scala
author wenzelm
Sun Sep 04 15:21:50 2011 +0200 (2011-09-04 ago)
changeset 44698 0385292321a0
parent 44697 src/Pure/General/xml.scala@b99dfee76538
child 44704 528d635ef6f0
permissions -rw-r--r--
moved XML/YXML to src/Pure/PIDE;
tuned comments;
wenzelm@44698
     1
/*  Title:      Pure/PIDE/xml.scala
wenzelm@27931
     2
    Author:     Makarius
wenzelm@27931
     3
wenzelm@44698
     4
Untyped XML trees and basic data representation.
wenzelm@27931
     5
*/
wenzelm@27931
     6
wenzelm@27931
     7
package isabelle
wenzelm@27931
     8
wenzelm@43520
     9
import java.lang.System
wenzelm@34108
    10
import java.util.WeakHashMap
wenzelm@34108
    11
import java.lang.ref.WeakReference
wenzelm@34108
    12
import javax.xml.parsers.DocumentBuilderFactory
wenzelm@34108
    13
wenzelm@38446
    14
import scala.actors.Actor._
wenzelm@43778
    15
import scala.collection.mutable
wenzelm@38446
    16
wenzelm@27947
    17
wenzelm@29203
    18
object XML
wenzelm@29203
    19
{
wenzelm@43767
    20
  /** XML trees **/
wenzelm@43767
    21
wenzelm@27947
    22
  /* datatype representation */
wenzelm@27947
    23
wenzelm@43780
    24
  /** Attributes of an XML element; an alias for `Properties.T`. */
  type Attributes = Properties.T
wenzelm@27931
    25
wenzelm@38268
    26
  /** Common supertype of all XML nodes; `toString` renders the node via `string_of_tree`. */
  sealed abstract class Tree
  {
    override def toString: String = string_of_tree(this)
  }

  /** Element node: markup (name and properties) around a list of child nodes. */
  case class Elem(markup: Markup, body: List[Tree]) extends Tree

  /** Text node carrying plain character content. */
  case class Text(content: String) extends Tree
wenzelm@29203
    29
wenzelm@38230
    30
  /** Element with the given name and children, and without attributes. */
  def elem(name: String, body: List[Tree]): Elem =
  {
    val markup = Markup(name, Nil)
    Elem(markup, body)
  }

  /** Empty element with the given name: no attributes and no children. */
  def elem(name: String): Elem = elem(name, Nil)
wenzelm@33999
    32
wenzelm@38267
    33
  /** A sequence of XML nodes, e.g. the children of an element. */
  type Body = List[Tree]
wenzelm@38267
    34
wenzelm@29203
    35
wenzelm@29203
    36
  /* string representation */
wenzelm@29203
    37
wenzelm@38268
    38
  /**
   * Renders a sequence of XML nodes as a single string.
   *
   * Elements without children are written in self-closing form (`<name/>`);
   * all other elements get separate opening and closing tags.  Text content
   * and attribute values have the characters `<`, `>`, `&`, `"` and `'`
   * replaced by the corresponding XML entities.
   */
  def string_of_body(body: Body): String =
  {
    val out = new StringBuilder

    // Escaped output of text; a null string is handed to the builder as is
    // instead of being traversed character by character.
    def escape(str: String): Unit =
    {
      if (str == null) out ++= str
      else
        str.foreach {
          case '<' => out ++= "&lt;"
          case '>' => out ++= "&gt;"
          case '&' => out ++= "&amp;"
          case '"' => out ++= "&quot;"
          case '\'' => out ++= "&apos;"
          case c => out += c
        }
    }

    // One attribute in the form ` name="escaped value"`.
    def attribute(entry: (String, String)): Unit =
    {
      out ++= " "
      out ++= entry._1
      out ++= "=\""
      escape(entry._2)
      out ++= "\""
    }

    // Element name followed by all of its attributes (without angle brackets).
    def tag_content(markup: Markup): Unit =
    {
      out ++= markup.name
      markup.properties.foreach(attribute)
    }

    def render(node: Tree): Unit =
      node match {
        case Text(txt) => escape(txt)
        case Elem(markup, Nil) =>
          out ++= "<"
          tag_content(markup)
          out ++= "/>"
        case Elem(markup, children) =>
          out ++= "<"
          tag_content(markup)
          out ++= ">"
          children.foreach(render)
          out ++= "</"
          out ++= markup.name
          out ++= ">"
      }

    body.foreach(render)
    out.toString
  }
wenzelm@29203
    70
wenzelm@38268
    71
  /** Renders a single XML node as a string, by treating it as a one-element body. */
  def string_of_tree(tree: XML.Tree): String =
    string_of_body(tree :: Nil)
wenzelm@27941
    72
wenzelm@27941
    73
wenzelm@38484
    74
  /* text content */
wenzelm@27941
    75
wenzelm@38484
    76
  /** Lazily enumerates the text fragments below a node, in document order; markup is dropped. */
  def content_stream(tree: Tree): Stream[String] =
    tree match {
      case Text(txt) => Stream(txt)
      case Elem(_, children) => content_stream(children)
    }

  /** Lazily enumerates the text fragments of all nodes of a body, in document order. */
  def content_stream(body: Body): Stream[String] =
  {
    val nodes = body.toStream
    nodes.flatten((node: Tree) => content_stream(node))
  }
wenzelm@27941
    83
wenzelm@38484
    84
  /** Iterator over the text fragments below a node (see `content_stream`). */
  def content(tree: Tree): Iterator[String] =
  {
    val fragments = content_stream(tree)
    fragments.iterator
  }

  /** Iterator over the text fragments of a body (see `content_stream`). */
  def content(body: Body): Iterator[String] =
  {
    val fragments = content_stream(body)
    fragments.iterator
  }
wenzelm@27941
    86
wenzelm@27947
    87
wenzelm@38446
    88
  /* pipe-lined cache for partial sharing */
wenzelm@34108
    89
wenzelm@43745
    90
  /**
   * Cache that replaces strings, markup and XML trees by previously seen
   * equal values, so that equal sub-structures can be shared.
   *
   * All work happens inside a private actor: the public `cache_*` methods
   * only send a request message and return immediately; the result is
   * delivered by calling the continuation `f` from within the actor's
   * message handler.
   *
   * @param initial_size  initial capacity of the underlying weak hash table
   * @param max_string    strings longer than this are copied but not stored
   */
  class Cache(initial_size: Int = 131071, max_string: Int = 100)
  {
    private val cache_actor = actor
    {
      // Maps a value to a weak reference of itself; both key and value are
      // weak, so the table does not keep cached values alive by itself.
      val table = new WeakHashMap[Any, WeakReference[Any]](initial_size)

      // Previously stored value equal to x, if it is still present.
      def lookup[A](x: A): Option[A] =
        table.get(x) match {
          case null => None
          case ref => Option(ref.asInstanceOf[WeakReference[A]].get)
        }

      // Registers x as the representative of its equality class.
      def store[A](x: A): A =
      {
        val ref = new WeakReference[Any](x)
        table.put(x, ref)
        x
      }

      // Fresh copy of the string, backed by its own character array.
      def trim_bytes(s: String): String = new String(s.toCharArray)

      def cache_string(x: String): String =
        lookup(x).getOrElse {
          val copy = trim_bytes(x)
          if (copy.length <= max_string) store(copy) else copy
        }

      // Property names are interned, property values go through the string cache.
      def cache_props(x: Properties.T): Properties.T =
        if (x.isEmpty) x
        else
          lookup(x).getOrElse(
            store(x.map(entry => (trim_bytes(entry._1).intern, cache_string(entry._2)))))

      def cache_markup(x: Markup): Markup =
        lookup(x).getOrElse {
          x match {
            case Markup(name, props) =>
              store(Markup(cache_string(name), cache_props(props)))
          }
        }

      def cache_tree(x: XML.Tree): XML.Tree =
        lookup(x).getOrElse {
          x match {
            case XML.Text(text) => store(XML.Text(cache_string(text)))
            case XML.Elem(markup, body) =>
              store(XML.Elem(cache_markup(markup), cache_body(body)))
          }
        }

      // The resulting list itself is not stored, only its elements are cached.
      def cache_body(x: XML.Body): XML.Body =
        if (x.isEmpty) x
        else lookup(x).getOrElse(x.map(t => cache_tree(t)))

      // main loop
      loop {
        react {
          case Cache_String(x, f) => f(cache_string(x))
          case Cache_Markup(x, f) => f(cache_markup(x))
          case Cache_Tree(x, f) => f(cache_tree(x))
          case Cache_Body(x, f) => f(cache_body(x))
          case Cache_Ignore(x, f) => f(x)
          case bad => System.err.println("XML.cache_actor: ignoring bad input " + bad)
        }
      }
    }

    // request messages: value to be cached plus continuation for the result
    private case class Cache_String(x: String, f: String => Unit)
    private case class Cache_Markup(x: Markup, f: Markup => Unit)
    private case class Cache_Tree(x: XML.Tree, f: XML.Tree => Unit)
    private case class Cache_Body(x: XML.Body, f: XML.Body => Unit)
    private case class Cache_Ignore[A](x: A, f: A => Unit)

    // main methods

    /** Requests the cached version of a string; `f` receives the result. */
    def cache_string(x: String)(f: String => Unit): Unit =
      cache_actor ! Cache_String(x, f)

    /** Requests the cached version of a markup value; `f` receives the result. */
    def cache_markup(x: Markup)(f: Markup => Unit): Unit =
      cache_actor ! Cache_Markup(x, f)

    /** Requests the cached version of an XML tree; `f` receives the result. */
    def cache_tree(x: XML.Tree)(f: XML.Tree => Unit): Unit =
      cache_actor ! Cache_Tree(x, f)

    /** Requests the cached version of an XML body; `f` receives the result. */
    def cache_body(x: XML.Body)(f: XML.Body => Unit): Unit =
      cache_actor ! Cache_Body(x, f)

    /** Passes `x` through the cache actor unchanged; `f` receives `x` itself. */
    def cache_ignore[A](x: A)(f: A => Unit): Unit =
      cache_actor ! Cache_Ignore(x, f)
  }
wenzelm@34108
   181
wenzelm@34108
   182
wenzelm@43767
   183
wenzelm@43767
   184
  /** document object model (W3C DOM) **/
wenzelm@27948
   185
wenzelm@34871
   186
  /**
   * Retrieves the XML tree attached to a DOM node as user data under the key
   * `Markup.Data.name`; the result is `None` if nothing is attached or the
   * attached object is not an `XML.Tree`.
   */
  def get_data(node: org.w3c.dom.Node): Option[XML.Tree] =
    Option(node.getUserData(Markup.Data.name)).collect { case tree: XML.Tree => tree }
wenzelm@34047
   191
wenzelm@34871
   192
  /**
   * Converts an XML tree into a DOM node owned by `doc`.
   *
   * An element with markup `Markup.Data` and exactly two children `(data, t)`
   * is not turned into a DOM element: `t` is converted instead and `data` is
   * attached to the resulting node as user data (cf. `get_data`).  Any other
   * element carrying the data markup name is rejected via `error`.
   */
  def document_node(doc: org.w3c.dom.Document, tree: Tree): org.w3c.dom.Node =
  {
    def convert(source: Tree): org.w3c.dom.Node =
      source match {
        case Text(txt) => doc.createTextNode(txt)

        case Elem(Markup.Data, List(data, inner)) =>
          val target = convert(inner)
          target.setUserData(Markup.Data.name, data, null)
          target

        case Elem(Markup(name, atts), children) =>
          if (name == Markup.Data.name)
            error("Malformed data element: " + source.toString)
          val element = doc.createElement(name)
          for ((a, value) <- atts) element.setAttribute(a, value)
          children.foreach(child => element.appendChild(convert(child)))
          element
      }
    convert(tree)
  }
wenzelm@43767
   210
wenzelm@43767
   211
wenzelm@43767
   212
wenzelm@43767
   213
  /** XML as data representation language **/
wenzelm@43767
   214
wenzelm@43767
   215
  /** Raised when a string fails to decode as an atomic value (see `Decode.long_atom` etc.); the offending string is the exception message. */
  class XML_Atom(s: String) extends Exception(s)
wenzelm@43767
   216
  /** Raised by the `Decode` operations when an XML body does not have the expected shape; `body` is a plain constructor parameter and no message is passed to `Exception`. */
  class XML_Body(body: XML.Body) extends Exception
wenzelm@43767
   217
wenzelm@43767
   218
  /**
   * Encoders that turn typed values into untyped XML bodies.
   *
   * Structured values are wrapped into elements named ":" without
   * attributes; variants use the decimal tag number as element name and
   * carry their string arguments as attributes named "0", "1", ...
   */
  object Encode
  {
    /** An encoder for values of type `A`. */
    type T[A] = A => XML.Body


    /* atomic values */

    def long_atom(i: Long): String = i.toString

    def int_atom(i: Int): String = i.toString

    def bool_atom(b: Boolean): String = if (b) "1" else "0"

    def unit_atom(u: Unit): String = ""


    /* structural nodes */

    // Wraps a body into an anonymous ":" element.
    private def node(ts: XML.Body): XML.Tree = XML.Elem(Markup(":", Nil), ts)

    // Strings as attributes, named by their position in the list.
    private def vector(xs: List[String]): XML.Attributes =
      xs.zipWithIndex.map(p => (int_atom(p._2), p._1))

    // Element named by the tag number, with string arguments and body.
    private def tagged(tag: Int, data: (List[String], XML.Body)): XML.Tree =
      XML.Elem(Markup(int_atom(tag), vector(data._1)), data._2)


    /* representation of standard types */

    /** Properties become the attributes of a single childless ":" element. */
    val properties: T[Properties.T] =
      (props => List(XML.Elem(Markup(":", props), Nil)))

    /** The empty string becomes the empty body, any other string a single text node. */
    val string: T[String] = (s => if (s.isEmpty) Nil else List(XML.Text(s)))

    val long: T[Long] = (x => string(long_atom(x)))

    val int: T[Int] = (x => string(int_atom(x)))

    val bool: T[Boolean] = (x => string(bool_atom(x)))

    val unit: T[Unit] = (x => string(unit_atom(x)))

    /** Both components, each wrapped into its own ":" element. */
    def pair[A, B](f: T[A], g: T[B]): T[(A, B)] =
      (x => List(node(f(x._1)), node(g(x._2))))

    /** All three components, each wrapped into its own ":" element. */
    def triple[A, B, C](f: T[A], g: T[B], h: T[C]): T[(A, B, C)] =
      (x => List(node(f(x._1)), node(g(x._2)), node(h(x._3))))

    /** One ":" element per list element, in order. */
    def list[A](f: T[A]): T[List[A]] =
      (xs => xs.map((x: A) => node(f(x))))

    /** `None` becomes the empty body, `Some(x)` a single ":" element. */
    def option[A](f: T[A]): T[Option[A]] =
    {
      case None => Nil
      case Some(x) => List(node(f(x)))
    }

    /**
     * Encodes a value by the first partial function in `fs` that is defined
     * for it; the position of that function within `fs` becomes the tag.
     *
     * @throws NoSuchElementException if no function in `fs` is defined for
     *   the value (same exception type as before, now with a message that
     *   names the offending value instead of the generic `None.get`)
     */
    def variant[A](fs: List[PartialFunction[A, (List[String], XML.Body)]]): T[A] =
    {
      case x =>
        fs.iterator.zipWithIndex.find(p => p._1.isDefinedAt(x)) match {
          case Some((f, tag)) => List(tagged(tag, f(x)))
          case None =>
            throw new NoSuchElementException(
              "XML.Encode.variant: no matching case for " + x)
        }
    }
  }
wenzelm@43767
   282
wenzelm@43767
   283
  /**
   * Decoders that recover typed values from untyped XML bodies, mirroring
   * the operations of `Encode`.
   *
   * Malformed atomic values are reported via `XML_Atom`, bodies of
   * unexpected shape via `XML_Body`.
   */
  object Decode
  {
    /** A decoder producing values of type `A` from a body. */
    type T[A] = XML.Body => A
    /** A decoder for one case of a variant: string arguments plus body. */
    type V[A] = (List[String], XML.Body) => A


    /* atomic values */

    def long_atom(s: String): Long =
      try { java.lang.Long.parseLong(s) }
      catch { case _: NumberFormatException => throw new XML_Atom(s) }

    def int_atom(s: String): Int =
      try { Integer.parseInt(s) }
      catch { case _: NumberFormatException => throw new XML_Atom(s) }

    def bool_atom(s: String): Boolean =
      s match {
        case "1" => true
        case "0" => false
        case _ => throw new XML_Atom(s)
      }

    def unit_atom(s: String): Unit =
      if (s != "") throw new XML_Atom(s)


    /* structural nodes */

    // Unwraps an anonymous ":" element without attributes.
    private def node(t: XML.Tree): XML.Body =
      t match {
        case XML.Elem(Markup(":", Nil), body) => body
        case _ => throw new XML_Body(List(t))
      }

    // Attribute values in order; the names must count up from 0 without gaps.
    private def vector(atts: XML.Attributes): List[String] =
    {
      val values = new mutable.ListBuffer[String]
      var expected = 0
      for ((a, x) <- atts) {
        if (int_atom(a) != expected) throw new XML_Atom(a)
        values += x
        expected += 1
      }
      values.toList
    }

    // Tag number (from the element name) with string arguments and body.
    private def tagged(t: XML.Tree): (Int, (List[String], XML.Body)) =
      t match {
        case XML.Elem(Markup(name, atts), body) =>
          val tag = int_atom(name)
          (tag, (vector(atts), body))
        case _ => throw new XML_Body(List(t))
      }


    /* representation of standard types */

    /** Expects a single childless ":" element and yields its attributes. */
    val properties: T[Properties.T] =
    {
      case List(XML.Elem(Markup(":", props), Nil)) => props
      case other => throw new XML_Body(other)
    }

    /** The empty body yields the empty string, a single text node its content. */
    val string: T[String] =
    {
      case List(XML.Text(s)) => s
      case Nil => ""
      case other => throw new XML_Body(other)
    }

    val long: T[Long] = (body => long_atom(string(body)))

    val int: T[Int] = (body => int_atom(string(body)))

    val bool: T[Boolean] = (body => bool_atom(string(body)))

    val unit: T[Unit] = (body => unit_atom(string(body)))

    /** Expects exactly two ":" elements, decoded by `f` and `g` respectively. */
    def pair[A, B](f: T[A], g: T[B]): T[(A, B)] =
    {
      case List(first, second) =>
        (f(node(first)), g(node(second)))
      case other => throw new XML_Body(other)
    }

    /** Expects exactly three ":" elements, decoded by `f`, `g` and `h` respectively. */
    def triple[A, B, C](f: T[A], g: T[B], h: T[C]): T[(A, B, C)] =
    {
      case List(first, second, third) =>
        (f(node(first)), g(node(second)), h(node(third)))
      case other => throw new XML_Body(other)
    }

    /** Decodes every ":" element of the body by `f`, in order. */
    def list[A](f: T[A]): T[List[A]] =
      (body => body.map(item => f(node(item))))

    /** The empty body yields `None`, a single ":" element yields `Some`. */
    def option[A](f: T[A]): T[Option[A]] =
    {
      case Nil => None
      case List(item) => Some(f(node(item)))
      case other => throw new XML_Body(other)
    }

    /**
     * Expects a single tagged element and applies the decoder found at the
     * position given by the tag; a tag outside the range of `fs` is reported
     * as `XML_Body`.
     */
    def variant[A](fs: List[V[A]]): T[A] =
    {
      case List(t) =>
        tagged(t) match {
          case (tag, (xs, ts)) =>
            val decode =
              if (fs.isDefinedAt(tag)) fs(tag)
              else throw new XML_Body(List(t))
            decode(xs, ts)
        }
      case other => throw new XML_Body(other)
    }
  }
wenzelm@27931
   390
}