src/Pure/PIDE/xml.scala
author wenzelm
Thu Sep 20 10:43:04 2012 +0200 (2012-09-20 ago)
changeset 49465 76ecbc7f3683
parent 49417 c5a8592fb5d3
child 49466 99ed1f422635
permissions -rw-r--r--
tuned;
wenzelm@44698
     1
/*  Title:      Pure/PIDE/xml.scala
wenzelm@45673
     2
    Module:     PIDE
wenzelm@27931
     3
    Author:     Makarius
wenzelm@27931
     4
wenzelm@44698
     5
Untyped XML trees and basic data representation.
wenzelm@27931
     6
*/
wenzelm@27931
     7
wenzelm@27931
     8
package isabelle
wenzelm@27931
     9
wenzelm@43520
    10
import java.lang.System
wenzelm@34108
    11
import java.util.WeakHashMap
wenzelm@34108
    12
import java.lang.ref.WeakReference
wenzelm@34108
    13
import javax.xml.parsers.DocumentBuilderFactory
wenzelm@34108
    14
wenzelm@27947
    15
wenzelm@29203
    16
object XML
wenzelm@29203
    17
{
wenzelm@43767
    18
  /** XML trees **/
wenzelm@43767
    19
wenzelm@27947
    20
  /* datatype representation */
wenzelm@27947
    21
wenzelm@43780
    22
  // attributes of an element are a plain property list
  type Attributes = Properties.T

  // A tree is either an element (markup plus child trees) or a text node.
  // Converting a tree to a string goes through string_of_tree.
  sealed abstract class Tree
  {
    override def toString: String = string_of_tree(this)
  }
  case class Elem(markup: Markup, body: List[Tree]) extends Tree
  case class Text(content: String) extends Tree

  // shorthand constructors for elements whose markup has no properties
  def elem(name: String, body: List[Tree]): Elem = Elem(Markup(name, Nil), body)
  def elem(name: String): Elem = elem(name, Nil)

  // a body is a sequence of sibling trees
  type Body = List[Tree]
wenzelm@38267
    32
wenzelm@29203
    33
wenzelm@29203
    34
  /* string representation */
wenzelm@29203
    35
wenzelm@38268
    36
  // Render a body as XML source text.  The characters < > & " ' are replaced
  // by their entity references, both in text nodes and in attribute values;
  // element and attribute names are emitted as they are.
  def string_of_body(body: Body): String =
  {
    val out = new StringBuilder

    // append a string with entity escaping; a null reference is passed to the
    // builder unchanged instead of being iterated over
    def escaped(str: String): Unit =
    {
      if (str == null) out ++= str
      else
        str.foreach {
          case '<' => out ++= "&lt;"
          case '>' => out ++= "&gt;"
          case '&' => out ++= "&amp;"
          case '"' => out ++= "&quot;"
          case '\'' => out ++= "&apos;"
          case ch => out += ch
        }
    }

    // element name followed by its attributes, each as: name="escaped value"
    def tag_content(markup: Markup): Unit =
    {
      out ++= markup.name
      for (prop <- markup.properties) {
        out ++= " "
        out ++= prop._1
        out ++= "=\""
        escaped(prop._2)
        out ++= "\""
      }
    }

    def render(t: Tree): Unit =
      t match {
        case Text(txt) => escaped(txt)
        case Elem(markup, children) =>
          out ++= "<"
          tag_content(markup)
          if (children.isEmpty) out ++= "/>"  // self-closing form for empty elements
          else {
            out ++= ">"
            children.foreach(render)
            out ++= "</"
            out ++= markup.name
            out ++= ">"
          }
      }

    body.foreach(render)
    out.toString
  }
wenzelm@29203
    68
wenzelm@38268
    69
  // a single tree is rendered as a one-element body
  def string_of_tree(tree: XML.Tree): String =
    string_of_body(tree :: Nil)
wenzelm@27941
    70
wenzelm@27941
    71
wenzelm@49416
    72
  /* content -- text and markup */
wenzelm@49416
    73
wenzelm@49416
    74
  // Concatenate the text nodes of a body in document order.  When record_markup
  // is set, every element additionally contributes its markup (as a childless
  // Elem) over the range of text produced by its children.
  private def make_content(body: Body, record_markup: Boolean): (String, Markup_Tree) =
  {
    val buffer = new StringBuilder
    var markups = Markup_Tree.empty

    def walk(t: Tree): Unit =
      t match {
        case Text(s) => buffer.append(s)
        case Elem(markup, children) =>
          // offsets are measured in the accumulated plain text
          val start = buffer.length
          children.foreach(walk)
          val stop = buffer.length
          if (record_markup) {
            val range = isabelle.Text.Range(start, stop)
            markups += isabelle.Text.Info(range, Elem(markup, Nil))
          }
      }

    body.foreach(walk)
    (buffer.toString, markups.reverse_markup)
  }
wenzelm@49416
    94
wenzelm@49416
    95
  // plain text together with the markup recorded over it
  def content_markup(body: Body): (String, Markup_Tree) =
    make_content(body, record_markup = true)

  // plain text only, without recording markup
  def content(body: Body): String =
    make_content(body, record_markup = false)._1

  def content(tree: Tree): String = content(List(tree))
wenzelm@27941
    98
wenzelm@27947
    99
wenzelm@44808
   100
wenzelm@44808
   101
  /** cache for partial sharing (weak table) **/
wenzelm@34108
   102
wenzelm@43745
   103
  // Table of previously seen values, used to replace structurally equal
  // strings, markup and trees by one shared instance.  Entries map a value to
  // a weak reference of itself inside a java.util.WeakHashMap.  The public
  // cache_* methods are synchronized on the cache instance.
  class Cache(initial_size: Int = 131071, max_string: Int = 100)
  {
    private val table = new WeakHashMap[Any, WeakReference[Any]](initial_size)

    // the stored instance equal to x, if there is one and its referent is still present
    private def lookup[A](x: A): Option[A] =
      Option(table.get(x)).flatMap(ref => Option(ref.get.asInstanceOf[A]))

    // register x as the shared instance for values equal to it
    private def store[A](x: A): A =
    {
      table.put(x, new WeakReference[Any](x))
      x
    }

    // fresh copy of the string, built from its own character array
    private def trim_bytes(s: String): String = new String(s.toCharArray)

    // strings longer than max_string are copied but not entered into the table
    private def _cache_string(x: String): String =
      lookup(x) getOrElse {
        val copy = trim_bytes(x)
        if (copy.length > max_string) copy else store(copy)
      }

    // property names are interned, property values go through the string cache
    private def _cache_props(x: Properties.T): Properties.T =
      if (x.isEmpty) x
      else
        lookup(x) getOrElse {
          store(x.map(p => (trim_bytes(p._1).intern, _cache_string(p._2))))
        }

    private def _cache_markup(x: Markup): Markup =
      lookup(x) getOrElse {
        val Markup(name, props) = x
        store(Markup(_cache_string(name), _cache_props(props)))
      }

    private def _cache_tree(x: XML.Tree): XML.Tree =
      lookup(x) getOrElse {
        x match {
          case XML.Text(text) => store(XML.Text(_cache_string(text)))
          case XML.Elem(markup, body) =>
            store(XML.Elem(_cache_markup(markup), _cache_body(body)))
        }
      }

    // the resulting list itself is not stored, only its member trees are
    private def _cache_body(x: XML.Body): XML.Body =
      if (x.isEmpty) x
      else
        lookup(x) getOrElse {
          for (t <- x) yield _cache_tree(t)
        }

    // main methods
    def cache_string(x: String): String = this.synchronized { _cache_string(x) }
    def cache_markup(x: Markup): Markup = this.synchronized { _cache_markup(x) }
    def cache_tree(x: XML.Tree): XML.Tree = this.synchronized { _cache_tree(x) }
    def cache_body(x: XML.Body): XML.Body = this.synchronized { _cache_body(x) }
  }
wenzelm@34108
   172
wenzelm@34108
   173
wenzelm@43767
   174
wenzelm@43767
   175
  /** document object model (W3C DOM) **/
wenzelm@27948
   176
wenzelm@34871
   177
  // the XML tree attached to a DOM node as user data under the key
  // Markup.Data.name; None if nothing is attached or it is not an XML.Tree
  def get_data(node: org.w3c.dom.Node): Option[XML.Tree] =
    Option(node.getUserData(Markup.Data.name)) match {
      case Some(tree: XML.Tree) => Some(tree)
      case _ => None
    }
wenzelm@34047
   182
wenzelm@34871
   183
  // Build a DOM node for the given tree, using doc as factory for elements and
  // text nodes.  An element with markup Markup.Data and exactly two children
  // (data, t) yields the node of t with data attached as user data; any other
  // element named Markup.Data.name is reported via error(...).
  def document_node(doc: org.w3c.dom.Document, tree: Tree): org.w3c.dom.Node =
  {
    def convert(tr: Tree): org.w3c.dom.Node =
      tr match {
        case Text(txt) => doc.createTextNode(txt)
        case Elem(Markup.Data, List(data, t)) =>
          val inner = convert(t)
          inner.setUserData(Markup.Data.name, data, null)
          inner
        case Elem(Markup(name, atts), ts) =>
          if (name == Markup.Data.name)
            error("Malformed data element: " + tr.toString)
          val element = doc.createElement(name)
          for ((att_name, att_value) <- atts) element.setAttribute(att_name, att_value)
          ts.foreach(t => element.appendChild(convert(t)))
          element
      }
    convert(tree)
  }
wenzelm@43767
   201
wenzelm@43767
   202
wenzelm@43767
   203
wenzelm@43767
   204
  /** XML as data representation language **/
wenzelm@43767
   205
wenzelm@43767
   206
  // Failure to decode an atomic value: thrown by the Decode atom functions
  // with the offending string, which also serves as the exception message.
  class XML_Atom(s: String) extends Exception(s)
wenzelm@43767
   207
  // Failure to decode a body: thrown by the Decode functions when a body does
  // not have the expected shape.  The body argument is neither stored nor
  // passed on as exception message.
  class XML_Body(body: XML.Body) extends Exception
wenzelm@43767
   208
wenzelm@43767
   209
  // Encoders from Scala values to XML bodies.  An encoder for type A is a
  // function A => XML.Body; encoders for compound types are assembled from
  // encoders of their components.
  object Encode
  {
    type T[A] = A => XML.Body


    /* atomic values */

    // decimal representation
    def long_atom(i: Long): String = i.toString

    def int_atom(i: Int): String = i.toString

    // "1" for true, "0" for false
    def bool_atom(b: Boolean): String = if (b) "1" else "0"

    // the unit value is the empty string
    def unit_atom(u: Unit): String = ""


    /* structural nodes */

    // wrap a body into an element named ":" without attributes
    private def node(ts: XML.Body): XML.Tree = XML.Elem(Markup(":", Nil), ts)

    // strings become attributes named by their position: "0", "1", ...
    private def vector(xs: List[String]): XML.Attributes =
      xs.zipWithIndex.map({ case (x, i) => (int_atom(i), x) })

    // element named by the tag number, with the strings of data as attributes
    // and the body of data as children
    private def tagged(tag: Int, data: (List[String], XML.Body)): XML.Tree =
      XML.Elem(Markup(int_atom(tag), vector(data._1)), data._2)


    /* representation of standard types */

    // properties are the attributes of a single childless ":" element
    val properties: T[Properties.T] =
      (props => List(XML.Elem(Markup(":", props), Nil)))

    // the empty string is the empty body, any other string a single text node
    val string: T[String] = (s => if (s.isEmpty) Nil else List(XML.Text(s)))

    val long: T[Long] = (x => string(long_atom(x)))

    val int: T[Int] = (x => string(int_atom(x)))

    val bool: T[Boolean] = (x => string(bool_atom(x)))

    val unit: T[Unit] = (x => string(unit_atom(x)))

    // each component is encoded separately and wrapped into its own node
    def pair[A, B](f: T[A], g: T[B]): T[(A, B)] =
      (x => List(node(f(x._1)), node(g(x._2))))

    def triple[A, B, C](f: T[A], g: T[B], h: T[C]): T[(A, B, C)] =
      (x => List(node(f(x._1)), node(g(x._2)), node(h(x._3))))

    // one node per list element
    def list[A](f: T[A]): T[List[A]] =
      (xs => xs.map((x: A) => node(f(x))))

    // None is the empty body, Some(x) a single node
    def option[A](f: T[A]): T[Option[A]] =
    {
      case None => Nil
      case Some(x) => List(node(f(x)))
    }

    // Encode a value by the first partial function in fs that is defined at
    // it; the position of that function in fs becomes the tag of the result.
    // Throws NoSuchElementException if no function in fs is defined at the value.
    def variant[A](fs: List[PartialFunction[A, (List[String], XML.Body)]]): T[A] =
    {
      case x =>
        fs.iterator.zipWithIndex.find(p => p._1.isDefinedAt(x)) match {
          case Some((f, tag)) => List(tagged(tag, f(x)))
          case None =>
            // same exception type as the former Option.get, but with a usable message
            throw new NoSuchElementException("XML.Encode.variant: no matching case")
        }
    }
  }
wenzelm@43767
   273
wenzelm@43767
   274
  // Decoders from XML bodies to Scala values.  A decoder for type A is a
  // function XML.Body => A.  Malformed atoms are reported as XML_Atom,
  // bodies of unexpected shape as XML_Body.
  object Decode
  {
    type T[A] = XML.Body => A
    type V[A] = (List[String], XML.Body) => A


    /* atomic values */

    def long_atom(s: String): Long =
      try { java.lang.Long.parseLong(s) }
      catch { case _: NumberFormatException => throw new XML_Atom(s) }

    def int_atom(s: String): Int =
      try { Integer.parseInt(s) }
      catch { case _: NumberFormatException => throw new XML_Atom(s) }

    // only "1" and "0" are accepted
    def bool_atom(s: String): Boolean =
      s match {
        case "1" => true
        case "0" => false
        case _ => throw new XML_Atom(s)
      }

    // only the empty string is accepted
    def unit_atom(s: String): Unit =
      if (s != "") throw new XML_Atom(s)


    /* structural nodes */

    // children of a ":" element without attributes
    private def node(t: XML.Tree): XML.Body =
      t match {
        case XML.Elem(Markup(":", Nil), body) => body
        case _ => throw new XML_Body(List(t))
      }

    // attribute values in order; every attribute name must be the decimal
    // representation of its position
    private def vector(atts: XML.Attributes): List[String] =
      atts.zipWithIndex.map {
        case ((a, x), i) => if (int_atom(a) == i) x else throw new XML_Atom(a)
      }

    // element name as tag number, attributes as strings, children as body
    private def tagged(t: XML.Tree): (Int, (List[String], XML.Body)) =
      t match {
        case XML.Elem(Markup(name, atts), body) => (int_atom(name), (vector(atts), body))
        case _ => throw new XML_Body(List(t))
      }


    /* representation of standard types */

    val properties: T[Properties.T] =
    {
      case List(XML.Elem(Markup(":", props), Nil)) => props
      case body => throw new XML_Body(body)
    }

    // the empty body is the empty string, a single text node its content
    val string: T[String] =
    {
      case Nil => ""
      case List(XML.Text(s)) => s
      case body => throw new XML_Body(body)
    }

    val long: T[Long] = (body => long_atom(string(body)))

    val int: T[Int] = (body => int_atom(string(body)))

    val bool: T[Boolean] = (body => bool_atom(string(body)))

    val unit: T[Unit] = (body => unit_atom(string(body)))

    // exactly two nodes, decoded by f and g respectively
    def pair[A, B](f: T[A], g: T[B]): T[(A, B)] =
    {
      case List(fst, snd) => (f(node(fst)), g(node(snd)))
      case body => throw new XML_Body(body)
    }

    def triple[A, B, C](f: T[A], g: T[B], h: T[C]): T[(A, B, C)] =
    {
      case List(fst, snd, thd) => (f(node(fst)), g(node(snd)), h(node(thd)))
      case body => throw new XML_Body(body)
    }

    // one node per list element
    def list[A](f: T[A]): T[List[A]] =
      (body => for (t <- body) yield f(node(t)))

    // the empty body is None, a single node is Some
    def option[A](f: T[A]): T[Option[A]] =
    {
      case Nil => None
      case List(t) => Some(f(node(t)))
      case body => throw new XML_Body(body)
    }

    // The tag of the single tagged element selects the decoder by its position
    // in fs; a tag outside the range of fs is reported as XML_Body.
    def variant[A](fs: List[V[A]]): T[A] =
    {
      case List(t) =>
        val (tag, (xs, ts)) = tagged(t)
        fs.lift(tag) match {
          case Some(f) => f(xs, ts)
          case None => throw new XML_Body(List(t))
        }
      case body => throw new XML_Body(body)
    }
  }
wenzelm@27931
   374
}