isabelle: src/Pure/General/yxml.scala@ec706ad37564 (annotated)

27930 2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	1	/* Title: Pure/General/yxml.scala
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	2	ID: $Id$
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	3	Author: Makarius
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	4
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	5	Efficient text representation of XML trees.
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	6	*/
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	7
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	8	package isabelle
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	9
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	10	import java.util.regex.Pattern
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	11
2b44df907cc2 Efficient text representation of XML trees. wenzelm parents: diff changeset	12
object YXML {

  /* chunk markers */

  // Control characters (code points 5 and 6) used as YXML markup delimiters:
  // X separates top-level chunks, Y separates the fields inside a chunk.
  private val X = '\u0005'
  private val Y = '\u0006'

  /* Check whether source begins with the marker pair X Y, i.e. the way an
     element-opening chunk is introduced. */
  def detect(source: CharSequence): Boolean =
    source.length >= 2 &&
    source.charAt(0) == X &&
    source.charAt(1) == Y


  /* iterate over chunks (resembles space_explode in ML) */

  // Splits source at every occurrence of sep.  An empty source yields no
  // chunks at all; otherwise n separators yield n + 1 (possibly empty) chunks.
  private def chunks(sep: Char, source: CharSequence): Iterator[CharSequence] =
    new Iterator[CharSequence] {
      private val end = source.length

      // Pending chunk together with the index of the separator that ended it
      // (or end, if the chunk ran to the end of source).
      private var state: Option[(CharSequence, Int)] =
        if (end == 0) None else get_chunk(-1)

      // i is the index of the previous separator (-1 before the first chunk).
      private def get_chunk(i: Int): Option[(CharSequence, Int)] =
        if (i < end) {
          var j = i + 1
          while (j < end && source.charAt(j) != sep) j += 1
          Some((source.subSequence(i + 1, j), j))
        }
        else None

      def hasNext: Boolean = state.isDefined

      def next(): CharSequence = state match {
        case Some((s, i)) =>
          state = get_chunk(i)
          s
        case None => throw new NoSuchElementException("next on empty iterator")
      }
    }


  /* parsing */

  // The message is handed to Exception so that getMessage reports it.
  class BadYXML(msg: String) extends Exception(msg)

  private def err(msg: String): Nothing = throw new BadYXML(msg)
  private def err_attribute(): Nothing = err("bad attribute")
  private def err_element(): Nothing = err("bad element")
  private def err_unbalanced(name: String): Nothing =
    if (name == "") err("unbalanced element")
    else err("unbalanced element \"" + name + "\"")

  // Splits "name=value" at the first '='; the name must be non-empty,
  // the value may be empty and may itself contain '='.
  private def parse_attrib(source: CharSequence): (String, String) = {
    val s = source.toString
    val i = s.indexOf('=')
    if (i <= 0) err_attribute()
    else (s.substring(0, i), s.substring(i + 1))
  }


  /* Parse a YXML document into a list of trees.

     The source is split at X.  A chunk consisting of Y alone closes the
     current element; a chunk of the form Y name Y att=val ... opens a new
     element; any other chunk is text (split at Y into separate text nodes).
     Throws BadYXML on malformed or unbalanced input. */
  def parse_body(source: CharSequence): List[XML.Tree] = {

    /* stack operations */

    // Each entry is an open element (name, attributes) with its body collected
    // so far in reverse order.  The bottom entry with the empty name is the
    // artificial root that gathers the top-level result.
    var stack: List[((String, XML.Attributes), List[XML.Tree])] =
      List((("", Nil), Nil))

    def add(x: XML.Tree): Unit = stack match {
      case ((elem, body) :: pending) => stack = (elem, x :: body) :: pending
      case Nil => err_unbalanced("")
    }

    def push(name: String, atts: XML.Attributes): Unit =
      if (name == "") err_element()
      else stack = ((name, atts), Nil) :: stack

    def pop(): Unit = stack match {
      // closing marker without a matching open element
      case ((("", _), _) :: _) => err_unbalanced("")
      case (((name, atts), body) :: pending) =>
        stack = pending
        add(XML.Elem(name, atts, body.reverse))
      case Nil => err_unbalanced("")
    }


    /* parse chunks */

    // Markers are tested via length/charAt and the fields are converted to
    // String before matching: equality between an arbitrary CharSequence and
    // a String is not reliable.
    for (chunk <- chunks(X, source) if chunk.length != 0) {
      // terminal Y chunk: recognized directly, without splitting it
      if (chunk.length == 1 && chunk.charAt(0) == Y) pop()
      else {
        chunks(Y, chunk).map(_.toString).toList match {
          case "" :: name :: atts => push(name, atts.map(a => parse_attrib(a)))
          case txts => for (txt <- txts) add(XML.Text(txt))
        }
      }
    }
    stack match {
      case List((("", _), result)) => result.reverse
      case ((name, _), _) :: _ => err_unbalanced(name)
      case Nil => err_unbalanced("")
    }
  }

  /* Parse a YXML document that denotes at most one tree: empty input gives
     an empty text node, several top-level trees are rejected with BadYXML. */
  def parse(source: CharSequence): XML.Tree =
    parse_body(source) match {
      case List(result) => result
      case Nil => XML.Text("")
      case _ => err("multiple results")
    }
}

author	wenzelm
	Thu, 21 Aug 2008 22:06:17 +0200
changeset 27946	ec706ad37564
parent 27945	d2dc5a1903e8
child 27960	65b10d8ef0c6
permissions	-rw-r--r--