| author | huffman | 
| Tue, 13 Jan 2009 10:18:23 -0800 | |
| changeset 29475 | c06d1b0a970f | 
| parent 29325 | a205acc94356 | 
| child 29606 | fedb8be05f24 | 
| permissions | -rw-r--r-- | 
| 26528 | 1 | (* Title: Pure/General/yxml.ML | 
| 2 | ID: $Id$ | |
| 3 | Author: Makarius | |
| 4 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 5 | Efficient text representation of XML trees using extra characters X | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 6 | and Y -- no escaping, may nest marked text verbatim. | 
| 26528 | 7 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 8 | Markup <elem att="val" ...>...body...</elem> is encoded as: | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 9 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 10 | X Y name Y att=val ... X | 
| 26528 | 11 | ... | 
| 12 | body | |
| 13 | ... | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 14 | X Y X | 
| 26528 | 15 | *) | 
| 16 | ||
| 17 | signature YXML = | |
| 18 | sig | |
| 19 | val detect: string -> bool | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 20 | val output_markup: Markup.T -> string * string | 
| 26528 | 21 | val element: string -> XML.attributes -> string list -> string | 
| 22 | val string_of: XML.tree -> string | |
| 23 | val parse_body: string -> XML.tree list | |
| 24 | val parse: string -> XML.tree | |
| 25 | end; | |
| 26 | ||
| 27 | structure YXML: YXML = | |
| 28 | struct | |
| 29 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 30 | (** string representation **) | 
| 26528 | 31 | |
| 26547 | 32 | (* markers *) | 
| 33 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 34 | val X = Symbol.ENQ; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 35 | val Y = Symbol.ACK; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 36 | val XY = X ^ Y; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 37 | val XYX = XY ^ X; | 
| 26528 | 38 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 39 | val detect = String.isPrefix XY; | 
| 26528 | 40 | |
| 41 | ||
| 26547 | 42 | (* output *) | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 43 | |
| 27884 | 44 | fun output_markup (markup as (name, atts)) = | 
| 29325 | 45 | if Markup.is_none markup then Markup.no_output | 
| 27884 | 46 | else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 47 | |
| 26528 | 48 | fun element name atts body = | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 49 | let val (pre, post) = output_markup (name, atts) | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 50 | in pre ^ implode body ^ post end; | 
| 26528 | 51 | |
| 52 | fun string_of t = | |
| 53 | let | |
| 54 | fun attrib (a, x) = | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 55 | Buffer.add Y #> | 
| 26528 | 56 | Buffer.add a #> Buffer.add "=" #> Buffer.add x; | 
| 57 | fun tree (XML.Elem (name, atts, ts)) = | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 58 | Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #> | 
| 26528 | 59 | fold tree ts #> | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 60 | Buffer.add XYX | 
| 28033 | 61 | | tree (XML.Text s) = Buffer.add s; | 
| 26528 | 62 | in Buffer.empty |> tree t |> Buffer.content end; | 
| 63 | ||
| 64 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 65 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 66 | (** efficient YXML parsing **) | 
| 26528 | 67 | |
| 68 | local | |
| 69 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 70 | (* splitting *) | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 71 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 72 | fun is_char s c = ord s = Char.ord c; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 73 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 74 | val split_string = | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 75 | Substring.full #> | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 76 | Substring.tokens (is_char X) #> | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 77 | map (Substring.fields (is_char Y) #> map Substring.string); | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 78 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 79 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 80 | (* structural errors *) | 
| 26528 | 81 | |
| 82 | fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);
 | |
| 83 | fun err_attribute () = err "bad attribute"; | |
| 84 | fun err_element () = err "bad element"; | |
| 85 | fun err_unbalanced "" = err "unbalanced element" | |
| 86 |   | err_unbalanced name = err ("unbalanced element " ^ quote name);
 | |
| 87 | ||
| 88 | ||
| 89 | (* stack operations *) | |
| 90 | ||
| 91 | fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; | |
| 92 | ||
| 93 | fun push "" _ _ = err_element () | |
| 94 | | push name atts pending = ((name, atts), []) :: pending; | |
| 95 | ||
| 96 | fun pop ((("", _), _) :: _) = err_unbalanced ""
 | |
| 97 | | pop (((name, atts), body) :: pending) = add (XML.Elem (name, atts, rev body)) pending; | |
| 98 | ||
| 99 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 100 | (* parsing *) | 
| 26528 | 101 | |
| 102 | fun parse_attrib s = | |
| 28025 | 103 | (case first_field "=" s of | 
| 28023 
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
 wenzelm parents: 
27932diff
changeset | 104 | NONE => err_attribute () | 
| 28025 | 105 |   | SOME ("", _) => err_attribute ()
 | 
| 106 | | SOME att => att); | |
| 26528 | 107 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 108 | fun parse_chunk ["", ""] = pop | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 109 |   | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
 | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 110 | | parse_chunk txts = fold (add o XML.Text) txts; | 
| 26528 | 111 | |
| 112 | in | |
| 113 | ||
| 114 | fun parse_body source = | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 115 |   (case fold parse_chunk (split_string source) [(("", []), [])] of
 | 
| 26528 | 116 |     [(("", _), result)] => rev result
 | 
| 117 | | ((name, _), _) :: _ => err_unbalanced name); | |
| 118 | ||
| 119 | fun parse source = | |
| 120 | (case parse_body source of | |
| 27798 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 121 | [result] => result | 
| 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 122 | | [] => XML.Text "" | 
| 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 123 | | _ => err "multiple results"); | 
| 26528 | 124 | |
| 125 | end; | |
| 126 | ||
| 127 | end; | |
| 128 |