| author | blanchet | 
| Thu, 25 Aug 2011 23:54:57 +0200 | |
| changeset 44502 | c537d5e5a365 | 
| parent 43782 | 834de29572d5 | 
| permissions | -rw-r--r-- | 
| 26528 | 1 | (* Title: Pure/General/yxml.ML | 
| 2 | Author: Makarius | |
| 3 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 4 | Efficient text representation of XML trees using extra characters X | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 5 | and Y -- no escaping, may nest marked text verbatim. | 
| 26528 | 6 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 7 | Markup <elem att="val" ...>...body...</elem> is encoded as: | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 8 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 9 | X Y name Y att=val ... X | 
| 26528 | 10 | ... | 
| 11 | body | |
| 12 | ... | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 13 | X Y X | 
| 26528 | 14 | *) | 
| 15 | ||
| 16 | signature YXML = | |
| 17 | sig | |
| 43777 | 18 | val X: Symbol.symbol | 
| 19 | val Y: Symbol.symbol | |
| 43772 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 wenzelm parents: 
43731diff
changeset | 20 | val embed_controls: string -> string | 
| 43731 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 wenzelm parents: 
43728diff
changeset | 21 | val detect: string -> bool | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 22 | val output_markup: Markup.T -> string * string | 
| 26528 | 23 | val element: string -> XML.attributes -> string list -> string | 
| 43728 | 24 | val string_of_body: XML.body -> string | 
| 26528 | 25 | val string_of: XML.tree -> string | 
| 38266 
492d377ecfe2
type XML.body as basic data representation language;
 wenzelm parents: 
38265diff
changeset | 26 | val parse_body: string -> XML.body | 
| 26528 | 27 | val parse: string -> XML.tree | 
| 28 | end; | |
| 29 | ||
| 30 | structure YXML: YXML = | |
| 31 | struct | |
| 32 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 33 | (** string representation **) | 
| 26528 | 34 | |
| 38265 
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
 wenzelm parents: 
38228diff
changeset | 35 | (* idempotent recoding of certain low ASCII control characters *) | 
| 34095 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 36 | |
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 37 | fun pseudo_utf8 c = | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 38 | if Symbol.is_ascii_control c | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 39 | then chr 192 ^ chr (128 + ord c) | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 40 | else c; | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 41 | |
| 43772 
c825594fd0c1
clarified YXML.embed_controls -- this is idempotent and cannot be nested;
 wenzelm parents: 
43731diff
changeset | 42 | fun embed_controls str = | 
| 34095 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 43 | if exists_string Symbol.is_ascii_control str | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 44 | then translate_string pseudo_utf8 str | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 45 | else str; | 
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 46 | |
| 
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
 wenzelm parents: 
31469diff
changeset | 47 | |
| 26547 | 48 | (* markers *) | 
| 49 | ||
| 43777 | 50 | val X = chr 5; | 
| 51 | val Y = chr 6; | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 52 | val XY = X ^ Y; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 53 | val XYX = XY ^ X; | 
| 26528 | 54 | |
| 43782 | 55 | val detect = exists_string (fn s => s = X orelse s = Y); | 
| 43731 
70072780e095
inner syntax supports inlined YXML according to Term_XML (particularly useful for producing text under program control);
 wenzelm parents: 
43728diff
changeset | 56 | |
| 26528 | 57 | |
| 26547 | 58 | (* output *) | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 59 | |
| 27884 | 60 | fun output_markup (markup as (name, atts)) = | 
| 38474 
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
 wenzelm parents: 
38266diff
changeset | 61 | if Markup.is_empty markup then Markup.no_output | 
| 27884 | 62 | else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 63 | |
| 26528 | 64 | fun element name atts body = | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 65 | let val (pre, post) = output_markup (name, atts) | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 66 | in pre ^ implode body ^ post end; | 
| 26528 | 67 | |
| 43728 | 68 | fun string_of_body body = | 
| 26528 | 69 | let | 
| 70 | fun attrib (a, x) = | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 71 | Buffer.add Y #> | 
| 26528 | 72 | Buffer.add a #> Buffer.add "=" #> Buffer.add x; | 
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
34095diff
changeset | 73 | fun tree (XML.Elem ((name, atts), ts)) = | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 74 | Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #> | 
| 43728 | 75 | trees ts #> | 
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 76 | Buffer.add XYX | 
| 43728 | 77 | | tree (XML.Text s) = Buffer.add s | 
| 78 | and trees ts = fold tree ts; | |
| 79 | in Buffer.empty |> trees body |> Buffer.content end; | |
| 80 | ||
| 81 | val string_of = string_of_body o single; | |
| 26528 | 82 | |
| 83 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 84 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 85 | (** efficient YXML parsing **) | 
| 26528 | 86 | |
| 87 | local | |
| 88 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 89 | (* splitting *) | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 90 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 91 | fun is_char s c = ord s = Char.ord c; | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 92 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 93 | val split_string = | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 94 | Substring.full #> | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 95 | Substring.tokens (is_char X) #> | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 96 | map (Substring.fields (is_char Y) #> map Substring.string); | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 97 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 98 | |
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 99 | (* structural errors *) | 
| 26528 | 100 | |
| 101 | fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg);
 | |
| 102 | fun err_attribute () = err "bad attribute"; | |
| 103 | fun err_element () = err "bad element"; | |
| 104 | fun err_unbalanced "" = err "unbalanced element" | |
| 105 |   | err_unbalanced name = err ("unbalanced element " ^ quote name);
 | |
| 106 | ||
| 107 | ||
| 108 | (* stack operations *) | |
| 109 | ||
| 110 | fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; | |
| 111 | ||
| 112 | fun push "" _ _ = err_element () | |
| 113 | | push name atts pending = ((name, atts), []) :: pending; | |
| 114 | ||
| 115 | fun pop ((("", _), _) :: _) = err_unbalanced ""
 | |
| 38228 
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
 wenzelm parents: 
34095diff
changeset | 116 | | pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending; | 
| 26528 | 117 | |
| 118 | ||
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 119 | (* parsing *) | 
| 26528 | 120 | |
| 121 | fun parse_attrib s = | |
| 28025 | 122 | (case first_field "=" s of | 
| 28023 
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
 wenzelm parents: 
27932diff
changeset | 123 | NONE => err_attribute () | 
| 28025 | 124 |   | SOME ("", _) => err_attribute ()
 | 
| 125 | | SOME att => att); | |
| 26528 | 126 | |
| 26540 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 127 | fun parse_chunk ["", ""] = pop | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 128 |   | parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts)
 | 
| 
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
 wenzelm parents: 
26528diff
changeset | 129 | | parse_chunk txts = fold (add o XML.Text) txts; | 
| 26528 | 130 | |
| 131 | in | |
| 132 | ||
| 133 | fun parse_body source = | |
| 43615 
8e0f6cfa8eb2
reverted ce00462f,b3759dce, 7a165592: unwanted generalisation
 noschinl parents: 
42355diff
changeset | 134 |   (case fold parse_chunk (split_string source) [(("", []), [])] of
 | 
| 26528 | 135 |     [(("", _), result)] => rev result
 | 
| 136 | | ((name, _), _) :: _ => err_unbalanced name); | |
| 137 | ||
| 138 | fun parse source = | |
| 139 | (case parse_body source of | |
| 27798 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 140 | [result] => result | 
| 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 141 | | [] => XML.Text "" | 
| 
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
 wenzelm parents: 
26684diff
changeset | 142 | | _ => err "multiple results"); | 
| 26528 | 143 | |
| 144 | end; | |
| 145 | ||
| 146 | end; | |
| 147 |