author | noschinl |
Wed, 13 Apr 2011 21:38:00 +0200 | |
changeset 42331 | b3759dcea95e |
parent 42330 | 7a1655920fe8 |
child 42355 | ce00462fe8aa |
permissions | -rw-r--r-- |
26528 | 1 |
(* Title: Pure/General/yxml.ML |
2 |
Author: Makarius |
|
3 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
4 |
Efficient text representation of XML trees using extra characters X |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
5 |
and Y -- no escaping, may nest marked text verbatim. |
26528 | 6 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
7 |
Markup <elem att="val" ...>...body...</elem> is encoded as: |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
8 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
9 |
X Y name Y att=val ... X |
26528 | 10 |
... |
11 |
body |
|
12 |
... |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
13 |
X Y X |
26528 | 14 |
*) |
15 |
||
16 |
signature YXML = |
|
17 |
sig |
|
38265
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents:
38228
diff
changeset
|
18 |
val escape_controls: string -> string |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
19 |
val output_markup: Markup.T -> string * string |
26528 | 20 |
val element: string -> XML.attributes -> string list -> string |
21 |
val string_of: XML.tree -> string |
|
38266
492d377ecfe2
type XML.body as basic data representation language;
wenzelm
parents:
38265
diff
changeset
|
22 |
val parse_body: string -> XML.body |
26528 | 23 |
val parse: string -> XML.tree |
42331 | 24 |
val parse_file: (XML.tree -> 'a) -> Path.T -> 'a list |
26528 | 25 |
end; |
26 |
||
27 |
structure YXML: YXML = |
|
28 |
struct |
|
29 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
30 |
(** string representation **) |
26528 | 31 |
|
38265
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents:
38228
diff
changeset
|
32 |
(* idempotent recoding of certain low ASCII control characters *) |
34095
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
33 |
|
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
34 |
fun pseudo_utf8 c = |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
35 |
if Symbol.is_ascii_control c |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
36 |
then chr 192 ^ chr (128 + ord c) |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
37 |
else c; |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
38 |
|
38265
cc9fde54311f
renamed YXML.binary_text to YXML.escape_controls to emphasize what it actually does;
wenzelm
parents:
38228
diff
changeset
|
39 |
fun escape_controls str = |
34095
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
40 |
if exists_string Symbol.is_ascii_control str |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
41 |
then translate_string pseudo_utf8 str |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
42 |
else str; |
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
43 |
|
c2f176a38448
robust representation of low ASCII control characters within XML/YXML text;
wenzelm
parents:
31469
diff
changeset
|
44 |
|
26547 | 45 |
(* markers *) |
46 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
47 |
val X = Symbol.ENQ; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
48 |
val Y = Symbol.ACK; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
49 |
val XY = X ^ Y; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
50 |
val XYX = XY ^ X; |
26528 | 51 |
|
52 |
||
26547 | 53 |
(* output *) |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
54 |
|
27884 | 55 |
fun output_markup (markup as (name, atts)) = |
38474
e498dc2eb576
uniform Markup.empty/Markup.Empty in ML and Scala;
wenzelm
parents:
38266
diff
changeset
|
56 |
if Markup.is_empty markup then Markup.no_output |
27884 | 57 |
else (XY ^ name ^ implode (map (fn (a, x) => Y ^ a ^ "=" ^ x) atts) ^ X, XYX); |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
58 |
|
26528 | 59 |
fun element name atts body = |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
60 |
let val (pre, post) = output_markup (name, atts) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
61 |
in pre ^ implode body ^ post end; |
26528 | 62 |
|
63 |
fun string_of t = |
|
64 |
let |
|
65 |
fun attrib (a, x) = |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
66 |
Buffer.add Y #> |
26528 | 67 |
Buffer.add a #> Buffer.add "=" #> Buffer.add x; |
38228
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
wenzelm
parents:
34095
diff
changeset
|
68 |
fun tree (XML.Elem ((name, atts), ts)) = |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
69 |
Buffer.add XY #> Buffer.add name #> fold attrib atts #> Buffer.add X #> |
26528 | 70 |
fold tree ts #> |
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
71 |
Buffer.add XYX |
28033 | 72 |
| tree (XML.Text s) = Buffer.add s; |
26528 | 73 |
in Buffer.empty |> tree t |> Buffer.content end; |
74 |
||
75 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
76 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
77 |
(** efficient YXML parsing **) |
26528 | 78 |
|
79 |
local |
|
80 |
||
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
81 |
(* splitting *) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
82 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
83 |
fun is_char s c = ord s = Char.ord c; |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
84 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
85 |
val split_string = |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
86 |
Substring.full #> |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
87 |
Substring.tokens (is_char X) #> |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
88 |
map (Substring.fields (is_char Y) #> map Substring.string); |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
89 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
90 |
|
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
91 |
(* structural errors *) |
26528 | 92 |
|
93 |
fun err msg = raise Fail ("Malformed YXML encoding: " ^ msg); |
|
94 |
fun err_attribute () = err "bad attribute"; |
|
95 |
fun err_element () = err "bad element"; |
|
96 |
fun err_unbalanced "" = err "unbalanced element" |
|
97 |
| err_unbalanced name = err ("unbalanced element " ^ quote name); |
|
98 |
||
99 |
||
100 |
(* stack operations *) |
|
101 |
||
102 |
fun add x ((elem, body) :: pending) = (elem, x :: body) :: pending; |
|
103 |
||
104 |
fun push "" _ _ = err_element () |
|
105 |
| push name atts pending = ((name, atts), []) :: pending; |
|
106 |
||
107 |
fun pop ((("", _), _) :: _) = err_unbalanced "" |
|
38228
ada3ab6b9085
simplified type XML.tree: embed Markup.T directly, avoid slightly odd triple;
wenzelm
parents:
34095
diff
changeset
|
108 |
| pop ((markup, body) :: pending) = add (XML.Elem (markup, rev body)) pending; |
26528 | 109 |
|
42330
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
110 |
val stack_init = [(("", []), [])]; |
26528 | 111 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
112 |
(* parsing *) |
26528 | 113 |
|
114 |
fun parse_attrib s = |
|
28025 | 115 |
(case first_field "=" s of |
28023
92dd3ad302b7
simplified parse_attrib (find_substring instead of space_explode);
wenzelm
parents:
27932
diff
changeset
|
116 |
NONE => err_attribute () |
28025 | 117 |
| SOME ("", _) => err_attribute () |
118 |
| SOME att => att); |
|
26528 | 119 |
|
26540
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
120 |
fun parse_chunk ["", ""] = pop |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
121 |
| parse_chunk ("" :: name :: atts) = push name (map parse_attrib atts) |
173d548ce9d2
replaced ETX/EOT by ENQ/ACK, which are less likely to be interpreted by tty etc.;
wenzelm
parents:
26528
diff
changeset
|
122 |
| parse_chunk txts = fold (add o XML.Text) txts; |
26528 | 123 |
|
42330
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
124 |
fun preparse_chunk _ "" x = x |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
125 |
| preparse_chunk f str (pending, results) = |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
126 |
(case parse_chunk (String.fields (is_char Y) str) pending of |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
127 |
[(("", _), [result])] => (stack_init, f result :: results) |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
128 |
| foo => (foo, results)); |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
129 |
|
26528 | 130 |
in |
131 |
||
132 |
fun parse_body source = |
|
42330
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
133 |
(case fold parse_chunk (split_string source) stack_init of |
26528 | 134 |
[(("", _), result)] => rev result |
135 |
| ((name, _), _) :: _ => err_unbalanced name); |
|
136 |
||
137 |
fun parse source = |
|
138 |
(case parse_body source of |
|
27798
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
139 |
[result] => result |
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
140 |
| [] => XML.Text "" |
b96c73f11a23
YXML.parse: allow text without markup, potentially empty;
wenzelm
parents:
26684
diff
changeset
|
141 |
| _ => err "multiple results"); |
26528 | 142 |
|
42331 | 143 |
fun parse_file f path = |
42330
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
144 |
(case File.fold_fields (is_char X) (preparse_chunk f) |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
145 |
path (stack_init, []) of |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
146 |
([(("", _), [])], result) => rev result |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
147 |
| (((name, _), _) :: _, _) => err_unbalanced name); |
7a1655920fe8
Add YXML.parse_file to parse and process big data files
noschinl
parents:
38474
diff
changeset
|
148 |
|
26528 | 149 |
end; |
150 |
||
151 |
end; |
|
152 |