author  berghofe 
Thu, 04 Sep 2003 19:39:52 +0200  
changeset 14185  9b3841638c06 
parent 13729  1a8dda49fd86 
child 14596  c36e116b578b 
permissions  rwrr 
12416  1 
(* Title: Pure/General/xml.ML 
2 
ID: $Id$ 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

3 
Author: Markus Wenzel, LMU Muenchen 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

4 
Stefan Berghofer, TU Muenchen 
12416  5 
License: GPL (GNU GENERAL PUBLIC LICENSE) 
6 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

7 
Basic support for XML input and output. 
12416  8 
*) 
9 

10 
signature XML = 

11 
sig 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

12 
datatype tree = 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

13 
Elem of string * (string * string) list * tree list 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

14 
 Text of string 
12416  15 
val element: string > (string * string) list > string list > string 
16 
val text: string > string 

17 
val header: string 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

18 
val string_of_tree: tree > string 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

19 
val tree_of_string: string > tree 
14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

20 
val parse_content: string list > tree list * string list 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

21 
val parse_elem: string list > tree * string list 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

22 
val parse_document: string list > (string option * tree) * string list 
12416  23 
end; 
24 

25 
structure XML: XML = 

26 
struct 

27 

28 
(* character data *) 

29 

30 
fun encode "<" = "<" 

31 
 encode ">" = ">" 

32 
 encode "&" = "&" 

33 
 encode "'" = "'" 

34 
 encode "\"" = """ 

35 
 encode c = c; 

36 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

37 
fun decode "<" = "<" 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

38 
 decode ">" = ">" 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

39 
 decode "&" = "&" 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

40 
 decode "'" = "'" 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

41 
 decode """ = "\"" 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

42 
 decode c = c; 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

43 

12416  44 
val text = implode o map encode o explode; 
45 

46 

47 
(* elements *) 

48 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

49 
datatype tree = 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

50 
Elem of string * (string * string) list * tree list 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

51 
 Text of string; 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

52 

12416  53 
fun attribute (a, x) = a ^ " = " ^ quote (text x); 
54 

55 
fun element name atts cs = 

56 
let val elem = space_implode " " (name :: map attribute atts) in 

57 
if null cs then enclose "<" "/>" elem 

58 
else enclose "<" ">" elem ^ implode cs ^ enclose "</" ">" name 

59 
end; 

60 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

61 
fun string_of_tree (Elem (name, atts, ts)) = 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

62 
element name atts (map string_of_tree ts) 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

63 
 string_of_tree (Text s) = s 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

64 

12416  65 
val header = "<?xml version=\"1.0\"?>\n"; 
66 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

67 

1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

68 
(* parser *) 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

69 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

70 
fun err s (xs, _) = "XML parsing error: " ^ s ^ "\nfound:\n" ^ 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

71 
implode (take (100, xs)); 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

72 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

73 
val scan_whspc = Scan.repeat ($$ " "  $$ "\n"); 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

74 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

75 
val literal = Scan.literal o Scan.make_lexicon o single o explode; 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

76 

1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

77 
val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode; 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

78 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

79 
val parse_chars = Scan.repeat1 (Scan.unless (scan_whspc  $$ "<") 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

80 
(scan_special  Scan.one Symbol.not_eof)) >> implode; 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

81 

9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

82 
val parse_cdata = literal "<![CDATA["  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

83 
(Scan.repeat (Scan.unless (literal "]]>") (Scan.one Symbol.not_eof)) >> 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

84 
implode)  literal "]]>"; 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

85 

9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

86 
val parse_att = 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

87 
Symbol.scan_id  scan_whspc  $$ "="  scan_whspc  $$ "\""  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

88 
(Scan.repeat (Scan.unless ($$ "\"") 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

89 
(scan_special  Scan.one Symbol.not_eof)) >> implode)  $$ "\""; 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

90 

9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

91 
val parse_comment = literal "<!"  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

92 
Scan.repeat (Scan.unless (literal ">") (Scan.one Symbol.not_eof))  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

93 
literal ">"; 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

94 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

95 
val parse_pi = literal "<?"  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

96 
Scan.repeat (Scan.unless (literal "?>") (Scan.one Symbol.not_eof))  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

97 
literal "?>"; 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

98 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

99 
fun parse_content xs = 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

100 
((Scan.optional (scan_whspc  parse_chars >> (single o Text)) []  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

101 
(Scan.repeat (scan_whspc  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

102 
( parse_elem >> single 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

103 
 parse_cdata >> (single o Text) 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

104 
 parse_pi >> K [] 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

105 
 parse_comment >> K [])  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

106 
Scan.optional (scan_whspc  parse_chars >> (single o Text)) [] 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

107 
>> op @) >> flat) >> op @)  scan_whspc) xs 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

108 

9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

109 
and parse_elem xs = 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

110 
($$ "<"  Symbol.scan_id  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

111 
Scan.repeat (scan_whspc  parse_att)  scan_whspc : (fn (s, _) => 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

112 
!! (err "Expected > or />") 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

113 
( literal "/>" >> K [] 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

114 
 $$ ">"  parse_content  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

115 
!! (err ("Expected </" ^ s ^ ">")) 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

116 
(literal ("</" ^ s)  scan_whspc  $$ ">"))) >> 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

117 
(fn ((s, atts), ts) => Elem (s, atts, ts))) xs; 
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

118 

14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

119 
val parse_document = 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

120 
Scan.option (literal "<!DOCTYPE"  scan_whspc  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

121 
(Scan.repeat (Scan.unless ($$ ">") 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

122 
(Scan.one Symbol.not_eof)) >> implode)  $$ ">"  scan_whspc)  
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

123 
parse_elem; 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

124 

13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

125 
fun tree_of_string s = 
14185
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

126 
(case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element") 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

127 
(scan_whspc  parse_elem  scan_whspc))) (Symbol.explode s) of 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

128 
(x, []) => x 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

129 
 (_, ys) => error ("XML parsing error: Unprocessed input\n" ^ 
9b3841638c06
Tried to make parser a bit more standardconforming.
berghofe
parents:
13729
diff
changeset

130 
implode (take (100, ys)))); 
13729
1a8dda49fd86
Added XML parser (useful for parsing PGIP / PGML).
berghofe
parents:
12416
diff
changeset

131 

12416  132 
end; 