Tried to make parser a bit more standard-conforming.
authorberghofe
Thu, 04 Sep 2003 19:39:52 +0200
changeset 14185 9b3841638c06
parent 14184 2e0e02d68cbb
child 14186 6d2a494e33be
Tried to make parser a bit more standard-conforming.
src/Pure/General/xml.ML
--- a/src/Pure/General/xml.ML	Thu Sep 04 16:04:15 2003 +0200
+++ b/src/Pure/General/xml.ML	Thu Sep 04 19:39:52 2003 +0200
@@ -17,6 +17,9 @@
   val header: string
   val string_of_tree: tree -> string
   val tree_of_string: string -> tree
+  val parse_content: string list -> tree list * string list
+  val parse_elem: string list -> tree * string list
+  val parse_document: string list -> (string option * tree) * string list
 end;
 
 structure XML: XML =
@@ -64,50 +67,66 @@
 
 (* parser *)
 
-datatype token_kind = Idt | Sym | Str | Chr;
-
-fun is_kind k (k', _) = k = k';
+fun err s (xs, _) = "XML parsing error: " ^ s ^ "\nfound:\n" ^
+  implode (take (100, xs));
 
-val stopper = ((Sym, ""), fn (Sym, "") => true | _ => false);
+val scan_whspc = Scan.repeat ($$ " " || $$ "\n");
 
-val scan_whspc = $$ " " || $$ "\n";
-
-val scan_symb = Scan.literal (Scan.make_lexicon
-  (map explode ["<", "</", ">", "/>", "\"", "=", "&"])) >> implode;
+val literal = Scan.literal o Scan.make_lexicon o single o explode;
 
 val scan_special = $$ "&" ^^ Symbol.scan_id ^^ $$ ";" >> decode;
 
-val lexer = Scan.repeat (Scan.repeat scan_whspc |--
-  (   Scan.max (op <= o pairself (size o snd))
-        (Symbol.scan_id >> pair Idt)
-        (Scan.repeat1 (scan_special || Scan.unless (scan_whspc || scan_symb)
-           (Scan.one Symbol.not_eof)) >> (pair Chr o implode))
-   || $$ "\"" |-- Scan.repeat
-        (   scan_special
-         || Scan.unless ($$ "\"") (Scan.one Symbol.not_eof)) --|
-           $$ "\"" >> (pair Str o implode)
-   || scan_symb >> pair Sym) --| Scan.repeat scan_whspc);
+val parse_chars = Scan.repeat1 (Scan.unless (scan_whspc -- $$ "<")
+  (scan_special || Scan.one Symbol.not_eof)) >> implode;
+
+val parse_cdata = literal "<![CDATA[" |--
+  (Scan.repeat (Scan.unless (literal "]]>") (Scan.one Symbol.not_eof)) >>
+    implode) --| literal "]]>";
+
+val parse_att =
+  Symbol.scan_id --| scan_whspc --| $$ "=" --| scan_whspc --| $$ "\"" --
+  (Scan.repeat (Scan.unless ($$ "\"")
+    (scan_special || Scan.one Symbol.not_eof)) >> implode) --| $$ "\"";
+
+val parse_comment = literal "<!--" --
+  Scan.repeat (Scan.unless (literal "-->") (Scan.one Symbol.not_eof)) --
+  literal "-->";
 
-val parse_att = (Scan.one (is_kind Idt) >> snd) --| $$ (Sym, "=") --
-  (Scan.one (is_kind Str) >> snd);
+val parse_pi = literal "<?" |--
+  Scan.repeat (Scan.unless (literal "?>") (Scan.one Symbol.not_eof)) --|
+  literal "?>";
 
-fun parse_tree xs =
-  ($$ (Sym, "<") |-- (Scan.one (is_kind Idt) >> snd) --
-    Scan.repeat parse_att :-- (fn (s, _) =>
-          $$ (Sym, "/>") >> K []
-       || $$ (Sym, ">") |-- Scan.repeat
-            (   Scan.one (not o is_kind Sym) >> (Text o snd)
-             || parse_tree) --|
-          $$ (Sym, "</") --| Scan.one (equal (Idt, s)) --| $$ (Sym, ">")) >>
+fun parse_content xs =
+  ((Scan.optional (scan_whspc |-- parse_chars >> (single o Text)) [] --
+    (Scan.repeat (scan_whspc |--
+       (   parse_elem >> single
+        || parse_cdata >> (single o Text)
+        || parse_pi >> K []
+        || parse_comment >> K []) --
+       Scan.optional (scan_whspc |-- parse_chars >> (single o Text)) []
+         >> op @) >> flat) >> op @) --| scan_whspc) xs
+
+and parse_elem xs =
+  ($$ "<" |-- Symbol.scan_id --
+    Scan.repeat (scan_whspc |-- parse_att) --| scan_whspc :-- (fn (s, _) =>
+      !! (err "Expected > or />")
+        (   literal "/>" >> K []
+         || $$ ">" |-- parse_content --|
+            !! (err ("Expected </" ^ s ^ ">"))
+              (literal ("</" ^ s) --| scan_whspc --| $$ ">"))) >>
     (fn ((s, atts), ts) => Elem (s, atts, ts))) xs;
 
+val parse_document =
+  Scan.option (literal "<!DOCTYPE" -- scan_whspc |--
+    (Scan.repeat (Scan.unless ($$ ">")
+      (Scan.one Symbol.not_eof)) >> implode) --| $$ ">" --| scan_whspc) --
+  parse_elem;
+
 fun tree_of_string s =
-  let val msg = "Malformed XML expression:\n" ^ s
-  in case Scan.finite Symbol.stopper lexer (explode s) of
-      (toks, []) => (case Scan.finite stopper
-        (Scan.error (!! (K msg) parse_tree)) toks of
-          (t, []) => t | _ => error msg)
-    | _ => error msg
-  end;
+  (case Scan.finite Symbol.stopper (Scan.error (!! (err "Malformed element")
+      (scan_whspc |-- parse_elem --| scan_whspc))) (Symbol.explode s) of
+     (x, []) => x
+   | (_, ys) => error ("XML parsing error: Unprocessed input\n" ^
+       implode (take (100, ys))));
 
 end;