src/Pure/PIDE/yxml.ML
author wenzelm
Thu Dec 13 18:00:24 2012 +0100 (2012-12-13 ago)
changeset 50503 50f141b34bb7
parent 49656 7ff712de5747
child 59433 9da5b2c61049
permissions -rw-r--r--
enable Isabelle/ML to produce uninterpreted result messages as well;
wenzelm@44698
     1
(*  Title:      Pure/PIDE/yxml.ML
wenzelm@26528
     2
    Author:     Makarius
wenzelm@26528
     3
wenzelm@26540
     4
Efficient text representation of XML trees using extra characters X
wenzelm@44698
     5
and Y -- no escaping, may nest marked text verbatim.  Suitable for
wenzelm@44698
     6
direct inlining into plain text.
wenzelm@26528
     7
wenzelm@26540
     8
Markup <elem att="val" ...>...body...</elem> is encoded as:
wenzelm@26540
     9
wenzelm@26540
    10
  X Y name Y att=val ... X
wenzelm@26528
    11
  ...
wenzelm@26528
    12
  body
wenzelm@26528
    13
  ...
wenzelm@26540
    14
  X Y X
wenzelm@26528
    15
*)
wenzelm@26528
    16
wenzelm@26528
    17
signature YXML =
sig
  (*marker symbols delimiting markup within plain text: chr 5 and chr 6*)
  val X: Symbol.symbol
  val Y: Symbol.symbol

  (*recode ASCII control characters as two-character sequences
    (chr 192 followed by chr (128 + code)); other text is unchanged*)
  val embed_controls: string -> string

  (*true iff the string contains an X or Y marker*)
  val detect: string -> bool

  (*opening and closing marker strings for the given markup;
    Markup.no_output for empty markup*)
  val output_markup: Markup.T -> string * string

  (*YXML text of a list of trees / of a single tree*)
  val string_of_body: XML.body -> string
  val string_of: XML.tree -> string

  (*marker strings of a wrapped element, as ((bg1, bg2), en)*)
  val output_markup_elem: Markup.T -> (string * string) * string

  (*parse YXML text; structural errors raise Fail "Malformed YXML: ..."*)
  val parse_body: string -> XML.body

  (*like parse_body, but expects at most one tree: empty input yields
    XML.Text "", several trees are an error*)
  val parse: string -> XML.tree
end;
wenzelm@26528
    30
wenzelm@26528
    31
structure YXML: YXML =
wenzelm@26528
    32
struct
wenzelm@26528
    33
wenzelm@26540
    34
(** string representation **)
wenzelm@26528
    35
wenzelm@38265
    36
(* idempotent recoding of certain low ASCII control characters *)
wenzelm@34095
    37
wenzelm@34095
    38
(*single symbol: an ASCII control character becomes chr 192 followed by
  chr (128 + its code); anything else passes through unchanged*)
fun pseudo_utf8 c =
  if not (Symbol.is_ascii_control c) then c
  else chr 192 ^ chr (128 + ord c);
wenzelm@34095
    42
wenzelm@43772
    43
(*recode all ASCII control characters of str via pseudo_utf8; a string
  without such characters is returned as is, without translation*)
fun embed_controls str =
  let val has_controls = exists_string Symbol.is_ascii_control str
  in if not has_controls then str else translate_string pseudo_utf8 str end;
wenzelm@34095
    47
wenzelm@34095
    48
wenzelm@26547
    49
(* markers *)
wenzelm@26547
    50
wenzelm@43777
    51
(*basic markers*)
val X = chr 5;
val Y = chr 6;

(*composite markers: XY opens an element header, XYX closes an element*)
val XY = X ^ Y;
val XYX = X ^ Y ^ X;
wenzelm@26528
    55
wenzelm@43782
    56
(*true iff str contains at least one X or Y marker*)
fun detect str =
  let fun is_marker sym = sym = X orelse sym = Y
  in exists_string is_marker str end;
wenzelm@43731
    57
wenzelm@26528
    58
wenzelm@26547
    59
(* output *)
wenzelm@26540
    60
wenzelm@27884
    61
(*opening and closing marker strings for markup: the opening string is
  X Y name, then Y att=val for each attribute, then X; the closing string
  is X Y X.  Empty markup produces Markup.no_output.*)
fun output_markup (markup as (name, atts)) =
  if Markup.is_empty markup then Markup.no_output
  else
    let
      fun attrib (a, x) = Y ^ a ^ "=" ^ x;
      val bg = XY ^ name ^ implode (map attrib atts) ^ X;
    in (bg, XYX) end;
wenzelm@26540
    64
wenzelm@43728
    65
(*YXML text of a list of XML trees, accumulated in a Buffer: elements are
  written as X Y name (Y att=val)* X, followed by their children and the
  closing X Y X; text nodes are written verbatim*)
fun string_of_body body =
  let
    fun add_attrib (a, x) buf =
      buf |> Buffer.add Y |> Buffer.add a |> Buffer.add "=" |> Buffer.add x;

    fun add_tree (XML.Text s) buf = Buffer.add s buf
      | add_tree (XML.Elem ((name, atts), ts)) buf =
          buf
          |> Buffer.add XY
          |> Buffer.add name
          |> fold add_attrib atts
          |> Buffer.add X
          |> fold add_tree ts
          |> Buffer.add XYX;
  in Buffer.content (fold add_tree body Buffer.empty) end;
wenzelm@43728
    75
wenzelm@43728
    76
(*YXML text of a single tree, treated as a one-element body*)
fun string_of tree = string_of_body [tree];
wenzelm@26528
    77
wenzelm@26528
    78
wenzelm@49656
    79
(* wrapped elements *)
wenzelm@49656
    80
wenzelm@49656
    81
(*placeholder text (chr 0) marking the split points in the rendered element*)
val Z = chr 0;
val Z_text = [XML.Text Z];

(*render a wrapped element whose two bodies are the placeholder Z, then
  split the output at Z: the three resulting pieces are returned as
  ((bg1, bg2), en).  Any other number of pieces raises Bind.*)
fun output_markup_elem markup =
  let
    val wrapped = XML.wrap_elem ((markup, Z_text), Z_text);
  in
    (case space_explode Z (string_of wrapped) of
      [bg1, bg2, en] => ((bg1, bg2), en)
    | _ => raise Bind)
  end;
wenzelm@49656
    87
wenzelm@49656
    88
wenzelm@26540
    89
wenzelm@26540
    90
(** efficient YXML parsing **)
wenzelm@26528
    91
wenzelm@26528
    92
local
wenzelm@26528
    93
wenzelm@26540
    94
(* splitting *)
wenzelm@26540
    95
wenzelm@26540
    96
(*does character c have the same code as the symbol s?*)
fun is_char s c = (Char.ord c = ord s);
wenzelm@26540
    97
wenzelm@26540
    98
(*split the source into non-empty chunks at X, then split every chunk into
  (possibly empty) fields at Y; the result is a list of string lists*)
fun split_string str =
  let
    fun fields_of chunk = map Substring.string (Substring.fields (is_char Y) chunk);
    val chunks = Substring.tokens (is_char X) (Substring.full str);
  in map fields_of chunks end;
wenzelm@26540
   102
wenzelm@26540
   103
wenzelm@26540
   104
(* structural errors *)
wenzelm@26528
   105
wenzelm@46832
   106
(*all structural errors are raised as Fail with a common prefix*)
fun err msg = raise Fail ("Malformed YXML: " ^ msg);

fun err_attribute () = err "bad attribute";

fun err_element () = err "bad element";

(*the element name is included in the message only if it is non-empty*)
fun err_unbalanced name =
  if name = "" then err "unbalanced element"
  else err ("unbalanced element " ^ quote name);
wenzelm@26528
   111
wenzelm@26528
   112
wenzelm@26528
   113
(* stack operations *)
wenzelm@26528
   114
wenzelm@26528
   115
(*prepend x to the (reversed) body of the topmost pending element;
  an empty stack is not handled and raises Match*)
fun add x stack =
  (case stack of
    (elem, body) :: pending => (elem, x :: body) :: pending
  | [] => raise Match);
wenzelm@26528
   116
wenzelm@26528
   117
(*open a new pending element with empty body; an empty name is rejected
  as a bad element*)
fun push name atts pending =
  if name = "" then err_element ()
  else ((name, atts), []) :: pending;
wenzelm@26528
   119
wenzelm@26528
   120
(*close the topmost pending element: its body is put back into original
  order and the finished XML.Elem is added to the element below.  An entry
  with empty name on top means there is nothing left to close.  An empty
  stack is not handled and raises Match.*)
fun pop [] = raise Match
  | pop (((name, atts), body) :: pending) =
      if name = "" then err_unbalanced ""
      else add (XML.Elem ((name, atts), rev body)) pending;
wenzelm@26528
   122
wenzelm@26528
   123
wenzelm@26540
   124
(* parsing *)
wenzelm@26528
   125
wenzelm@26528
   126
(*split "name=value" at the first "="; a missing "=" or an empty name is
  a bad attribute*)
fun parse_attrib s =
  (case first_field "=" s of
    SOME (att as (a, _)) => if a = "" then err_attribute () else att
  | NONE => err_attribute ());
wenzelm@26528
   131
wenzelm@26540
   132
(*turn one chunk (list of Y-separated fields) into a stack operation:
  two empty fields close the current element, an empty first field followed
  by a name and attributes opens a new element, anything else is added as
  text nodes, one per field*)
fun parse_chunk chunk =
  (case chunk of
    ["", ""] => pop
  | "" :: name :: atts => push name (map parse_attrib atts)
  | txts => fold (fn txt => add (XML.Text txt)) txts);
wenzelm@26528
   135
wenzelm@26528
   136
in
wenzelm@26528
   137
wenzelm@26528
   138
(*parse YXML text into a list of trees: all chunks are folded over a stack
  that starts with a single entry of empty name; at the end only that entry
  may remain, and its reversed body is the result.  Any other entry left on
  top is reported as unbalanced element.*)
fun parse_body source =
  let
    val bottom = [(("", []), [])];
    val stack = fold parse_chunk (split_string source) bottom;
  in
    (case stack of
      [(("", _), result)] => rev result
    | ((name, _), _) :: _ => err_unbalanced name)
  end;
wenzelm@26528
   142
wenzelm@26528
   143
(*parse YXML text that denotes a single tree: an empty result becomes
  XML.Text "", more than one tree is an error*)
fun parse source =
  (case parse_body source of
    [] => XML.Text ""
  | [tree] => tree
  | _ :: _ :: _ => err "multiple results");
wenzelm@26528
   148
wenzelm@26528
   149
end;
wenzelm@26528
   150
wenzelm@26528
   151
end;
wenzelm@26528
   152