22 val is_eof: symbol -> bool |
22 val is_eof: symbol -> bool |
23 val not_eof: symbol -> bool |
23 val not_eof: symbol -> bool |
24 val stopper: symbol Scan.stopper |
24 val stopper: symbol Scan.stopper |
25 val sync: symbol |
25 val sync: symbol |
26 val is_sync: symbol -> bool |
26 val is_sync: symbol -> bool |
27 val malformed: symbol |
|
28 val end_malformed: symbol |
|
29 val separate_chars: string -> string |
|
30 val is_regular: symbol -> bool |
27 val is_regular: symbol -> bool |
|
28 val is_malformed: symbol -> bool |
|
29 val malformed_msg: symbol -> string |
31 val is_ascii: symbol -> bool |
30 val is_ascii: symbol -> bool |
32 val is_ascii_letter: symbol -> bool |
31 val is_ascii_letter: symbol -> bool |
33 val is_ascii_digit: symbol -> bool |
32 val is_ascii_digit: symbol -> bool |
34 val is_ascii_hex: symbol -> bool |
33 val is_ascii_hex: symbol -> bool |
35 val is_ascii_quasi: symbol -> bool |
34 val is_ascii_quasi: symbol -> bool |
40 val to_ascii_lower: symbol -> symbol |
39 val to_ascii_lower: symbol -> symbol |
41 val to_ascii_upper: symbol -> symbol |
40 val to_ascii_upper: symbol -> symbol |
42 val is_raw: symbol -> bool |
41 val is_raw: symbol -> bool |
43 val decode_raw: symbol -> string |
42 val decode_raw: symbol -> string |
44 val encode_raw: string -> string |
43 val encode_raw: string -> string |
45 datatype sym = Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string |
44 datatype sym = |
|
45 Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string | |
|
46 Malformed of string |
46 val decode: symbol -> sym |
47 val decode: symbol -> sym |
47 datatype kind = Letter | Digit | Quasi | Blank | Other |
48 datatype kind = Letter | Digit | Quasi | Blank | Other |
48 val kind: symbol -> kind |
49 val kind: symbol -> kind |
49 val is_letter: symbol -> bool |
50 val is_letter: symbol -> bool |
50 val is_digit: symbol -> bool |
51 val is_digit: symbol -> bool |
54 val is_letdig: symbol -> bool |
55 val is_letdig: symbol -> bool |
55 val is_ident: symbol list -> bool |
56 val is_ident: symbol list -> bool |
56 val beginning: int -> symbol list -> string |
57 val beginning: int -> symbol list -> string |
57 val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a |
58 val scanner: string -> (string list -> 'a * string list) -> symbol list -> 'a |
58 val scan_id: string list -> string * string list |
59 val scan_id: string list -> string * string list |
59 val source: {do_recover: bool} -> (string, 'a) Source.source -> |
60 val source: (string, 'a) Source.source -> (symbol, (string, 'a) Source.source) Source.source |
60 (symbol, (string, 'a) Source.source) Source.source |
|
61 val explode: string -> symbol list |
61 val explode: string -> symbol list |
62 val esc: symbol -> string |
62 val esc: symbol -> string |
63 val escape: string -> string |
63 val escape: string -> string |
64 val strip_blanks: string -> string |
64 val strip_blanks: string -> string |
65 val bump_init: string -> string |
65 val bump_init: string -> string |
109 fun is_char s = size s = 1; |
109 fun is_char s = size s = 1; |
110 |
110 |
111 fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s; |
111 fun is_utf8 s = size s > 0 andalso forall_string (fn c => ord c >= 128) s; |
112 |
112 |
113 fun is_symbolic s = |
113 fun is_symbolic s = |
114 String.isPrefix "\\<" s andalso not (String.isPrefix "\\<^" s); |
114 String.isPrefix "\\<" s andalso String.isSuffix ">" s andalso not (String.isPrefix "\\<^" s); |
115 |
115 |
116 fun is_printable s = |
116 fun is_printable s = |
117 if is_char s then ord space <= ord s andalso ord s <= ord "~" |
117 if is_char s then ord space <= ord s andalso ord s <= ord "~" |
118 else is_utf8 s orelse not (String.isPrefix "\\<^" s); |
118 else is_utf8 s orelse is_symbolic s; |
119 |
119 |
120 |
120 |
121 (* input source control *) |
121 (* input source control *) |
122 |
122 |
123 val eof = ""; |
123 val eof = ""; |
126 val stopper = Scan.stopper (K eof) is_eof; |
126 val stopper = Scan.stopper (K eof) is_eof; |
127 |
127 |
128 val sync = "\\<^sync>"; |
128 val sync = "\\<^sync>"; |
129 fun is_sync s = s = sync; |
129 fun is_sync s = s = sync; |
130 |
130 |
131 val malformed = "[["; |
131 fun is_regular s = not_eof s andalso s <> sync; |
132 val end_malformed = "]]"; |
132 |
133 |
133 fun is_malformed s = String.isPrefix "\\<" s andalso not (String.isSuffix ">" s); |
134 val separate_chars = explode #> space_implode space; |
134 fun malformed_msg s = "Malformed symbolic character: " ^ quote s; |
135 fun malformed_msg s = "Malformed symbolic character: " ^ quote (separate_chars s); |
|
136 |
|
137 fun is_regular s = |
|
138 not_eof s andalso s <> sync andalso s <> malformed andalso s <> end_malformed; |
|
139 |
135 |
140 |
136 |
141 (* ascii symbols *) |
137 (* ascii symbols *) |
142 |
138 |
143 fun is_ascii s = is_char s andalso ord s < 128; |
139 fun is_ascii s = is_char s andalso ord s < 128; |
223 |
219 |
224 |
220 |
225 (* symbol variants *) |
221 (* symbol variants *) |
226 |
222 |
227 datatype sym = |
223 datatype sym = |
228 Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string; |
224 Char of string | UTF8 of string | Sym of string | Ctrl of string | Raw of string | |
|
225 Malformed of string; |
229 |
226 |
230 fun decode s = |
227 fun decode s = |
231 if is_char s then Char s |
228 if is_char s then Char s |
232 else if is_utf8 s then UTF8 s |
229 else if is_utf8 s then UTF8 s |
233 else if is_raw s then Raw (decode_raw s) |
230 else if is_raw s then Raw (decode_raw s) |
|
231 else if is_malformed s then Malformed s |
234 else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4)) |
232 else if String.isPrefix "\\<^" s then Ctrl (String.substring (s, 3, size s - 4)) |
235 else if String.isPrefix "\\<" s then Sym (String.substring (s, 2, size s - 3)) |
233 else Sym (String.substring (s, 2, size s - 3)); |
236 else error (malformed_msg s); |
|
237 |
234 |
238 |
235 |
239 (* standard symbol kinds *) |
236 (* standard symbol kinds *) |
240 |
237 |
241 datatype kind = Letter | Digit | Quasi | Blank | Other; |
238 datatype kind = Letter | Digit | Quasi | Blank | Other; |
443 |
440 |
444 val scan_raw = |
441 val scan_raw = |
445 Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) || |
442 Scan.this_string "raw:" ^^ (Scan.many raw_chr >> implode) || |
446 Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode); |
443 Scan.this_string "raw" ^^ (Scan.many1 is_ascii_digit >> implode); |
447 |
444 |
448 val scan = |
445 val scan_total = |
449 Scan.one is_plain || |
446 Scan.one is_plain || |
450 Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 || |
447 Scan.one is_utf8 ::: Scan.many is_utf8_trailer >> implode_pseudo_utf8 || |
451 scan_encoded_newline || |
448 scan_encoded_newline || |
452 ($$ "\\" ^^ $$ "<" ^^ |
449 ($$ "\\" ^^ $$ "<" ^^ (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) || |
453 !! (fn (cs, _) => malformed_msg (beginning 10 ("\\" :: "<" :: cs))) |
450 Scan.this_string "\\<^" || |
454 (($$ "^" ^^ (scan_raw || scan_id) || scan_id) ^^ $$ ">")) || |
451 Scan.this_string "\\<" || |
455 Scan.one not_eof; |
452 Scan.one not_eof; |
456 |
453 |
457 val scan_resync = |
|
458 Scan.one is_ascii_blank || $$ "\"" || $$ "`" || $$ "\\" || |
|
459 Scan.this_string "(*" || Scan.this_string "*)" || |
|
460 Scan.this_string "{*" || Scan.this_string "*}"; |
|
461 |
|
462 val recover = |
|
463 Scan.this ["\\", "<"] @@@ |
|
464 Scan.repeat (Scan.unless scan_resync (Scan.one not_eof)) |
|
465 >> (fn ss => malformed :: ss @ [end_malformed]); |
|
466 |
|
467 in |
454 in |
468 |
455 |
469 fun source {do_recover} src = |
456 fun source src = Source.source stopper (Scan.bulk scan_total) NONE src; |
470 Source.source stopper (Scan.bulk scan) |
|
471 (if do_recover then SOME (false, K recover) else NONE) src; |
|
472 |
457 |
473 end; |
458 end; |
474 |
459 |
475 |
460 |
476 (* explode *) |
461 (* explode *) |