--- a/src/Pure/ML/ml_lex.ML Sun Sep 16 14:52:29 2007 +0200
+++ b/src/Pure/ML/ml_lex.ML Sun Sep 16 14:52:31 2007 +0200
@@ -8,23 +8,18 @@
signature ML_LEX =
sig
datatype token_kind =
- Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
+ Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
Space | Comment | Error of string | EOF
eqtype token
- val str_of_kind: token_kind -> string
val stopper: token * (token -> bool)
- val not_eof: token -> bool
+ val is_regular: token -> bool
+ val is_improper: token -> bool
+ val pos_of: token -> string
val kind_of: token -> token_kind
- val is_kind: token_kind -> token -> bool
- val is_proper: token -> bool
val val_of: token -> string
- val !!! : string -> (int * 'a -> 'b) -> int * 'a -> 'b
val keywords: string list
- val scan: int * string list -> token * (int * string list)
- val source: bool option -> (string, 'a) Source.source ->
- (token, int * (string, 'a) Source.source) Source.source
- val source_proper: (token, 'a) Source.source ->
- (token, (token, 'a) Source.source) Source.source
+ val source: (Symbol.symbol, 'a) Source.source ->
+ (token, int * (Symbol.symbol, 'a) Source.source) Source.source
end;
structure ML_Lex: ML_LEX =
@@ -35,29 +30,13 @@
(* datatype token *)
datatype token_kind =
- Keyword | Ident | LongIdent | TypeVar | Selector | Word | Int | Real | Char | String |
+ Keyword | Ident | LongIdent | TypeVar | Word | Int | Real | Char | String |
Space | Comment | Error of string | EOF;
datatype token = Token of int * (token_kind * string);
-val str_of_kind =
- fn Keyword => "keyword"
- | Ident => "identifier"
- | LongIdent => "long identifier"
- | TypeVar => "type variable"
- | Selector => "record selector"
- | Word => "word"
- | Int => "integer"
- | Real => "real"
- | Char => "character"
- | String => "string"
- | Space => "white space"
- | Comment => "comment"
- | Error _ => "bad input"
- | EOF => "end-of-file";
-
-(* end-of-file *)
+(* control tokens *)
val eof = Token (0, (EOF, ""));
@@ -65,22 +44,22 @@
| is_eof _ = false;
val stopper = (eof, is_eof);
-val not_eof = not o is_eof;
-(* token kind *)
+fun is_regular (Token (_, (Error _, _))) = false
+ | is_regular (Token (_, (EOF, _))) = false
+ | is_regular _ = true;
+
+fun is_improper (Token (_, (Space, _))) = true
+ | is_improper (Token (_, (Comment, _))) = true
+ | is_improper _ = false;
+
+
+(* token content *)
+
+fun pos_of (Token (n, _)) = " (line " ^ string_of_int n ^ ")";
fun kind_of (Token (_, (k, _))) = k;
-
-fun is_kind k (Token (_, (k', _))) = k = k';
-
-fun is_proper (Token (_, (Space, _))) = false
- | is_proper (Token (_, (Comment, _))) = false
- | is_proper _ = true;
-
-
-(* token value *)
-
fun val_of (Token (_, (_, x))) = x;
fun token_leq (Token (_, (_, x)), Token (_, (_, x'))) = x <= x';
@@ -97,7 +76,7 @@
(* line numbering *)
-fun incr_line scan = Scan.depend (fn n => scan >> pair (n + 1));
+fun incr_line scan = Scan.depend (fn (n: int) => scan >> pair (n + 1));
val keep_line = Scan.lift;
val scan_blank =
@@ -123,6 +102,8 @@
(* identifiers *)
+local
+
val scan_letdigs =
Scan.many (Symbol.is_ascii_letter orf Symbol.is_ascii_digit orf Symbol.is_ascii_quasi) >> implode;
@@ -130,6 +111,8 @@
val scan_symbolic = Scan.many1 (member (op =) (explode "!#$%&*+-/:<=>?@\\^`|~")) >> implode;
+in
+
val scan_ident = scan_alphanumeric || scan_symbolic;
val scan_longident =
@@ -137,24 +120,20 @@
val scan_typevar = $$ "'" ^^ scan_letdigs;
-
-(* selectors -- not fully standard conformant *)
-
-val scan_numeric =
- Scan.one (member (op =) (explode "123456789")) ^^ (Scan.many Symbol.is_ascii_digit >> implode);
-
-val scan_selector =
- keep_line ($$ "#") ^^ scan_blanks ^^
- !!! "malformed record selector" (keep_line (scan_numeric || scan_alphanumeric || scan_symbolic));
+end;
(* numerals *)
+local
+
val scan_dec = Scan.many1 Symbol.is_ascii_digit >> implode;
val scan_hex = Scan.many1 Symbol.is_ascii_hex >> implode;
val scan_sign = Scan.optional ($$ "~") "";
val scan_decint = scan_sign ^^ scan_dec;
+in
+
val scan_word = Scan.this_string "0wx" ^^ scan_hex || Scan.this_string "0w" ^^ scan_dec;
val scan_int = scan_sign ^^ (Scan.this_string "0x" ^^ scan_hex || scan_dec);
@@ -165,40 +144,59 @@
scan_decint ^^ $$ "." ^^ scan_dec ^^ Scan.optional scan_exp "" ||
scan_decint ^^ scan_exp;
+end;
+
(* chars and strings *)
+local
+
+val scan_escape =
+ Scan.one (member (op =) (explode "\"\\abtnvfr")) ||
+ $$ "^" ^^ Scan.one (fn s => ord "@" <= ord s andalso ord s <= ord "_") ||
+ Scan.one Symbol.is_ascii_digit ^^
+ Scan.one Symbol.is_ascii_digit ^^
+ Scan.one Symbol.is_ascii_digit;
+
+val scan_str =
+ keep_line (Scan.one (fn s => Symbol.is_printable s andalso s <> "\"" andalso s <> "\\")) ||
+ keep_line ($$ "\\") ^^ !!! "bad escape character in string" (keep_line scan_escape);
+
val scan_gap = keep_line ($$ "\\") ^^ scan_blanks1 ^^ keep_line ($$ "\\");
val scan_gaps = Scan.repeat scan_gap >> implode;
-val scan_str =
- scan_blank ||
- scan_gap ||
- keep_line ($$ "\\") |-- !!! "bad escape character in string"
- (scan_blank || keep_line ($$ "\"" || $$ "\\")) ||
- keep_line (Scan.one (fn s => s <> "\"" andalso s <> "\\" andalso Symbol.is_regular s));
+in
val scan_char =
keep_line ($$ "#" ^^ $$ "\"") ^^ scan_gaps ^^ scan_str ^^ scan_gaps ^^ keep_line ($$ "\"");
val scan_string =
keep_line ($$ "\"") ^^
- !!! "missing quote at end of string" ((Scan.repeat scan_str >> implode) ^^ keep_line ($$ "\""));
+ !!! "missing quote at end of string"
+ ((Scan.repeat (scan_gap || scan_str) >> implode) ^^ keep_line ($$ "\""));
+
+end;
(* scan nested comments *)
+local
+
val scan_cmt =
Scan.lift scan_blank ||
- Scan.depend (fn d => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
+ Scan.depend (fn (d: int) => keep_line ($$ "(" ^^ $$ "*") >> pair (d + 1)) ||
Scan.depend (fn 0 => Scan.fail | d => keep_line ($$ "*" ^^ $$ ")") >> pair (d - 1)) ||
Scan.lift (keep_line ($$ "*" --| Scan.ahead (~$$ ")"))) ||
Scan.lift (keep_line (Scan.one (fn s => s <> "*" andalso Symbol.is_regular s)));
+in
+
val scan_comment =
keep_line ($$ "(" ^^ $$ "*") ^^
- !!! "missing end of comment"
- (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
+ !!! "missing end of comment"
+ (Scan.pass 0 (Scan.repeat scan_cmt >> implode) ^^ keep_line ($$ "*" ^^ $$ ")"));
+
+end;
(* scan token *)
@@ -210,23 +208,22 @@
fun token k x = Token (n, (k, x));
in
scan_char >> token Char ||
- scan_selector >> token Selector ||
scan_string >> token String ||
scan_blanks1 >> token Space ||
scan_comment >> token Comment ||
keep_line (Scan.max token_leq
(scan_keyword >> token Keyword)
- (scan_longident >> token LongIdent ||
+ (scan_word >> token Word ||
+ scan_real >> token Real ||
+ scan_int >> token Int ||
+ scan_longident >> token LongIdent ||
scan_ident >> token Ident ||
- scan_typevar >> token TypeVar ||
- scan_real >> token Real ||
- scan_word >> token Word ||
- scan_int >> token Int))
+ scan_typevar >> token TypeVar))
end);
in !! (lex_err (fn cs => "bad input " ^ quote (Symbol.beginning 10 cs))) scanner end;
-(* token sources *)
+(* token source *)
local
@@ -237,12 +234,10 @@
in
-fun source do_recover src =
- Source.source' 1 Symbol.stopper (Scan.bulk scan) (Option.map (rpair recover) do_recover) src;
+fun source src =
+ Source.source' 1 Symbol.stopper (Scan.bulk scan) (SOME (false, recover)) src;
end;
-fun source_proper src = src |> Source.filter is_proper;
+end;
-
-end;