clasohm@0: (* Title: Pure/Thy/scan clasohm@0: ID: $Id$ clasohm@0: Author: Sonia Mahjoub / Tobias Nipkow clasohm@0: Copyright 1992 TU Muenchen clasohm@0: nipkow@213: modified December 1993 by Max Breitling (Type-variables added) nipkow@213: clasohm@0: The scanner. Adapted from Larry Paulson's ML for the Working Programmer. clasohm@0: *) clasohm@0: clasohm@0: signature LEXICAL = clasohm@0: sig clasohm@0: clasohm@0: clasohm@0: datatype token = Id of string clasohm@0: | Key of string clasohm@0: | Nat of string clasohm@0: | Stg of string clasohm@0: | Txt of string nipkow@213: | TypVar of string clasohm@0: clasohm@0: val scan : string list -> (token * int) list clasohm@0: end; clasohm@0: clasohm@0: signature KEYWORD = clasohm@0: sig clasohm@0: val alphas : string list clasohm@0: val symbols : string list clasohm@0: end; clasohm@0: clasohm@0: clasohm@0: functor LexicalFUN (Keyword: KEYWORD): LEXICAL = clasohm@0: clasohm@0: struct clasohm@0: clasohm@0: clasohm@0: clasohm@0: datatype token = Id of string clasohm@0: | Key of string clasohm@0: | Nat of string clasohm@0: | Stg of string nipkow@213: | Txt of string nipkow@213: | TypVar of string; clasohm@0: clasohm@0: clasohm@0: fun lexerr(n,s) = clasohm@0: error("Lexical error on line " ^ (string_of_int n) ^ ": " ^ s); clasohm@0: nipkow@213: val specials = explode"!{}@#$%^&*()+-=[]:\";,./?`_~<>|\\"; clasohm@0: clasohm@0: fun is_symbol c = "_" = c orelse "'" = c; clasohm@0: clasohm@0: fun alphanum (id, c::cs) = clasohm@0: if is_letter c orelse is_digit c orelse is_symbol c clasohm@0: then alphanum (id ^ c , cs) clasohm@0: else (id , c :: cs) clasohm@0: | alphanum (id ,[]) = (id ,[]); clasohm@0: clasohm@0: fun numeric (nat, c::cs) = clasohm@0: if is_digit c clasohm@0: then numeric (nat^c, cs) clasohm@0: else (nat, c::cs) clasohm@0: | numeric (nat, []) = (nat,[]); clasohm@0: clasohm@0: fun tokenof (a, n) = clasohm@0: if a mem Keyword.alphas clasohm@0: then (Key a, n) clasohm@0: else (Id a, n); clasohm@0: clasohm@0: fun symbolic (sy, c::cs) = clasohm@0: if (sy mem Keyword.symbols) andalso clasohm@0: not((sy^c) mem Keyword.symbols) clasohm@0: orelse not (c mem specials) clasohm@0: then (sy, c::cs) clasohm@0: else symbolic(sy^c, cs) clasohm@0: | symbolic (sy, []) = (sy, []); clasohm@0: clasohm@0: fun stringerr(n) = lexerr(n, "No matching quote found on this line"); clasohm@0: clasohm@0: fun is_control_chr ([],_,n) = stringerr(n) clasohm@0: | is_control_chr (c::cs,s,n) = clasohm@0: let val m = ord c clasohm@0: in if (m >= 64 andalso m <= 95) clasohm@0: then (cs, s^c, n) clasohm@0: else stringerr(n) clasohm@0: end; clasohm@0: clasohm@0: fun is_3_dgt (c1::c2::cs, c,n) = clasohm@0: let val s = c^c1^c2 clasohm@0: in if (s >= "000" andalso s <= "255") clasohm@0: then (cs, s) clasohm@0: else stringerr(n) clasohm@0: end clasohm@0: | is_3_dgt (_,_,n) = stringerr(n); clasohm@0: clasohm@0: fun is_imprt_seq ([],_,n) = stringerr(n) clasohm@0: | is_imprt_seq ((c::cs),s,n) = clasohm@0: if c = "\\" then (cs,s^"\\",n) clasohm@0: else if c = "\n" clasohm@0: then is_imprt_seq (cs,s^"\n",n+1) clasohm@0: else if (c = "\t") orelse (c = " ") clasohm@0: then is_imprt_seq (cs,s^c,n) clasohm@0: else stringerr(n); clasohm@0: clasohm@0: fun is_escape_seq ([],_,n) = stringerr(n) clasohm@0: | is_escape_seq ((c::cs),s,n) = clasohm@0: if c = "\\" then (cs,s^"\\",n) clasohm@0: else if c = "\n" clasohm@0: then is_imprt_seq (cs,s^"\n",n+1) clasohm@0: else if (c = "\t") orelse (c = " ") clasohm@0: then is_imprt_seq (cs,s^c,n) clasohm@0: else if c = "^" clasohm@0: then is_control_chr (cs,s^"^",n) clasohm@0: else if ("0" <= c andalso c <= "2") clasohm@0: then let val (cs',s') = clasohm@0: is_3_dgt(cs,c,n) clasohm@0: in (cs',s^s',n) clasohm@0: end clasohm@0: else stringerr(n); clasohm@0: clasohm@0: clasohm@0: fun string (_,[],_,n) = stringerr(n) clasohm@0: | string (toks, c::cs, s, n) = clasohm@0: if c = "\"" then ((Stg s, n)::toks , cs, n) clasohm@0: else if c = "\\" clasohm@0: then let val (cs',s',n') = clasohm@0: is_escape_seq (cs, s^"\\",n) clasohm@0: in string (toks,cs',s',n') end clasohm@0: else string (toks,cs,s^c,n); clasohm@0: clasohm@0: clasohm@0: fun comment ((c1::c2::cs), n) = clasohm@0: if c1 = "*" andalso c2 = ")" then (cs,n) else clasohm@0: if c1 = "\n" then comment((c2::cs), n+1) clasohm@0: else comment((c2::cs), n) clasohm@0: | comment (_, n) = lexerr(n, "Missing end of comment"); clasohm@0: clasohm@0: fun scanning (toks , [], n) = rev toks clasohm@0: | scanning (toks , c :: cs, n) = clasohm@0: if is_letter c clasohm@0: then let val (id , cs2) = alphanum (c , cs) clasohm@0: in if id = "ML" clasohm@0: then let val text = implode cs2 clasohm@0: in scanning ((Txt text,n) :: toks , [], n) clasohm@0: end clasohm@0: else scanning (tokenof(id,n) :: toks , cs2, n) clasohm@0: end clasohm@0: else if is_digit c clasohm@0: then let val (nat , cs2) = numeric(c , cs) clasohm@0: in scanning ((Nat nat,n) :: toks , cs2, n) end nipkow@213: nipkow@213: else if c = "'" andalso is_letter(hd cs) nipkow@213: then let val (var, cs2) = alphanum (hd cs, tl cs) nipkow@213: in scanning((TypVar (c^var),n) :: toks, cs2, n) end nipkow@213: clasohm@0: else if c mem specials clasohm@0: then if c = "\"" clasohm@0: then let val (toks', cs', n') = string (toks, cs, "", n) clasohm@0: in scanning (toks', cs', n') end clasohm@0: else let val (sy , cs2) = symbolic (c , cs) clasohm@0: in if sy = "(*" clasohm@0: then let val (cs3,n2) = comment(cs2,n) clasohm@0: in scanning (toks , cs3, n2) end clasohm@0: else scanning ((Key sy,n) :: toks, cs2, n) clasohm@0: end clasohm@0: else if c = "\n" then scanning (toks, cs, n+1) clasohm@0: else if c = " " orelse c = "\t" then scanning (toks , cs, n) clasohm@0: else lexerr(n,"Illegal character " ^ c); clasohm@0: clasohm@0: fun scan a = scanning ([] , a, 1); clasohm@0: clasohm@0: end;