# HG changeset patch
# User wenzelm
# Date 1392408380 -3600
# Node ID c0f8aebfb43dbffa6d199e3fa3c7b2722dda9e1d
# Parent  5d2835453ad3ee422a2312a318ce746f5151df0a
lexical syntax for SML (in Scala);
tuned;

diff -r 5d2835453ad3 -r c0f8aebfb43d src/Pure/General/scan.scala
--- a/src/Pure/General/scan.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/General/scan.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -43,9 +43,9 @@
     p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
 
 
-  /* symbol range */
+  /* repeated symbols */
 
-  def symbol_range(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
+  def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
     new Parser[String]
     {
       def apply(in: Input) =
@@ -66,16 +66,22 @@
         if (count < min_count) Failure("bad input", in)
         else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
       }
-    }.named("symbol_range")
+    }.named("repeated")
 
   def one(pred: Symbol.Symbol => Boolean): Parser[String] =
-    symbol_range(pred, 1, 1)
+    repeated(pred, 1, 1)
 
   def many(pred: Symbol.Symbol => Boolean): Parser[String] =
-    symbol_range(pred, 0, Integer.MAX_VALUE)
+    repeated(pred, 0, Integer.MAX_VALUE)
 
   def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
-    symbol_range(pred, 1, Integer.MAX_VALUE)
+    repeated(pred, 1, Integer.MAX_VALUE)
+
+
+  /* character */
+
+  def character(pred: Char => Boolean): Symbol.Symbol => Boolean =
+    (s: Symbol.Symbol) => s.length == 1 && pred(s.charAt(0))
 
 
   /* quoted strings */
@@ -279,7 +285,7 @@
 
   /* keyword */
 
-  def keyword(lexicon: Lexicon): Parser[String] = new Parser[String]
+  def literal(lexicon: Lexicon): Parser[String] = new Parser[String]
   {
     def apply(in: Input) =
     {
diff -r 5d2835453ad3 -r c0f8aebfb43d src/Pure/General/symbol.scala
--- a/src/Pure/General/symbol.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/General/symbol.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -19,9 +19,16 @@
   /* ASCII characters */
 
   def is_ascii_letter(c: Char): Boolean = 'A' <= c && c <= 'Z' || 'a' <= c && c <= 'z'
+
   def is_ascii_digit(c: Char): Boolean = '0' <= c && c <= '9'
+
+  def is_ascii_hex(c: Char): Boolean =
+    '0' <= c && c <= '9' || 'A' <= c && c <= 'F' || 'a' <= c && c <= 'f'
+
   def is_ascii_quasi(c: Char): Boolean = c == '_' || c == '\''
 
+  def is_ascii_blank(c: Char): Boolean = " \t\n\u000b\f\r".contains(c)
+
   def is_ascii_letdig(c: Char): Boolean =
     is_ascii_letter(c) || is_ascii_digit(c) || is_ascii_quasi(c)
 
diff -r 5d2835453ad3 -r c0f8aebfb43d src/Pure/Isar/completion.scala
--- a/src/Pure/Isar/completion.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/Isar/completion.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -185,7 +185,7 @@
     line: CharSequence): Option[Completion.Result] =
   {
     val raw_result =
-      Scan.Parsers.parse(Scan.Parsers.keyword(abbrevs_lex), new Library.Reverse(line)) match {
+      Scan.Parsers.parse(Scan.Parsers.literal(abbrevs_lex), new Library.Reverse(line)) match {
         case Scan.Parsers.Success(reverse_a, _) =>
           val abbrevs = abbrevs_map.get_list(reverse_a)
           abbrevs match {
diff -r 5d2835453ad3 -r c0f8aebfb43d src/Pure/Isar/token.scala
--- a/src/Pure/Isar/token.scala	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/Isar/token.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -81,7 +81,7 @@
       (x => Token(Token.Kind.SYM_IDENT, x))
 
     val command_keyword =
-      keyword(lexicon) ^^
+      literal(lexicon) ^^
       (x => Token(if (is_command(x)) Token.Kind.COMMAND else Token.Kind.KEYWORD, x))
 
     val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))
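Aside (not part of the changeset): the renamed scan.scala combinators follow a simple contract that can be checked in isolation. The sketch below reduces repeated/one/many1 to plain Char input, using only the scala.util.parsing library that scan.scala itself imports; the object and helper names are hypothetical demo code, and the real Scan.Parsers works on Isabelle symbols rather than characters.

import scala.util.parsing.combinator.Parsers
import scala.util.parsing.input.CharSequenceReader

object Repeated_Demo extends Parsers
{
  type Elem = Char

  // same contract as Scan.repeated: consume at most max_count characters
  // satisfying pred; fail unless at least min_count were consumed
  def repeated(pred: Char => Boolean, min_count: Int, max_count: Int): Parser[String] =
    Parser { in =>
      var rest = in
      val result = new StringBuilder
      while (result.length < max_count && !rest.atEnd && pred(rest.first)) {
        result += rest.first
        rest = rest.rest
      }
      if (result.length < min_count) Failure("bad input", in)
      else Success(result.toString, rest)
    }

  def one(pred: Char => Boolean): Parser[String] = repeated(pred, 1, 1)
  def many1(pred: Char => Boolean): Parser[String] = repeated(pred, 1, Integer.MAX_VALUE)

  def main(args: Array[String]): Unit =
  {
    // three digits consumed, "abc" left over
    println(many1(c => '0' <= c && c <= '9')(new CharSequenceReader("123abc")))
  }
}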
diff -r 5d2835453ad3 -r c0f8aebfb43d src/Pure/ML/ml_lex.scala
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/Pure/ML/ml_lex.scala	Fri Feb 14 21:06:20 2014 +0100
@@ -0,0 +1,162 @@
+/*  Title:      Pure/ML/ml_lex.scala
+    Author:     Makarius
+
+Lexical syntax for SML.
+*/
+
+package isabelle
+
+
+import scala.util.parsing.input.{Reader, CharSequenceReader}
+
+
+object ML_Lex
+{
+  /** tokens **/
+
+  object Kind extends Enumeration
+  {
+    val KEYWORD = Value("keyword")
+    val IDENT = Value("identifier")
+    val LONG_IDENT = Value("long identifier")
+    val TYPE_VAR = Value("type variable")
+    val WORD = Value("word")
+    val INT = Value("integer")
+    val REAL = Value("real")
+    val CHAR = Value("character")
+    val STRING = Value("quoted string")
+    val SPACE = Value("white space")
+    val COMMENT = Value("comment text")
+    val ERROR = Value("bad input")
+  }
+
+  sealed case class Token(val kind: Kind.Value, val source: String)
+
+
+
+  /** parsers **/
+
+  private val lexicon =
+    Scan.Lexicon("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=",
+      "=>", "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
+      "case", "datatype", "do", "else", "end", "eqtype", "exception", "fn",
+      "fun", "functor", "handle", "if", "in", "include", "infix", "infixr",
+      "let", "local", "nonfix", "of", "op", "open", "orelse", "raise", "rec",
+      "sharing", "sig", "signature", "struct", "structure", "then", "type",
+      "val", "where", "while", "with", "withtype")
+
+  private object Parsers extends Scan.Parsers
+  {
+    /* string material */
+
+    private val blanks1 = many1(character(Symbol.is_ascii_blank))
+
+    private val escape =
+      one(character("\"\\abtnvfr".contains(_))) |
+      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
+      repeated(character(Symbol.is_ascii_digit), 3, 3)
+
+    private val str =
+      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
+      "\\" ~ escape ^^ { case x ~ y => x + y }
+
+    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
+    private val gaps = rep(gap) ^^ (_.mkString)
+
+
+    /* delimited token */
+
+    private def delimited_token: Parser[Token] =
+    {
+      val char =
+        "#\"" ~ gaps ~ str ~ gaps ~ "\"" ^^
+        { case u ~ v ~ x ~ y ~ z => Token(Kind.CHAR, u + v + x + y + z) }
+
+      val string =
+        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ~ "\"" ^^
+        { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
+
+      val cmt = comment ^^ (x => Token(Kind.COMMENT, x))
+
+      char | (string | cmt)
+    }
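Aside (not part of the changeset): the delimited-token rules above accept SML character and string literals with escapes and backslash line gaps, plus (* ... *) comments via the comment parser inherited from Scan.Parsers. A hypothetical test (demo object name invented; assumes the completed ML_Lex of this changeset is on the classpath):

object Delimited_Demo
{
  def main(args: Array[String]): Unit =
  {
    // an escaped character, a string with a \...\ gap, and a comment
    val inputs = List("#\"\\n\"", "\"foo\\\n  \\bar\"", "(* comment *)")
    for (s <- inputs; t <- isabelle.ML_Lex.tokenize(s))
      println(t.kind.toString + ": " + t.source)
    // expected kinds: character, quoted string, comment text
  }
}

The remaining rules of the new file continue below.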
+
+    private def other_token: Parser[Token] =
+    {
+      /* identifiers */
+
+      val letdigs = many(character(Symbol.is_ascii_letdig))
+
+      val alphanumeric =
+        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }
+
+      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))
+
+      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))
+
+      val long_ident =
+        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
+          (alphanumeric | (symbolic | "=")) ^^
+          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }
+
+      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }
+
+
+      /* numerals */
+
+      val dec = many1(character(Symbol.is_ascii_digit))
+      val hex = many1(character(Symbol.is_ascii_hex))
+      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
+      val decint = sign ~ dec ^^ { case x ~ y => x + y }
+      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }
+
+      val word =
+        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
+        (x => Token(Kind.WORD, x))
+
+      val int =
+        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
+        { case x ~ y => Token(Kind.INT, x + y) }
+
+      val real =
+        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
+          { case x ~ y ~ z ~ w => x + y + z + w } |
+         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
+
+
+      /* recover delimited */
+
+      val recover_char =
+        "#\"" ~ gaps ~ (opt(str ~ gaps) ^^ { case Some(x ~ y) => x + y case None => "" }) ^^
+        { case x ~ y ~ z => x + y + z }
+
+      val recover_string =
+        "\"" ~ (rep(gap | str) ^^ (_.mkString)) ^^ { case x ~ y => x + y }
+
+      val recover_delimited =
+        (recover_char | (recover_string | recover_comment)) ^^ (x => Token(Kind.ERROR, x))
+
+
+      /* token */
+
+      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))
+
+      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))
+
+      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))
+
+      space | (recover_delimited |
+        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))
+    }
+
+    def token: Parser[Token] = delimited_token | other_token
+  }
+
+  def tokenize(input: CharSequence): List[Token] =
+  {
+    Parsers.parseAll(Parsers.rep(Parsers.token), new CharSequenceReader(input)) match {
+      case Parsers.Success(tokens, _) => tokens
+      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
+    }
+  }
+}
+
diff -r 5d2835453ad3 -r c0f8aebfb43d src/Pure/build-jars
--- a/src/Pure/build-jars	Fri Feb 14 20:58:48 2014 +0100
+++ b/src/Pure/build-jars	Fri Feb 14 21:06:20 2014 +0100
@@ -43,6 +43,7 @@
   Isar/outer_syntax.scala
   Isar/parse.scala
   Isar/token.scala
+  ML/ml_lex.scala
   PIDE/command.scala
   PIDE/document.scala
   PIDE/document_id.scala
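Aside (not part of the changeset): end-to-end, ML_Lex.tokenize either returns the token list or signals an error for input that cannot be tokenized. A hypothetical driver (demo object name invented; assumes Pure.jar built with this changeset on the classpath):

object ML_Lex_Demo
{
  def main(args: Array[String]): Unit =
  {
    val source = "fun fac 0 = 1 | fac n = n * fac (n - 1)"
    for (t <- isabelle.ML_Lex.tokenize(source))
      println(t.kind.toString + ": " + t.source)
    // prints e.g. keyword: fun, identifier: fac, integer: 0, ...
  }
}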