/*  Title:      Pure/Isar/token.scala
    Author:     Makarius

Outer token syntax for Isabelle/Isar.
*/

package isabelle


import scala.collection.mutable
import scala.util.parsing.input


object Token
{
  /* tokens */

  // NOTE(review): Scala 2 Enumeration; the string given to Value is the
  // human-readable kind name used in messages.
  object Kind extends Enumeration
  {
    /*immediate source*/
    val COMMAND = Value("command")
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val SYM_IDENT = Value("symbolic identifier")
    val VAR = Value("schematic variable")
    val TYPE_IDENT = Value("type variable")
    val TYPE_VAR = Value("schematic type variable")
    val NAT = Value("natural number")
    val FLOAT = Value("floating-point number")
    val SPACE = Value("white space")
    /*delimited content*/
    val STRING = Value("string")
    val ALT_STRING = Value("back-quoted string")
    val VERBATIM = Value("verbatim text")
    val CARTOUCHE = Value("text cartouche")
    val COMMENT = Value("comment text")
    /*special content*/
    val ERROR = Value("bad input")
    val UNPARSED = Value("unparsed input")
  }


  /* parsers */

  object Parsers extends Parsers

  trait Parsers extends Scan.Parsers
  {
    /* tokens with explicit delimiters: strings, verbatim text, cartouches, comments */
    private def delimited_token: Parser[Token] =
    {
      val string = quoted("\"") ^^ (x => Token(Token.Kind.STRING, x))
      val alt_string = quoted("`") ^^ (x => Token(Token.Kind.ALT_STRING, x))
      val verb = verbatim ^^ (x => Token(Token.Kind.VERBATIM, x))
      val cart = cartouche ^^ (x => Token(Token.Kind.CARTOUCHE, x))
      val cmt = comment ^^ (x => Token(Token.Kind.COMMENT, x))

      string | (alt_string | (verb | (cart | cmt)))
    }

    /* all remaining token forms; "keywords" distinguishes minor keywords
       from major commands */
    private def other_token(keywords: Keyword.Keywords): Parser[Token] =
    {
      val letdigs1 = many1(Symbol.is_letdig)
      val sub = one(s => s == Symbol.sub_decoded || s == "\\<^sub>")
      val id =
        one(Symbol.is_letter) ~
          (rep(letdigs1 | (sub ~ letdigs1 ^^ { case x ~ y => x + y })) ^^ (_.mkString)) ^^
        { case x ~ y => x + y }

      val nat = many1(Symbol.is_digit)
      val natdot = nat ~ "." ~ nat ^^ { case x ~ y ~ z => x + y + z }
      val id_nat = id ~ opt("." ~ nat) ^^ { case x ~ Some(y ~ z) => x + y + z case x ~ None => x }

      val ident = id ~ rep("." ~> id) ^^
        { case x ~ Nil => Token(Token.Kind.IDENT, x)
          case x ~ ys => Token(Token.Kind.LONG_IDENT, (x :: ys).mkString(".")) }

      val var_ = "?" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.VAR, x + y) }
      val type_ident = "'" ~ id ^^ { case x ~ y => Token(Token.Kind.TYPE_IDENT, x + y) }
      val type_var = "?'" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.TYPE_VAR, x + y) }
      val nat_ = nat ^^ (x => Token(Token.Kind.NAT, x))
      val float =
        ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))

      val sym_ident =
        (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^
        (x => Token(Token.Kind.SYM_IDENT, x))

      // ||| = longest-match alternative: prefer the longer of minor/major keyword
      val keyword =
        literal(keywords.minor) ^^ (x => Token(Token.Kind.KEYWORD, x)) |||
        literal(keywords.major) ^^ (x => Token(Token.Kind.COMMAND, x))

      val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))

      // unterminated delimited material is retained as an ERROR token instead of
      // failing the whole parse
      val recover_delimited =
        (recover_quoted("\"") |
          (recover_quoted("`") |
            (recover_verbatim |
              (recover_cartouche | recover_comment)))) ^^ (x => Token(Token.Kind.ERROR, x))

      // last resort: consume a single symbol as bad input
      val bad = one(_ => true) ^^ (x => Token(Token.Kind.ERROR, x))

      space | (recover_delimited |
        (((ident | (var_ | (type_ident | (type_var | (float | (nat_ | sym_ident)))))) |||
          keyword) | bad))
    }

    def token(keywords: Keyword.Keywords): Parser[Token] =
      delimited_token | other_token(keywords)

    /* single-line variant: threads a Scan.Line_Context so that delimited tokens
       may span line boundaries */
    def token_line(keywords: Keyword.Keywords, ctxt: Scan.Line_Context)
      : Parser[(Token, Scan.Line_Context)] =
    {
      val string =
        quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Token.Kind.STRING, x), c) }
      val alt_string =
        quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Token.Kind.ALT_STRING, x), c) }
      val verb = verbatim_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.VERBATIM, x), c) }
      val cart = cartouche_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.CARTOUCHE, x), c) }
      val cmt = comment_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.COMMENT, x), c) }
      val other = other_token(keywords) ^^ { case x => (x, Scan.Finished) }

      string | (alt_string | (verb | (cart | (cmt | other))))
    }
  }


  /* explode */

  // tokenize complete input; any residual failure is a hard error
  def explode(keywords: Keyword.Keywords, inp: CharSequence): List[Token] =
  {
    val in: input.Reader[Char] = new input.CharSequenceReader(inp)
    Parsers.parseAll(Parsers.rep(Parsers.token(keywords)), in) match {
      case Parsers.Success(tokens, _) => tokens
      case _ => error("Unexpected failure of tokenizing input:\n" + inp.toString)
    }
  }

  // tokenize one line, resuming from (and returning) a delimiter context
  def explode_line(keywords: Keyword.Keywords, inp: CharSequence, context: Scan.Line_Context)
    : (List[Token], Scan.Line_Context) =
  {
    var in: input.Reader[Char] = new input.CharSequenceReader(inp)
    val toks = new mutable.ListBuffer[Token]
    var ctxt = context
    while (!in.atEnd) {
      Parsers.parse(Parsers.token_line(keywords, ctxt), in) match {
        case Parsers.Success((x, c), rest) => { toks += x; ctxt = c; in = rest }
        case Parsers.NoSuccess(_, rest) =>
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
      }
    }
    (toks.toList, ctxt)
  }


  /* token reader */

  object Pos
  {
    val none: Pos = new Pos(0, 0, "", "")
    val start: Pos = new Pos(1, 1, "", "")
    def file(file: String): Pos = new Pos(1, 1, file, "")
    def id(id: String): Pos = new Pos(0, 1, "", id)
  }

  // token position: line/offset counters (0 = absent), optional file and id;
  // constructor is private, use the Pos companion factories above
  final class Pos private[Token](
      val line: Int,
      val offset: Symbol.Offset,
      val file: String,
      val id: String)
    extends scala.util.parsing.input.Position
  {
    def column = 0
    def lineContents = ""

    // advance over the symbols of a token; counters at 0 stay disabled
    def advance(token: Token): Pos =
    {
      var line1 = line
      var offset1 = offset
      for (s <- Symbol.iterator(token.source)) {
        if (line1 > 0 && Symbol.is_newline(s)) line1 += 1
        if (offset1 > 0) offset1 += 1
      }
      if (line1 == line && offset1 == offset) this
      else new Pos(line1, offset1, file, id)
    }

    // only non-zero/non-empty components contribute properties
    private def position(end_offset: Symbol.Offset): Position.T =
      (if (line > 0) Position.Line(line) else Nil) :::
      (if (offset > 0) Position.Offset(offset) else Nil) :::
      (if (end_offset > 0) Position.End_Offset(end_offset) else Nil) :::
      (if (file != "") Position.File(file) else Nil) :::
      (if (id != "") Position.Id_String(id) else Nil)

    def position(): Position.T = position(0)
    def position(token: Token): Position.T = position(advance(token).offset)

    override def toString: String = Position.here_undelimited(position())
  }

  abstract class Reader extends scala.util.parsing.input.Reader[Token]

  private class Token_Reader(tokens: List[Token], val pos: Pos) extends Reader
  {
    def first = tokens.head
    def rest = new Token_Reader(tokens.tail, pos.advance(first))
    def atEnd = tokens.isEmpty
  }

  def reader(tokens: List[Token], start: Token.Pos): Reader =
    new Token_Reader(tokens, start)
}


/* outer syntax token: classified source string */

sealed case class Token(val kind: Token.Kind.Value, val source: String)
{
  def is_command: Boolean = kind == Token.Kind.COMMAND
  def is_command_kind(keywords: Keyword.Keywords, pred: String => Boolean): Boolean =
    is_command && keywords.is_command_kind(source, pred)
  def is_keyword: Boolean = kind == Token.Kind.KEYWORD
  def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
  def is_ident: Boolean = kind == Token.Kind.IDENT
  def is_sym_ident: Boolean = kind == Token.Kind.SYM_IDENT
  def is_string: Boolean = kind == Token.Kind.STRING
  def is_nat: Boolean = kind == Token.Kind.NAT
  def is_float: Boolean = kind == Token.Kind.FLOAT
  def is_name: Boolean =
    kind == Token.Kind.IDENT ||
    kind == Token.Kind.SYM_IDENT ||
    kind == Token.Kind.STRING ||
    kind == Token.Kind.NAT
  def is_xname: Boolean = is_name || kind == Token.Kind.LONG_IDENT
  def is_text: Boolean = is_xname || kind == Token.Kind.VERBATIM || kind == Token.Kind.CARTOUCHE
  def is_space: Boolean = kind == Token.Kind.SPACE
  def is_comment: Boolean = kind == Token.Kind.COMMENT
  def is_improper: Boolean = is_space || is_comment
  def is_proper: Boolean = !is_space && !is_comment
  def is_error: Boolean = kind == Token.Kind.ERROR
  def is_unparsed: Boolean = kind == Token.Kind.UNPARSED

  // an ERROR token that starts like delimited material, i.e. ran off the input
  def is_unfinished: Boolean = is_error &&
    (source.startsWith("\"") ||
      source.startsWith("`") ||
      source.startsWith("{*") ||
      source.startsWith("(*") ||
      source.startsWith(Symbol.open) ||
      source.startsWith(Symbol.open_decoded))

  def is_begin: Boolean = is_keyword && source == "begin"
  def is_end: Boolean = is_command && source == "end"

  def is_begin_block: Boolean = is_command && source == "{"
  def is_end_block: Boolean = is_command && source == "}"

  // source text with delimiters stripped, for delimited kinds; raw source otherwise
  def content: String =
    if (kind == Token.Kind.STRING) Scan.Parsers.quoted_content("\"", source)
    else if (kind == Token.Kind.ALT_STRING) Scan.Parsers.quoted_content("`", source)
    else if (kind == Token.Kind.VERBATIM) Scan.Parsers.verbatim_content(source)
    else if (kind == Token.Kind.CARTOUCHE) Scan.Parsers.cartouche_content(source)
    else if (kind == Token.Kind.COMMENT) Scan.Parsers.comment_content(source)
    else source
}