src/Pure/Isar/token.scala
author wenzelm
Tue Dec 09 21:14:11 2014 +0100 (2014-12-09)
changeset 59122 c1dbcde94cd2
parent 59083 88b0b1f28adc
child 59671 9715eb8e9408
permissions -rw-r--r--
tuned signature;
wenzelm@36956
     1
/*  Title:      Pure/Isar/token.scala
wenzelm@34139
     2
    Author:     Makarius
wenzelm@34139
     3
wenzelm@36956
     4
Outer token syntax for Isabelle/Isar.
wenzelm@34139
     5
*/
wenzelm@34139
     6
wenzelm@34139
     7
package isabelle
wenzelm@34139
     8
wenzelm@34139
     9
wenzelm@59083
    10
import scala.collection.mutable
wenzelm@59083
    11
import scala.util.parsing.input
wenzelm@59083
    12
wenzelm@59083
    13
wenzelm@36956
    14
object Token
wenzelm@34139
    15
{
wenzelm@34157
    16
  /* tokens */
wenzelm@34139
    17
wenzelm@36956
    18
  // Classification of tokens. Each value is created with a human-readable
  // description as its enumeration name.
  // NOTE: values are created in declaration order, which determines their ids.
  object Kind extends Enumeration
  {
    /* immediate source */

    val COMMAND = Value("command")
    val KEYWORD = Value("keyword")
    val IDENT = Value("identifier")
    val LONG_IDENT = Value("long identifier")
    val SYM_IDENT = Value("symbolic identifier")
    val VAR = Value("schematic variable")
    val TYPE_IDENT = Value("type variable")
    val TYPE_VAR = Value("schematic type variable")
    val NAT = Value("natural number")
    val FLOAT = Value("floating-point number")
    val SPACE = Value("white space")

    /* delimited content */

    val STRING = Value("string")
    val ALT_STRING = Value("back-quoted string")
    val VERBATIM = Value("verbatim text")
    val CARTOUCHE = Value("text cartouche")
    val COMMENT = Value("comment text")

    /* special content */

    val ERROR = Value("bad input")
    val UNPARSED = Value("unparsed input")
  }
wenzelm@34139
    42
wenzelm@34157
    43
wenzelm@55494
    44
  /* parsers */
wenzelm@55494
    45
wenzelm@55494
    46
  // Ready-made instance of the Parsers trait; explode and explode_line use it.
  object Parsers extends Parsers
wenzelm@55494
    47
wenzelm@55494
    48
  trait Parsers extends Scan.Parsers
wenzelm@55494
    49
  {
wenzelm@55494
    50
    // Tokens with delimited content: strings, back-quoted strings, verbatim
    // text, cartouches and comments. Alternatives are tried in that order.
    private def delimited_token: Parser[Token] =
    {
      // wrap scanned source text into a token of the given kind
      def make(kind: Token.Kind.Value): String => Token = src => Token(kind, src)

      val string_tok = quoted("\"") ^^ make(Token.Kind.STRING)
      val alt_string_tok = quoted("`") ^^ make(Token.Kind.ALT_STRING)
      val verbatim_tok = verbatim ^^ make(Token.Kind.VERBATIM)
      val cartouche_tok = cartouche ^^ make(Token.Kind.CARTOUCHE)
      val comment_tok = comment ^^ make(Token.Kind.COMMENT)

      string_tok | (alt_string_tok | (verbatim_tok | (cartouche_tok | comment_tok)))
    }
wenzelm@55494
    60
wenzelm@58900
    61
    // All tokens that are not delimited content: white space, recovery of
    // unfinished delimited content (as ERROR), identifiers, variables,
    // numbers, symbolic identifiers, keywords/commands, and finally any
    // single symbol as ERROR.
    private def other_token(keywords: Keyword.Keywords): Parser[Token] =
    {
      /* identifiers */

      val letdigs = many1(Symbol.is_letdig)
      // the "sub" control symbol, either decoded or in its \<^sub> notation
      val subscript = one(s => s == Symbol.sub_decoded || s == "\\<^sub>")
      val id_rest =
        rep(letdigs | (subscript ~ letdigs ^^ { case s ~ ls => s + ls })) ^^ (_.mkString)
      val id = one(Symbol.is_letter) ~ id_rest ^^ { case a ~ b => a + b }

      /* numbers and indexed names */

      val nat = many1(Symbol.is_digit)
      val natdot = nat ~ "." ~ nat ^^ { case a ~ dot ~ b => a + dot + b }
      val id_nat =
        id ~ opt("." ~ nat) ^^ {
          case name ~ None => name
          case name ~ Some(dot ~ index) => name + dot + index
        }

      /* token forms */

      // a single id is IDENT; several ids separated by "." form a LONG_IDENT
      val ident =
        id ~ rep("." ~> id) ^^ {
          case first ~ Nil => Token(Token.Kind.IDENT, first)
          case first ~ more => Token(Token.Kind.LONG_IDENT, (first :: more).mkString("."))
        }

      val schematic_var =
        "?" ~ id_nat ^^ { case mark ~ name => Token(Token.Kind.VAR, mark + name) }
      val type_ident =
        "'" ~ id ^^ { case mark ~ name => Token(Token.Kind.TYPE_IDENT, mark + name) }
      val type_var =
        "?'" ~ id_nat ^^ { case mark ~ name => Token(Token.Kind.TYPE_VAR, mark + name) }

      val nat_token = nat ^^ (src => Token(Token.Kind.NAT, src))
      val float_token =
        ("-" ~ natdot ^^ { case sign ~ num => sign + num } | natdot) ^^
          (src => Token(Token.Kind.FLOAT, src))

      val sym_ident =
        (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^
          (src => Token(Token.Kind.SYM_IDENT, src))

      // minor keywords become KEYWORD, major keywords become COMMAND
      val minor_keyword = literal(keywords.minor) ^^ (src => Token(Token.Kind.KEYWORD, src))
      val major_keyword = literal(keywords.major) ^^ (src => Token(Token.Kind.COMMAND, src))
      val keyword = minor_keyword ||| major_keyword

      val space = many1(Symbol.is_blank) ^^ (src => Token(Token.Kind.SPACE, src))

      /* error recovery */

      val recover_delimited =
        (recover_quoted("\"") |
          (recover_quoted("`") |
            (recover_verbatim |
              (recover_cartouche | recover_comment)))) ^^ (src => Token(Token.Kind.ERROR, src))

      val bad = one(_ => true) ^^ (src => Token(Token.Kind.ERROR, src))

      /* result */

      val names =
        ident | (schematic_var | (type_ident | (type_var |
          (float_token | (nat_token | sym_ident)))))
      // names and keywords compete via ||| instead of plain ordered choice
      val proper = names ||| keyword

      space | (recover_delimited | (proper | bad))
    }
wenzelm@55494
   107
wenzelm@58900
   108
    // Single token: delimited content is tried first, then every other form
    // (whose keyword tables come from the given keywords).
    def token(keywords: Keyword.Keywords): Parser[Token] =
      delimited_token | other_token(keywords)
wenzelm@55494
   110
wenzelm@58900
   111
    // Single token within one line, relative to the given line context.
    // Results in the token together with the line context after it; tokens
    // without delimited content always result in Scan.Finished.
    def token_line(keywords: Keyword.Keywords, ctxt: Scan.Line_Context)
      : Parser[(Token, Scan.Line_Context)] =
    {
      val string_tok =
        quoted_line("\"", ctxt) ^^
          { case (src, next) => (Token(Token.Kind.STRING, src), next) }
      val alt_string_tok =
        quoted_line("`", ctxt) ^^
          { case (src, next) => (Token(Token.Kind.ALT_STRING, src), next) }
      val verbatim_tok =
        verbatim_line(ctxt) ^^
          { case (src, next) => (Token(Token.Kind.VERBATIM, src), next) }
      val cartouche_tok =
        cartouche_line(ctxt) ^^
          { case (src, next) => (Token(Token.Kind.CARTOUCHE, src), next) }
      val comment_tok =
        comment_line(ctxt) ^^
          { case (src, next) => (Token(Token.Kind.COMMENT, src), next) }
      val other_tok = other_token(keywords) ^^ (tok => (tok, Scan.Finished))

      string_tok |
        (alt_string_tok | (verbatim_tok | (cartouche_tok | (comment_tok | other_tok))))
    }
wenzelm@55494
   125
  }
wenzelm@55494
   126
wenzelm@55494
   127
wenzelm@59083
   128
  /* explode */
wenzelm@59083
   129
wenzelm@59083
   130
  // Tokenize the complete input text. Any parse result other than success is
  // reported via error, quoting the whole input.
  def explode(keywords: Keyword.Keywords, inp: CharSequence): List[Token] =
  {
    val reader: input.Reader[Char] = new input.CharSequenceReader(inp)
    val all_tokens = Parsers.rep(Parsers.token(keywords))

    Parsers.parseAll(all_tokens, reader) match {
      case Parsers.Success(result, _) => result
      case _ => error("Unexpected failure of tokenizing input:\n" + inp.toString)
    }
  }
wenzelm@59083
   138
wenzelm@59083
   139
  // Tokenize one line of input, starting in the given line context. Results
  // in the tokens of the line and the line context after the last token
  // (the initial context, if the input is empty).
  def explode_line(keywords: Keyword.Keywords, inp: CharSequence, context: Scan.Line_Context)
    : (List[Token], Scan.Line_Context) =
  {
    // rev_toks accumulates the tokens scanned so far, in reverse order
    @scala.annotation.tailrec
    def scan(reader: input.Reader[Char], ctxt: Scan.Line_Context, rev_toks: List[Token])
      : (List[Token], Scan.Line_Context) =
    {
      if (reader.atEnd) (rev_toks.reverse, ctxt)
      else
        Parsers.parse(Parsers.token_line(keywords, ctxt), reader) match {
          case Parsers.Success((tok, next_ctxt), remaining) =>
            scan(remaining, next_ctxt, tok :: rev_toks)
          case Parsers.NoSuccess(_, remaining) =>
            error("Unexpected failure of tokenizing input:\n" + remaining.source.toString)
        }
    }

    scan(new input.CharSequenceReader(inp), context, Nil)
  }
wenzelm@59083
   154
wenzelm@59083
   155
wenzelm@34157
   156
  /* token reader */
wenzelm@34139
   157
wenzelm@56464
   158
  // Companion of class Pos: provides the position with line 0 and empty file.
  object Pos
  {
    val none: Pos = new Pos(0, "")
  }

  // Token position given by line number and file name only: the column is
  // always 0 and the line contents are always empty.
  final class Pos private[Token](val line: Int, val file: String)
    extends scala.util.parsing.input.Position
  {
    def column: Int = 0
    def lineContents: String = ""

    // Position after the given token: the line number is increased by the
    // number of newline characters in the token content; this position is
    // reused if there are none.
    def advance(token: Token): Pos =
    {
      val newlines = token.content.count(_ == '\n')
      if (newlines > 0) new Pos(line + newlines, file) else this
    }

    def position: Position.T = Position.Line_File(line, file)

    override def toString: String = Position.here_undelimited(position)
  }
wenzelm@34139
   179
wenzelm@34157
   180
  // Common type of readers whose elements are tokens.
  abstract class Reader extends scala.util.parsing.input.Reader[Token]
wenzelm@34157
   181
wenzelm@56464
   182
  // Reader over an in-memory list of tokens; pos is the position of the
  // first remaining token.
  private class Token_Reader(tokens: List[Token], val pos: Pos) extends Reader
  {
    // First remaining token; only meaningful if not atEnd (fails on empty input).
    def first: Token = tokens.head

    // Reader for the remaining tokens, with the position advanced over the
    // first one. At the end of input this reader itself is returned, as the
    // contract of scala.util.parsing.input.Reader.rest demands, instead of
    // failing on the tail of an empty list.
    def rest: Token_Reader =
      if (tokens.isEmpty) this
      else new Token_Reader(tokens.tail, pos.advance(first))

    def atEnd: Boolean = tokens.isEmpty
  }
wenzelm@34139
   188
wenzelm@48335
   189
  // Reader for the given tokens, starting at line 1 of the given file name
  // (empty by default).
  def reader(tokens: List[Token], file: String = ""): Reader =
  {
    val start = new Pos(1, file)
    new Token_Reader(tokens, start)
  }
wenzelm@34139
   191
}
wenzelm@34139
   192
wenzelm@36956
   193
wenzelm@36956
   194
// Token given by its kind and its original source text.
sealed case class Token(val kind: Token.Kind.Value, val source: String)
{
  /* tests on the kind */

  private def has_kind(k: Token.Kind.Value): Boolean = kind == k

  def is_command: Boolean = has_kind(Token.Kind.COMMAND)

  // command token whose kind, as looked up in the given keywords, satisfies pred
  def is_command_kind(keywords: Keyword.Keywords, pred: String => Boolean): Boolean =
    is_command && keywords.command_kind(source).exists(pred)

  def is_keyword: Boolean = has_kind(Token.Kind.KEYWORD)

  // keyword whose source is not an ASCII identifier
  def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)

  def is_ident: Boolean = has_kind(Token.Kind.IDENT)
  def is_sym_ident: Boolean = has_kind(Token.Kind.SYM_IDENT)
  def is_string: Boolean = has_kind(Token.Kind.STRING)
  def is_nat: Boolean = has_kind(Token.Kind.NAT)
  def is_float: Boolean = has_kind(Token.Kind.FLOAT)

  def is_name: Boolean =
    has_kind(Token.Kind.IDENT) ||
    has_kind(Token.Kind.SYM_IDENT) ||
    has_kind(Token.Kind.STRING) ||
    has_kind(Token.Kind.NAT)

  def is_xname: Boolean = is_name || has_kind(Token.Kind.LONG_IDENT)

  def is_text: Boolean =
    is_xname || has_kind(Token.Kind.VERBATIM) || has_kind(Token.Kind.CARTOUCHE)

  def is_space: Boolean = has_kind(Token.Kind.SPACE)
  def is_comment: Boolean = has_kind(Token.Kind.COMMENT)

  // improper tokens are white space and comments; everything else is proper
  def is_improper: Boolean = is_space || is_comment
  def is_proper: Boolean = !is_improper

  def is_error: Boolean = has_kind(Token.Kind.ERROR)
  def is_unparsed: Boolean = has_kind(Token.Kind.UNPARSED)

  // error token whose source starts with one of the listed opening delimiters
  def is_unfinished: Boolean =
  {
    def opens_with(prefix: String): Boolean = source.startsWith(prefix)

    is_error &&
      (opens_with("\"") ||
       opens_with("`") ||
       opens_with("{*") ||
       opens_with("(*") ||
       opens_with(Symbol.open) ||
       opens_with(Symbol.open_decoded))
  }


  /* specific keywords and commands */

  def is_begin: Boolean = is_keyword && source == "begin"
  def is_end: Boolean = is_command && source == "end"

  def is_begin_block: Boolean = is_command && source == "{"
  def is_end_block: Boolean = is_command && source == "}"


  /* content */

  // Content of the token: for the delimited kinds the source is passed
  // through the corresponding Scan.Parsers content function; for all other
  // kinds it is the source itself.
  def content: String =
    kind match {
      case Token.Kind.STRING => Scan.Parsers.quoted_content("\"", source)
      case Token.Kind.ALT_STRING => Scan.Parsers.quoted_content("`", source)
      case Token.Kind.VERBATIM => Scan.Parsers.verbatim_content(source)
      case Token.Kind.CARTOUCHE => Scan.Parsers.cartouche_content(source)
      case Token.Kind.COMMENT => Scan.Parsers.comment_content(source)
      case _ => source
    }
}
wenzelm@36956
   243