src/Pure/ML/ml_lex.scala
author wenzelm
Sun, 07 Jan 2018 21:04:51 +0100
changeset 67365 fb539f83683a
parent 67364 f74672cf83c6
child 67366 e2575ccc0f5c
permissions -rw-r--r--
support for formal comments in ML in Isabelle/Scala;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     1
/*  Title:      Pure/ML/ml_lex.scala
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     3
59109
364992cd3c50 tuned comment;
wenzelm
parents: 59108
diff changeset
     4
Lexical syntax for Isabelle/ML and Standard ML.
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     5
*/
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     6
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     7
package isabelle
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
     8
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
     9
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
    10
import scala.collection.mutable
64824
330ec9bc4b75 tuned signature;
wenzelm
parents: 63610
diff changeset
    11
import scala.util.parsing.input.Reader
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    12
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    13
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    14
object ML_Lex
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    15
{
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    16
  /** keywords **/
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    17
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    18
  val keywords: Set[String] =
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    19
    Set("#", "(", ")", ",", "->", "...", ":", ":>", ";", "=", "=>",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    20
      "[", "]", "_", "{", "|", "}", "abstype", "and", "andalso", "as",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    21
      "case", "datatype", "do", "else", "end", "eqtype", "exception",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    22
      "fn", "fun", "functor", "handle", "if", "in", "include",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    23
      "infix", "infixr", "let", "local", "nonfix", "of", "op", "open",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    24
      "orelse", "raise", "rec", "sharing", "sig", "signature",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    25
      "struct", "structure", "then", "type", "val", "where", "while",
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    26
      "with", "withtype")
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    27
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    28
  val keywords2: Set[String] =
58933
6585e59aee3e tuned markup;
wenzelm
parents: 56278
diff changeset
    29
    Set("and", "case", "do", "else", "end", "if", "in", "let", "local",
6585e59aee3e tuned markup;
wenzelm
parents: 56278
diff changeset
    30
      "of", "sig", "struct", "then", "while", "with")
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    31
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    32
  val keywords3: Set[String] =
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    33
    Set("handle", "open", "raise")
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    34
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    35
  private val lexicon: Scan.Lexicon = Scan.Lexicon(keywords.toList: _*)
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    36
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    37
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    38
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    39
  /** tokens **/
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    40
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    41
  object Kind extends Enumeration
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    42
  {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    43
    val KEYWORD = Value("keyword")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    44
    val IDENT = Value("identifier")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    45
    val LONG_IDENT = Value("long identifier")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    46
    val TYPE_VAR = Value("type variable")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    47
    val WORD = Value("word")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    48
    val INT = Value("integer")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    49
    val REAL = Value("real")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    50
    val CHAR = Value("character")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    51
    val STRING = Value("quoted string")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    52
    val SPACE = Value("white space")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    53
    val COMMENT = Value("comment text")
67365
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
    54
    val COMMENT_CARTOUCHE = Value("comment cartouche")
61471
9d4c08af61b8 support control symbol antiquotations;
wenzelm
parents: 60215
diff changeset
    55
    val CONTROL = Value("control symbol antiquotation")
55512
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    56
    val ANTIQ = Value("antiquotation")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    57
    val ANTIQ_START = Value("antiquotation: start")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    58
    val ANTIQ_STOP = Value("antiquotation: stop")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    59
    val ANTIQ_OTHER = Value("antiquotation: other")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    60
    val ANTIQ_STRING = Value("antiquotation: quoted string")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    61
    val ANTIQ_ALT_STRING = Value("antiquotation: back-quoted string")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    62
    val ANTIQ_CARTOUCHE = Value("antiquotation: text cartouche")
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    63
    val ERROR = Value("bad input")
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    64
  }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    65
60215
5fb4990dfc73 misc tuning, based on warnings by IntelliJ IDEA;
wenzelm
parents: 59112
diff changeset
    66
  sealed case class Token(kind: Kind.Value, source: String)
55500
cdbbaa3074a8 isabelle-ml mode with separate token marker;
wenzelm
parents: 55499
diff changeset
    67
  {
55501
fdde1d62e1fb refined ML keyword styles;
wenzelm
parents: 55500
diff changeset
    68
    def is_keyword: Boolean = kind == Kind.KEYWORD
55505
2a1ca7f6607b more uniform ML keyword markup;
wenzelm
parents: 55502
diff changeset
    69
    def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source)
63610
4b40b8196dc7 Sidekick parser for isabelle-ml and sml mode;
wenzelm
parents: 63204
diff changeset
    70
    def is_space: Boolean = kind == Kind.SPACE
67365
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
    71
    def is_comment: Boolean = kind == Kind.COMMENT || kind == Kind.COMMENT_CARTOUCHE
55500
cdbbaa3074a8 isabelle-ml mode with separate token marker;
wenzelm
parents: 55499
diff changeset
    72
  }
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    73
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    74
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    75
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    76
  /** parsers **/
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    77
55510
1585a65aad64 tuned signature -- emphasize line-oriented aspect;
wenzelm
parents: 55505
diff changeset
    78
  case object ML_String extends Scan.Line_Context
55512
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    79
  case class Antiq(ctxt: Scan.Line_Context) extends Scan.Line_Context
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
    80
55512
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
    81
  private object Parsers extends Scan.Parsers with Antiquote.Parsers
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    82
  {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    83
    /* string material */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    84
55500
cdbbaa3074a8 isabelle-ml mode with separate token marker;
wenzelm
parents: 55499
diff changeset
    85
    private val blanks = many(character(Symbol.is_ascii_blank))
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    86
    private val blanks1 = many1(character(Symbol.is_ascii_blank))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    87
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
    88
    private val gap = "\\" ~ blanks1 ~ "\\" ^^ { case x ~ y ~ z => x + y + z }
55500
cdbbaa3074a8 isabelle-ml mode with separate token marker;
wenzelm
parents: 55499
diff changeset
    89
    private val gap_start = "\\" ~ blanks ~ """\z""".r ^^ { case x ~ y ~ _ => x + y }
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
    90
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    91
    private val escape =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    92
      one(character("\"\\abtnvfr".contains(_))) |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    93
      "^" ~ one(character(c => '@' <= c && c <= '_')) ^^ { case x ~ y => x + y } |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    94
      repeated(character(Symbol.is_ascii_digit), 3, 3)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    95
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    96
    private val str =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    97
      one(character(c => c != '"' && c != '\\' && ' ' <= c && c <= '~')) |
59108
50ccc027e8a7 clarified Isabelle/ML strings (refining 72238ea2201c);
wenzelm
parents: 58933
diff changeset
    98
      one(s => Symbol.is_symbolic(s) | Symbol.is_control(s)) |
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
    99
      "\\" ~ escape ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   100
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   101
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   102
    /* ML char -- without gaps */
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   103
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   104
    private val ml_char: Parser[Token] =
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   105
      "#\"" ~ str ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.CHAR, x + y + z) }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   106
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   107
    private val recover_ml_char: Parser[String] =
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   108
      "#\"" ~ opt(str) ^^ { case x ~ Some(y) => x + y case x ~ None => x }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   109
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   110
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   111
    /* ML string */
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   112
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   113
    private val ml_string_body: Parser[String] =
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   114
      rep(gap | str) ^^ (_.mkString)
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   115
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   116
    private val recover_ml_string: Parser[String] =
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   117
      "\"" ~ ml_string_body ^^ { case x ~ y => x + y }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   118
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   119
    private val ml_string: Parser[Token] =
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   120
      "\"" ~ ml_string_body ~ "\"" ^^ { case x ~ y ~ z => Token(Kind.STRING, x + y + z) }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   121
55510
1585a65aad64 tuned signature -- emphasize line-oriented aspect;
wenzelm
parents: 55505
diff changeset
   122
    private def ml_string_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   123
    {
55510
1585a65aad64 tuned signature -- emphasize line-oriented aspect;
wenzelm
parents: 55505
diff changeset
   124
      def result(x: String, c: Scan.Line_Context) = (Token(Kind.STRING, x), c)
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   125
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   126
      ctxt match {
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   127
        case Scan.Finished =>
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   128
          "\"" ~ ml_string_body ~ ("\"" | gap_start) ^^
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   129
            { case x ~ y ~ z => result(x + y + z, if (z == "\"") Scan.Finished else ML_String) }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   130
        case ML_String =>
55500
cdbbaa3074a8 isabelle-ml mode with separate token marker;
wenzelm
parents: 55499
diff changeset
   131
          blanks ~ opt_term("\\" ~ ml_string_body ~ ("\"" | gap_start)) ^^
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   132
            { case x ~ Some(y ~ z ~ w) =>
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   133
                result(x + y + z + w, if (w == "\"") Scan.Finished else ML_String)
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   134
              case x ~ None => result(x, ML_String) }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   135
        case _ => failure("")
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   136
      }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   137
    }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   138
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   139
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   140
    /* ML comment */
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   141
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   142
    private val ml_comment: Parser[Token] =
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   143
      comment ^^ (x => Token(Kind.COMMENT, x))
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   144
55510
1585a65aad64 tuned signature -- emphasize line-oriented aspect;
wenzelm
parents: 55505
diff changeset
   145
    private def ml_comment_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
1585a65aad64 tuned signature -- emphasize line-oriented aspect;
wenzelm
parents: 55505
diff changeset
   146
      comment_line(ctxt) ^^ { case (x, c) => (Token(Kind.COMMENT, x), c) }
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   147
67365
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   148
    private val ml_comment_cartouche: Parser[Token] =
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   149
      comment_cartouche ^^ (x => Token(Kind.COMMENT_CARTOUCHE, x))
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   150
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   151
    private def ml_comment_cartouche_line(ctxt: Scan.Line_Context)
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   152
        : Parser[(Token, Scan.Line_Context)] =
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   153
      comment_cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.COMMENT_CARTOUCHE, x), c) }
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   154
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   155
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   156
    private def other_token: Parser[Token] =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   157
    {
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   158
      /* identifiers */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   159
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   160
      val letdigs = many(character(Symbol.is_ascii_letdig))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   161
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   162
      val alphanumeric =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   163
        one(character(Symbol.is_ascii_letter)) ~ letdigs ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   164
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   165
      val symbolic = many1(character("!#$%&*+-/:<=>?@\\^`|~".contains(_)))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   166
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   167
      val ident = (alphanumeric | symbolic) ^^ (x => Token(Kind.IDENT, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   168
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   169
      val long_ident =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   170
        rep1(alphanumeric ~ "." ^^ { case x ~ y => x + y }) ~
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   171
          (alphanumeric | (symbolic | "=")) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   172
          { case x ~ y => Token(Kind.LONG_IDENT, x.mkString + y) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   173
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   174
      val type_var = "'" ~ letdigs ^^ { case x ~ y => Token(Kind.TYPE_VAR, x + y) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   175
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   176
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   177
      /* numerals */
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   178
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   179
      val dec = many1(character(Symbol.is_ascii_digit))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   180
      val hex = many1(character(Symbol.is_ascii_hex))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   181
      val sign = opt("~") ^^ { case Some(x) => x case None => "" }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   182
      val decint = sign ~ dec ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   183
      val exp = ("E" | "e") ~ decint ^^ { case x ~ y => x + y }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   184
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   185
      val word =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   186
        ("0wx" ~ hex ^^ { case x ~ y => x + y } | "0w" ~ dec ^^ { case x ~ y => x + y }) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   187
          (x => Token(Kind.WORD, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   188
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   189
      val int =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   190
        sign ~ ("0x" ~ hex ^^ { case x ~ y => x + y } | dec) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   191
          { case x ~ y => Token(Kind.INT, x + y) }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   192
63204
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 61596
diff changeset
   193
      val rat =
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 61596
diff changeset
   194
        decint ~ opt("/" ~ dec) ^^ { case x ~ None => x case x ~ Some(y ~ z) => x + y + z }
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 61596
diff changeset
   195
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   196
      val real =
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   197
        (decint ~ "." ~ dec ~ (opt(exp) ^^ { case Some(x) => x case None => "" }) ^^
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   198
          { case x ~ y ~ z ~ w => x + y + z + w } |
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   199
         decint ~ exp ^^ { case x ~ y => x + y }) ^^ (x => Token(Kind.REAL, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   200
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   201
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   202
      /* main */
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   203
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   204
      val space = blanks1 ^^ (x => Token(Kind.SPACE, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   205
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   206
      val keyword = literal(lexicon) ^^ (x => Token(Kind.KEYWORD, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   207
61471
9d4c08af61b8 support control symbol antiquotations;
wenzelm
parents: 60215
diff changeset
   208
      val ml_control = control ^^ (x => Token(Kind.CONTROL, x))
63204
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 61596
diff changeset
   209
      val ml_antiq =
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 61596
diff changeset
   210
        "@" ~ rat ^^ { case x ~ y => Token(Kind.ANTIQ, x + y) } |
921a5be54132 support rat numerals via special antiquotation syntax;
wenzelm
parents: 61596
diff changeset
   211
        antiq ^^ (x => Token(Kind.ANTIQ, x))
55512
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   212
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   213
      val bad = one(_ => true) ^^ (x => Token(Kind.ERROR, x))
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   214
67364
wenzelm
parents: 67095
diff changeset
   215
      val recover =
wenzelm
parents: 67095
diff changeset
   216
        (recover_ml_char | (recover_ml_string | (recover_cartouche | recover_comment))) ^^
wenzelm
parents: 67095
diff changeset
   217
          (x => Token(Kind.ERROR, x))
wenzelm
parents: 67095
diff changeset
   218
wenzelm
parents: 67095
diff changeset
   219
      space | (ml_control | (recover | (ml_antiq |
61471
9d4c08af61b8 support control symbol antiquotations;
wenzelm
parents: 60215
diff changeset
   220
        (((word | (real | (int | (long_ident | (ident | type_var))))) ||| keyword) | bad))))
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   221
    }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   222
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   223
55512
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   224
    /* antiquotations (line-oriented) */
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   225
67095
91ffe1f8bf5c proper treatment of multi-line cartouche as rudiment of antiquotation, e.g. relevant for syntax-highlighting in Isabelle/jEdit;
wenzelm
parents: 64824
diff changeset
   226
    def ml_cartouche_line(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
91ffe1f8bf5c proper treatment of multi-line cartouche as rudiment of antiquotation, e.g. relevant for syntax-highlighting in Isabelle/jEdit;
wenzelm
parents: 64824
diff changeset
   227
      cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), c) }
91ffe1f8bf5c proper treatment of multi-line cartouche as rudiment of antiquotation, e.g. relevant for syntax-highlighting in Isabelle/jEdit;
wenzelm
parents: 64824
diff changeset
   228
55512
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   229
    def ml_antiq_start(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   230
      ctxt match {
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   231
        case Scan.Finished => "@{" ^^ (x => (Token(Kind.ANTIQ_START, x), Antiq(Scan.Finished)))
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   232
        case _ => failure("")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   233
      }
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   234
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   235
    def ml_antiq_stop(ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   236
      ctxt match {
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   237
        case Antiq(Scan.Finished) => "}" ^^ (x => (Token(Kind.ANTIQ_STOP, x), Scan.Finished))
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   238
        case _ => failure("")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   239
      }
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   240
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   241
    def ml_antiq_body(context: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   242
      context match {
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   243
        case Antiq(ctxt) =>
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   244
          (if (ctxt == Scan.Finished) antiq_other ^^ (x => (Token(Kind.ANTIQ_OTHER, x), context))
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   245
           else failure("")) |
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   246
          quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_STRING, x), Antiq(c)) } |
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   247
          quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_ALT_STRING, x), Antiq(c)) } |
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   248
          cartouche_line(ctxt) ^^ { case (x, c) => (Token(Kind.ANTIQ_CARTOUCHE, x), Antiq(c)) }
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   249
        case _ => failure("")
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   250
      }
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   251
75c68e05f9ea support ML antiquotations in Scala;
wenzelm
parents: 55510
diff changeset
   252
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   253
    /* token */
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   254
67365
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   255
    def token: Parser[Token] =
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   256
      ml_char | (ml_string | (ml_comment | (ml_comment_cartouche | other_token)))
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   257
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55512
diff changeset
   258
    def token_line(SML: Boolean, ctxt: Scan.Line_Context): Parser[(Token, Scan.Line_Context)] =
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   259
    {
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   260
      val other = (ml_char | other_token) ^^ (x => (x, Scan.Finished))
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   261
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55512
diff changeset
   262
      if (SML) ml_string_line(ctxt) | (ml_comment_line(ctxt) | other)
67365
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   263
      else {
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55512
diff changeset
   264
        ml_string_line(ctxt) |
61596
8323b8e21fe9 ML cartouches via control antiquotation;
wenzelm
parents: 61471
diff changeset
   265
          (ml_comment_line(ctxt) |
67365
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   266
            (ml_comment_cartouche_line(ctxt) |
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   267
              (ml_cartouche_line(ctxt) |
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   268
                (ml_antiq_start(ctxt) | (ml_antiq_stop(ctxt) | (ml_antiq_body(ctxt) | other))))))
fb539f83683a support for formal comments in ML in Isabelle/Scala;
wenzelm
parents: 67364
diff changeset
   269
      }
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   270
    }
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   271
  }
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   272
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   273
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   274
  /* tokenize */
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   275
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   276
  def tokenize(input: CharSequence): List[Token] =
64824
330ec9bc4b75 tuned signature;
wenzelm
parents: 63610
diff changeset
   277
    Parsers.parseAll(Parsers.rep(Parsers.token), Scan.char_reader(input)) match {
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   278
      case Parsers.Success(tokens, _) => tokens
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   279
      case _ => error("Unexpected failure of tokenizing input:\n" + input.toString)
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   280
    }
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   281
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55512
diff changeset
   282
  def tokenize_line(SML: Boolean, input: CharSequence, context: Scan.Line_Context)
55510
1585a65aad64 tuned signature -- emphasize line-oriented aspect;
wenzelm
parents: 55505
diff changeset
   283
    : (List[Token], Scan.Line_Context) =
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   284
  {
64824
330ec9bc4b75 tuned signature;
wenzelm
parents: 63610
diff changeset
   285
    var in: Reader[Char] = Scan.char_reader(input)
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   286
    val toks = new mutable.ListBuffer[Token]
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   287
    var ctxt = context
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   288
    while (!in.atEnd) {
56278
2576d3a40ed6 separate tokenization and language context for SML: no symbols, no antiquotes;
wenzelm
parents: 55512
diff changeset
   289
      Parsers.parse(Parsers.token_line(SML, ctxt), in) match {
60215
5fb4990dfc73 misc tuning, based on warnings by IntelliJ IDEA;
wenzelm
parents: 59112
diff changeset
   290
        case Parsers.Success((x, c), rest) => toks += x; ctxt = c; in = rest
55499
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   291
        case Parsers.NoSuccess(_, rest) =>
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   292
          error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   293
      }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   294
    }
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   295
    (toks.toList, ctxt)
2581fbee5b95 partial scans via ML_Lex.tokenize_context;
wenzelm
parents: 55497
diff changeset
   296
  }
55497
c0f8aebfb43d lexical syntax for SML (in Scala);
wenzelm
parents:
diff changeset
   297
}