| author | wenzelm | 
| Thu, 26 Nov 2020 16:14:16 +0100 | |
| changeset 72724 | 75cce7926ec1 | 
| parent 72669 | 5e7916535860 | 
| child 72744 | 0017eb17ac1c | 
| permissions | -rw-r--r-- | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 1 | /* Title: Pure/Isar/token.scala | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 2 | Author: Makarius | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 3 | |
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 4 | Outer token syntax for Isabelle/Isar. | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 5 | */ | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 6 | |
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 7 | package isabelle | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 8 | |
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 9 | |
| 59083 | 10 | import scala.collection.mutable | 
| 11 | import scala.util.parsing.input | |
| 12 | ||
| 13 | ||
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 14 | object Token | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 15 | {
 | 
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 16 | /* tokens */ | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 17 | |
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 18 | object Kind extends Enumeration | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 19 |   {
 | 
| 59081 | 20 | /*immediate source*/ | 
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 21 |     val COMMAND = Value("command")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 22 |     val KEYWORD = Value("keyword")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 23 |     val IDENT = Value("identifier")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 24 |     val LONG_IDENT = Value("long identifier")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 25 |     val SYM_IDENT = Value("symbolic identifier")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 26 |     val VAR = Value("schematic variable")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 27 |     val TYPE_IDENT = Value("type variable")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 28 |     val TYPE_VAR = Value("schematic type variable")
 | 
| 40290 
47f572aff50a
support for floating-point tokens in outer syntax (coinciding with inner syntax version);
 wenzelm parents: 
38367diff
changeset | 29 |     val NAT = Value("natural number")
 | 
| 
47f572aff50a
support for floating-point tokens in outer syntax (coinciding with inner syntax version);
 wenzelm parents: 
38367diff
changeset | 30 |     val FLOAT = Value("floating-point number")
 | 
| 59081 | 31 |     val SPACE = Value("white space")
 | 
| 32 | /*delimited content*/ | |
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 33 |     val STRING = Value("string")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 34 |     val ALT_STRING = Value("back-quoted string")
 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 35 |     val VERBATIM = Value("verbatim text")
 | 
| 55512 | 36 |     val CARTOUCHE = Value("text cartouche")
 | 
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 37 |     val INFORMAL_COMMENT = Value("informal comment")
 | 
| 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 38 |     val FORMAL_COMMENT = Value("formal comment")
 | 
| 59081 | 39 | /*special content*/ | 
| 48754 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 40 |     val ERROR = Value("bad input")
 | 
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 41 |     val UNPARSED = Value("unparsed input")
 | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 42 | } | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 43 | |
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 44 | |
| 55494 | 45 | /* parsers */ | 
| 46 | ||
| 47 | object Parsers extends Parsers | |
| 48 | ||
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 49 | trait Parsers extends Scan.Parsers with Comment.Parsers | 
| 55494 | 50 |   {
 | 
| 51 | private def delimited_token: Parser[Token] = | |
| 52 |     {
 | |
| 53 |       val string = quoted("\"") ^^ (x => Token(Token.Kind.STRING, x))
 | |
| 54 |       val alt_string = quoted("`") ^^ (x => Token(Token.Kind.ALT_STRING, x))
 | |
| 55 | val verb = verbatim ^^ (x => Token(Token.Kind.VERBATIM, x)) | |
| 56 | val cart = cartouche ^^ (x => Token(Token.Kind.CARTOUCHE, x)) | |
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 57 | val cmt = comment ^^ (x => Token(Token.Kind.INFORMAL_COMMENT, x)) | 
| 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 58 | val formal_cmt = comment_cartouche ^^ (x => Token(Token.Kind.FORMAL_COMMENT, x)) | 
| 55494 | 59 | |
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 60 | string | (alt_string | (verb | (cart | (cmt | formal_cmt)))) | 
| 55494 | 61 | } | 
| 62 | ||
| 58900 | 63 | private def other_token(keywords: Keyword.Keywords): Parser[Token] = | 
| 55494 | 64 |     {
 | 
| 65 | val letdigs1 = many1(Symbol.is_letdig) | |
| 62103 | 66 | val sub = one(s => s == Symbol.sub_decoded || s == Symbol.sub) | 
| 55494 | 67 | val id = | 
| 68 | one(Symbol.is_letter) ~ | |
| 69 |           (rep(letdigs1 | (sub ~ letdigs1 ^^ { case x ~ y => x + y })) ^^ (_.mkString)) ^^
 | |
| 70 |         { case x ~ y => x + y }
 | |
| 71 | ||
| 72 | val nat = many1(Symbol.is_digit) | |
| 73 |       val natdot = nat ~ "." ~ nat ^^ { case x ~ y ~ z => x + y + z }
 | |
| 74 |       val id_nat = id ~ opt("." ~ nat) ^^ { case x ~ Some(y ~ z) => x + y + z case x ~ None => x }
 | |
| 75 | ||
| 76 |       val ident = id ~ rep("." ~> id) ^^
 | |
| 77 |         { case x ~ Nil => Token(Token.Kind.IDENT, x)
 | |
| 78 |           case x ~ ys => Token(Token.Kind.LONG_IDENT, (x :: ys).mkString(".")) }
 | |
| 79 | ||
| 80 |       val var_ = "?" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.VAR, x + y) }
 | |
| 81 |       val type_ident = "'" ~ id ^^ { case x ~ y => Token(Token.Kind.TYPE_IDENT, x + y) }
 | |
| 82 |       val type_var = "?'" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.TYPE_VAR, x + y) }
 | |
| 83 | val nat_ = nat ^^ (x => Token(Token.Kind.NAT, x)) | |
| 84 | val float = | |
| 85 |         ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))
 | |
| 86 | ||
| 87 | val sym_ident = | |
| 88 | (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^ | |
| 89 | (x => Token(Token.Kind.SYM_IDENT, x)) | |
| 90 | ||
| 58899 | 91 | val keyword = | 
| 58900 | 92 | literal(keywords.minor) ^^ (x => Token(Token.Kind.KEYWORD, x)) ||| | 
| 93 | literal(keywords.major) ^^ (x => Token(Token.Kind.COMMAND, x)) | |
| 55494 | 94 | |
| 95 | val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x)) | |
| 96 | ||
| 97 | val recover_delimited = | |
| 98 |         (recover_quoted("\"") |
 | |
| 99 |           (recover_quoted("`") |
 | |
| 100 | (recover_verbatim | | |
| 101 | (recover_cartouche | recover_comment)))) ^^ (x => Token(Token.Kind.ERROR, x)) | |
| 102 | ||
| 103 | val bad = one(_ => true) ^^ (x => Token(Token.Kind.ERROR, x)) | |
| 104 | ||
| 105 | space | (recover_delimited | | |
| 106 | (((ident | (var_ | (type_ident | (type_var | (float | (nat_ | sym_ident)))))) ||| | |
| 58899 | 107 | keyword) | bad)) | 
| 55494 | 108 | } | 
| 109 | ||
| 58900 | 110 | def token(keywords: Keyword.Keywords): Parser[Token] = | 
| 67446 | 111 | delimited_token | other_token(keywords) | 
| 55494 | 112 | |
| 58900 | 113 | def token_line(keywords: Keyword.Keywords, ctxt: Scan.Line_Context) | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 114 | : Parser[(Token, Scan.Line_Context)] = | 
| 55494 | 115 |     {
 | 
| 116 | val string = | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 117 |         quoted_line("\"", ctxt) ^^ { case (x, c) => (Token(Token.Kind.STRING, x), c) }
 | 
| 55494 | 118 | val alt_string = | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 119 |         quoted_line("`", ctxt) ^^ { case (x, c) => (Token(Token.Kind.ALT_STRING, x), c) }
 | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 120 |       val verb = verbatim_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.VERBATIM, x), c) }
 | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55505diff
changeset | 121 |       val cart = cartouche_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.CARTOUCHE, x), c) }
 | 
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 122 |       val cmt = comment_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.INFORMAL_COMMENT, x), c) }
 | 
| 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 123 | val formal_cmt = | 
| 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 124 |         comment_cartouche_line(ctxt) ^^ { case (x, c) => (Token(Token.Kind.FORMAL_COMMENT, x), c) }
 | 
| 58900 | 125 |       val other = other_token(keywords) ^^ { case x => (x, Scan.Finished) }
 | 
| 55494 | 126 | |
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 127 | string | (alt_string | (verb | (cart | (cmt | (formal_cmt | other))))) | 
| 55494 | 128 | } | 
| 129 | } | |
| 130 | ||
| 131 | ||
| 59083 | 132 | /* explode */ | 
| 133 | ||
| 134 | def explode(keywords: Keyword.Keywords, inp: CharSequence): List[Token] = | |
| 64824 | 135 |     Parsers.parseAll(Parsers.rep(Parsers.token(keywords)), Scan.char_reader(inp)) match {
 | 
| 59083 | 136 | case Parsers.Success(tokens, _) => tokens | 
| 137 |       case _ => error("Unexpected failure of tokenizing input:\n" + inp.toString)
 | |
| 138 | } | |
| 139 | ||
| 140 | def explode_line(keywords: Keyword.Keywords, inp: CharSequence, context: Scan.Line_Context) | |
| 141 | : (List[Token], Scan.Line_Context) = | |
| 142 |   {
 | |
| 64824 | 143 | var in: input.Reader[Char] = Scan.char_reader(inp) | 
| 59083 | 144 | val toks = new mutable.ListBuffer[Token] | 
| 145 | var ctxt = context | |
| 146 |     while (!in.atEnd) {
 | |
| 147 |       Parsers.parse(Parsers.token_line(keywords, ctxt), in) match {
 | |
| 60215 | 148 | case Parsers.Success((x, c), rest) => toks += x; ctxt = c; in = rest | 
| 59083 | 149 | case Parsers.NoSuccess(_, rest) => | 
| 150 |           error("Unexpected failure of tokenizing input:\n" + rest.source.toString)
 | |
| 151 | } | |
| 152 | } | |
| 153 | (toks.toList, ctxt) | |
| 154 | } | |
| 155 | ||
| 64671 | 156 | val newline: Token = explode(Keyword.Keywords.empty, "\n").head | 
| 157 | ||
| 59083 | 158 | |
| 69603 | 159 | /* embedded */ | 
| 160 | ||
| 161 | def read_embedded(keywords: Keyword.Keywords, inp: CharSequence): Option[Token] = | |
| 162 |     explode(keywords, inp) match {
 | |
| 163 | case List(tok) if tok.is_embedded => Some(tok) | |
| 164 | case _ => None | |
| 165 | } | |
| 166 | ||
| 167 | ||
| 65523 | 168 | /* names */ | 
| 169 | ||
| 170 | def read_name(keywords: Keyword.Keywords, inp: CharSequence): Option[Token] = | |
| 171 |     explode(keywords, inp) match {
 | |
| 172 | case List(tok) if tok.is_name => Some(tok) | |
| 173 | case _ => None | |
| 174 | } | |
| 175 | ||
| 176 | def quote_name(keywords: Keyword.Keywords, name: String): String = | |
| 177 | if (read_name(keywords, name).isDefined) name | |
| 178 |     else quote(name.replace("\"", "\\\""))
 | |
| 179 | ||
| 180 | ||
| 67132 | 181 | /* plain antiquotation (0 or 1 args) */ | 
| 182 | ||
| 183 | def read_antiq_arg(keywords: Keyword.Keywords, inp: CharSequence): Option[(String, Option[String])] = | |
| 184 |     explode(keywords, inp).filter(_.is_proper) match {
 | |
| 185 | case List(t) if t.is_name => Some(t.content, None) | |
| 186 | case List(t1, t2) if t1.is_name && t2.is_embedded => Some(t1.content, Some(t2.content)) | |
| 187 | case _ => None | |
| 188 | } | |
| 189 | ||
| 190 | ||
| 59735 | 191 | /* implode */ | 
| 192 | ||
| 193 | def implode(toks: List[Token]): String = | |
| 194 |     toks match {
 | |
| 195 | case List(tok) => tok.source | |
| 60215 | 196 | case _ => toks.map(_.source).mkString | 
| 59735 | 197 | } | 
| 198 | ||
| 199 | ||
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 200 | /* token reader */ | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 201 | |
| 56464 | 202 | object Pos | 
| 203 |   {
 | |
| 59706 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 204 | val none: Pos = new Pos(0, 0, "", "") | 
| 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 205 | val start: Pos = new Pos(1, 1, "", "") | 
| 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 206 | def file(file: String): Pos = new Pos(1, 1, file, "") | 
| 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 207 | def id(id: String): Pos = new Pos(0, 1, "", id) | 
| 59715 
4f0d0e4ad68d
avoid duplicate header errors, more precise positions;
 wenzelm parents: 
59706diff
changeset | 208 | val command: Pos = id(Markup.COMMAND) | 
| 56464 | 209 | } | 
| 210 | ||
| 59671 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 211 | final class Pos private[Token]( | 
| 59696 | 212 | val line: Int, | 
| 213 | val offset: Symbol.Offset, | |
| 59706 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 214 | val file: String, | 
| 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 215 | val id: String) | 
| 64824 | 216 | extends input.Position | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 217 |   {
 | 
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 218 | def column = 0 | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 219 | def lineContents = "" | 
| 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 220 | |
| 67895 | 221 | def advance(token: Token): Pos = advance(token.source) | 
| 222 | def advance(source: String): Pos = | |
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 223 |     {
 | 
| 59671 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 224 | var line1 = line | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 225 | var offset1 = offset | 
| 67895 | 226 |       for (s <- Symbol.iterator(source)) {
 | 
| 59671 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 227 | if (line1 > 0 && Symbol.is_newline(s)) line1 += 1 | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 228 | if (offset1 > 0) offset1 += 1 | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 229 | } | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 230 | if (line1 == line && offset1 == offset) this | 
| 59706 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 231 | else new Pos(line1, offset1, file, id) | 
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 232 | } | 
| 56464 | 233 | |
| 59695 | 234 | private def position(end_offset: Symbol.Offset): Position.T = | 
| 59671 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 235 | (if (line > 0) Position.Line(line) else Nil) ::: | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 236 | (if (offset > 0) Position.Offset(offset) else Nil) ::: | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 237 | (if (end_offset > 0) Position.End_Offset(end_offset) else Nil) ::: | 
| 59706 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 238 | (if (file != "") Position.File(file) else Nil) ::: | 
| 
bf6ca55aae13
proper command id for inlined errors, which is important for Command.State.accumulate;
 wenzelm parents: 
59705diff
changeset | 239 | (if (id != "") Position.Id_String(id) else Nil) | 
| 59671 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 240 | |
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 241 | def position(): Position.T = position(0) | 
| 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 242 | def position(token: Token): Position.T = position(advance(token).offset) | 
| 67895 | 243 | def position(source: String): Position.T = position(advance(source).offset) | 
| 59671 
9715eb8e9408
more precise position information in Isabelle/Scala, with YXML markup as in Isabelle/ML;
 wenzelm parents: 
59122diff
changeset | 244 | |
| 64728 | 245 | override def toString: String = Position.here(position(), delimited = false) | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 246 | } | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 247 | |
| 64824 | 248 | abstract class Reader extends input.Reader[Token] | 
| 34157 
0a0a19153626
explicit representation of Token_Kind -- cannot really depend on runtime types due to erasure;
 wenzelm parents: 
34143diff
changeset | 249 | |
| 56464 | 250 | private class Token_Reader(tokens: List[Token], val pos: Pos) extends Reader | 
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 251 |   {
 | 
| 71601 | 252 | def first: Token = tokens.head | 
| 253 | def rest: Token_Reader = new Token_Reader(tokens.tail, pos.advance(first)) | |
| 254 | def atEnd: Boolean = tokens.isEmpty | |
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 255 | } | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 256 | |
| 59705 | 257 | def reader(tokens: List[Token], start: Token.Pos): Reader = | 
| 258 | new Token_Reader(tokens, start) | |
| 65335 | 259 | |
| 260 | ||
| 261 | /* XML data representation */ | |
| 262 | ||
| 263 | val encode: XML.Encode.T[Token] = (tok: Token) => | |
| 264 |   {
 | |
| 265 | import XML.Encode._ | |
| 266 | pair(int, string)(tok.kind.id, tok.source) | |
| 267 | } | |
| 268 | ||
| 269 | val decode: XML.Decode.T[Token] = (body: XML.Body) => | |
| 270 |   {
 | |
| 271 | import XML.Decode._ | |
| 272 | val (k, s) = pair(int, string)(body) | |
| 273 | Token(Kind(k), s) | |
| 274 | } | |
| 34139 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 275 | } | 
| 
d1ded303fe0e
Outer lexical syntax for Isabelle/Isar -- Scala version.
 wenzelm parents: diff
changeset | 276 | |
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 277 | |
| 60215 | 278 | sealed case class Token(kind: Token.Kind.Value, source: String) | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 279 | {
 | 
| 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 280 | def is_command: Boolean = kind == Token.Kind.COMMAND | 
| 63446 | 281 | def is_command(name: String): Boolean = kind == Token.Kind.COMMAND && source == name | 
| 48718 | 282 | def is_keyword: Boolean = kind == Token.Kind.KEYWORD | 
| 63446 | 283 | def is_keyword(name: String): Boolean = kind == Token.Kind.KEYWORD && source == name | 
| 63450 | 284 | def is_keyword(name: Char): Boolean = | 
| 285 | kind == Token.Kind.KEYWORD && source.length == 1 && source(0) == name | |
| 55505 | 286 | def is_delimiter: Boolean = is_keyword && !Symbol.is_ascii_identifier(source) | 
| 48365 
d88aefda01c4
basic support for stand-alone options with external string representation;
 wenzelm parents: 
48349diff
changeset | 287 | def is_ident: Boolean = kind == Token.Kind.IDENT | 
| 48605 
e777363440d6
allow negative int values as well, according to real = int | float;
 wenzelm parents: 
48599diff
changeset | 288 | def is_sym_ident: Boolean = kind == Token.Kind.SYM_IDENT | 
| 46943 | 289 | def is_string: Boolean = kind == Token.Kind.STRING | 
| 48349 
a78e5d399599
support Session.Queue with ordering and dependencies;
 wenzelm parents: 
48335diff
changeset | 290 | def is_nat: Boolean = kind == Token.Kind.NAT | 
| 48365 
d88aefda01c4
basic support for stand-alone options with external string representation;
 wenzelm parents: 
48349diff
changeset | 291 | def is_float: Boolean = kind == Token.Kind.FLOAT | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 292 | def is_name: Boolean = | 
| 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 293 | kind == Token.Kind.IDENT || | 
| 62969 | 294 | kind == Token.Kind.LONG_IDENT || | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 295 | kind == Token.Kind.SYM_IDENT || | 
| 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 296 | kind == Token.Kind.STRING || | 
| 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 297 | kind == Token.Kind.NAT | 
| 64471 
c40c2975fb02
more uniform path syntax, as in ML (see 5a7c919a4ada);
 wenzelm parents: 
63477diff
changeset | 298 | def is_embedded: Boolean = is_name || | 
| 
c40c2975fb02
more uniform path syntax, as in ML (see 5a7c919a4ada);
 wenzelm parents: 
63477diff
changeset | 299 | kind == Token.Kind.CARTOUCHE || | 
| 
c40c2975fb02
more uniform path syntax, as in ML (see 5a7c919a4ada);
 wenzelm parents: 
63477diff
changeset | 300 | kind == Token.Kind.VAR || | 
| 
c40c2975fb02
more uniform path syntax, as in ML (see 5a7c919a4ada);
 wenzelm parents: 
63477diff
changeset | 301 | kind == Token.Kind.TYPE_IDENT || | 
| 
c40c2975fb02
more uniform path syntax, as in ML (see 5a7c919a4ada);
 wenzelm parents: 
63477diff
changeset | 302 | kind == Token.Kind.TYPE_VAR | 
| 67432 | 303 | def is_text: Boolean = is_embedded || kind == Token.Kind.VERBATIM | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 304 | def is_space: Boolean = kind == Token.Kind.SPACE | 
| 67441 | 305 | def is_informal_comment: Boolean = kind == Token.Kind.INFORMAL_COMMENT | 
| 306 | def is_formal_comment: Boolean = kind == Token.Kind.FORMAL_COMMENT | |
| 69891 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69603diff
changeset | 307 | def is_marker: Boolean = | 
| 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69603diff
changeset | 308 | kind == Token.Kind.FORMAL_COMMENT && | 
| 
def3ec9cdb7e
document markers are formal comments, and may thus occur anywhere in the command-span;
 wenzelm parents: 
69603diff
changeset | 309 | (source.startsWith(Symbol.marker) || source.startsWith(Symbol.marker_decoded)) | 
| 67441 | 310 | def is_comment: Boolean = is_informal_comment || is_formal_comment | 
| 68729 
3a02b424d5fb
clarified ignored span / core range: include formal comments, e.g. relevant for error messages from antiquotations;
 wenzelm parents: 
67895diff
changeset | 311 | def is_ignored: Boolean = is_space || is_informal_comment | 
| 48599 | 312 | def is_proper: Boolean = !is_space && !is_comment | 
| 48754 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 313 | def is_error: Boolean = kind == Token.Kind.ERROR | 
| 47012 
0e246130486b
clarified command span classification: strict Command.is_command, permissive Command.name;
 wenzelm parents: 
46943diff
changeset | 314 | def is_unparsed: Boolean = kind == Token.Kind.UNPARSED | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 315 | |
| 48754 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 316 | def is_unfinished: Boolean = is_error && | 
| 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 317 |    (source.startsWith("\"") ||
 | 
| 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 318 |     source.startsWith("`") ||
 | 
| 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 319 |     source.startsWith("{*") ||
 | 
| 57021 
6a8fd2ac6756
explicit treatment of unfinished cartouches, which is important for Thy_Syntax.consolidate_spans;
 wenzelm parents: 
56998diff
changeset | 320 |     source.startsWith("(*") ||
 | 
| 
6a8fd2ac6756
explicit treatment of unfinished cartouches, which is important for Thy_Syntax.consolidate_spans;
 wenzelm parents: 
56998diff
changeset | 321 | source.startsWith(Symbol.open) || | 
| 
6a8fd2ac6756
explicit treatment of unfinished cartouches, which is important for Thy_Syntax.consolidate_spans;
 wenzelm parents: 
56998diff
changeset | 322 | source.startsWith(Symbol.open_decoded)) | 
| 48754 
c2c1e5944536
clarified undefined, unparsed, unfinished command spans;
 wenzelm parents: 
48718diff
changeset | 323 | |
| 71601 | 324 | def is_open_bracket: Boolean = is_keyword && Word.open_brackets.exists(is_keyword) | 
| 325 | def is_close_bracket: Boolean = is_keyword && Word.close_brackets.exists(is_keyword) | |
| 63450 | 326 | |
| 63446 | 327 |   def is_begin: Boolean = is_keyword("begin")
 | 
| 328 |   def is_end: Boolean = is_command("end")
 | |
| 63477 
f5c81436b930
clarified indentation: 'begin' is treated like a separate command without indent;
 wenzelm parents: 
63450diff
changeset | 329 | def is_begin_or_command: Boolean = is_begin || is_command | 
| 43611 | 330 | |
| 72669 | 331 | def symbol_length: Symbol.Offset = Symbol.iterator(source).length | 
| 332 | ||
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 333 | def content: String = | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55137diff
changeset | 334 |     if (kind == Token.Kind.STRING) Scan.Parsers.quoted_content("\"", source)
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55137diff
changeset | 335 |     else if (kind == Token.Kind.ALT_STRING) Scan.Parsers.quoted_content("`", source)
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55137diff
changeset | 336 | else if (kind == Token.Kind.VERBATIM) Scan.Parsers.verbatim_content(source) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55137diff
changeset | 337 | else if (kind == Token.Kind.CARTOUCHE) Scan.Parsers.cartouche_content(source) | 
| 67439 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 338 | else if (kind == Token.Kind.INFORMAL_COMMENT) Scan.Parsers.comment_content(source) | 
| 
78759a7bd874
more uniform support for formal comments in outer syntax, notably \<^cancel> and \<^latex>;
 wenzelm parents: 
67432diff
changeset | 339 | else if (kind == Token.Kind.FORMAL_COMMENT) Comment.content(source) | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 340 | else source | 
| 66914 | 341 | |
| 66915 
f4259adc928a
disallow blanks, relevant for session_name / theory_name e.g. in build log files;
 wenzelm parents: 
66914diff
changeset | 342 | def is_system_name: Boolean = | 
| 
f4259adc928a
disallow blanks, relevant for session_name / theory_name e.g. in build log files;
 wenzelm parents: 
66914diff
changeset | 343 |   {
 | 
| 
f4259adc928a
disallow blanks, relevant for session_name / theory_name e.g. in build log files;
 wenzelm parents: 
66914diff
changeset | 344 | val s = content | 
| 69551 
adb52af5ba55
exclude file name components that are special on Windows;
 wenzelm parents: 
68730diff
changeset | 345 | is_name && Path.is_wellformed(s) && | 
| 71601 | 346 | !s.exists(Symbol.is_ascii_blank) && | 
| 69551 
adb52af5ba55
exclude file name components that are special on Windows;
 wenzelm parents: 
68730diff
changeset | 347 | !Path.is_reserved(s) | 
| 66915 
f4259adc928a
disallow blanks, relevant for session_name / theory_name e.g. in build log files;
 wenzelm parents: 
66914diff
changeset | 348 | } | 
| 36956 
21be4832c362
renamed class Outer_Lex to Token and Token_Kind to Token.Kind;
 wenzelm parents: 
34311diff
changeset | 349 | } |