src/Pure/General/scan.scala
author wenzelm
Fri Feb 14 15:42:27 2014 +0100 (2014-02-14)
changeset 55492 28d4db6c6e79
parent 55034 04b443d54aee
child 55494 009b71c1ed23
permissions -rw-r--r--
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.{IndexedSeq, TraversableOnce}
    11 import scala.collection.immutable.PagedSeq
    12 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    13 import scala.util.parsing.combinator.RegexParsers
    14 
    15 import java.io.{File => JFile, BufferedInputStream, FileInputStream}
    16 
    17 
    18 object Scan
    19 {
    20   /** context of partial scans **/
    21 
  // State of a partial (incremental) scan: either no delimited token is left
  // open (Finished), or scanning stopped inside one of the delimited forms.
  sealed abstract class Context
  case object Finished extends Context
  // inside a string delimited by the given quote symbol
  case class Quoted(quote: String) extends Context
  // inside verbatim text {* ... *}
  case object Verbatim extends Context
  // inside a text cartouche, nested to the given depth
  case class Cartouche(depth: Int) extends Context
  // inside an ML-style comment (* ... *), nested to the given depth
  case class Comment(depth: Int) extends Context
    28 
    29 
    30 
    31   /** parser combinators **/
    32 
  // canonical stateless instance of the Parsers trait below
  object Parsers extends Parsers
    34 
    35   trait Parsers extends RegexParsers
    36   {
    // no implicit whitespace skipping: blanks are scanned as explicit tokens
    override val whiteSpace = "".r
    38 
    39 
    40     /* optional termination */
    41 
    /* Apply p, or -- exactly at the end of input -- succeed with None.
       Used for closing delimiters that may be missing in a partial scan. */
    def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
    {
      val present = p ^^ (Some(_))
      val at_end = """\z""".r ^^ (_ => None)
      present | at_end
    }
    44 
    45 
    46     /* symbol range */
    47 
    /* Scan between min_count and max_count adjacent symbols satisfying pred,
       returning the matched text; fails if fewer than min_count match. */
    def symbol_range(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
      new Parser[String]
      {
        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length
          // matcher(i, end) yields the length of the symbol starting at i
          // (assumed >= 1 -- relies on Symbol.Matcher's contract; TODO confirm)
          val matcher = new Symbol.Matcher(in.source)

          var i = start
          var count = 0
          var finished = false
          while (!finished && i < end && count < max_count) {
            val n = matcher(i, end)
            val sym = in.source.subSequence(i, i + n).toString
            if (pred(sym)) { i += n; count += 1 }
            else finished = true
          }
          if (count < min_count) Failure("bad input", in)
          else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
        }
      }.named("symbol_range")
    70 
    // exactly one symbol satisfying pred
    def one(pred: Symbol.Symbol => Boolean): Parser[String] =
      symbol_range(pred, 1, 1)

    // zero or more symbols satisfying pred
    def many(pred: Symbol.Symbol => Boolean): Parser[String] =
      symbol_range(pred, 0, Integer.MAX_VALUE)

    // one or more symbols satisfying pred
    def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
      symbol_range(pred, 1, Integer.MAX_VALUE)
    79 
    80 
    81     /* quoted strings */
    82 
    /* Body of a quoted string: runs of plain symbols, escaped quote or
       backslash, or a three-digit decimal escape with value <= 255. */
    private def quoted_body(quote: Symbol.Symbol): Parser[String] =
    {
      rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
        (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    }

    /* A complete quoted string, including both delimiting quotes. */
    private def quoted(quote: Symbol.Symbol): Parser[String] =
    {
      quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
    }.named("quoted")

    /* Content of a well-formed quoted string: strip the delimiters and,
       if any backslash occurs, decode the escapes.  Precondition: source
       parses as quoted(quote). */
    def quoted_content(quote: Symbol.Symbol, source: String): String =
    {
      require(parseAll(quoted(quote), source).successful)
      val body = source.substring(1, source.length - 1)
      if (body.exists(_ == '\\')) {
        val content =
          rep(many1(sym => sym != quote && sym != "\\") |
              "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
        parseAll(content ^^ (_.mkString), body).get
      }
      else body
    }
   106 
    /* Scan a quoted string incrementally: start a new string when ctxt is
       Finished, or continue one already open in a matching Quoted context;
       the closing quote may be missing at the end of input (partial scan). */
    def quoted_context(quote: Symbol.Symbol, ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          quote ~ quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
              case x ~ y ~ None => (x + y, Quoted(quote)) }
        case Quoted(q) if q == quote =>
          quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, ctxt) }
        case _ => failure("")
      }
    }.named("quoted_context")

    // error recovery: opening quote and body, without the closing quote
    def recover_quoted(quote: Symbol.Symbol): Parser[String] =
      quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
   124 
   125 
   126     /* verbatim text */
   127 
    // body of verbatim text: anything up to (but excluding) the closing "*}"
    private def verbatim_body: Parser[String] =
      rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)

    // complete verbatim text, including the {* and *} delimiters
    private def verbatim: Parser[String] =
    {
      "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
    }.named("verbatim")

    /* Content of well-formed verbatim text: strip the two-character
       delimiters.  Precondition: source parses as verbatim. */
    def verbatim_content(source: String): String =
    {
      require(parseAll(verbatim, source).successful)
      source.substring(2, source.length - 2)
    }
   141 
   142     def verbatim_context(ctxt: Context): Parser[(String, Context)] =
   143     {
   144       ctxt match {
   145         case Finished =>
   146           "{*" ~ verbatim_body ~ opt_term("*}") ^^
   147             { case x ~ y ~ Some(z) => (x + y + z, Finished)
   148               case x ~ y ~ None => (x + y, Verbatim) }
   149         case Verbatim =>
   150           verbatim_body ~ opt_term("*}") ^^
   151             { case x ~ Some(y) => (x + y, Finished)
   152               case x ~ None => (x, Verbatim) }
   153         case _ => failure("")
   154       }
   155     }.named("verbatim_context")
   156 
    // error recovery: opening "{*" and body, without the closing "*}"
    val recover_verbatim: Parser[String] =
      "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
   159 
   160 
   161     /* nested text cartouches */
   162 
    /* Scan text cartouches, starting at nesting depth >= 0; returns the
       scanned text together with the resulting depth (0 means balanced). */
    private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      def apply(in: Input) =
      {
        val start = in.offset
        val end = in.source.length
        val matcher = new Symbol.Matcher(in.source)

        var i = start
        var d = depth
        var finished = false
        while (!finished && i < end) {
          val n = matcher(i, end)
          val sym = in.source.subSequence(i, i + n).toString
          // an opening symbol always continues; other symbols continue only
          // inside a cartouche (d > 0); at depth 0 any other symbol stops
          if (Symbol.is_open(sym)) { i += n; d += 1 }
          else if (d > 0) { i += n; if (Symbol.is_close(sym)) d -= 1 }
          else finished = true
        }
        if (i == start) Failure("bad input", in)
        else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start))
      }
    }.named("cartouche_depth")

    // a complete, balanced cartouche
    def cartouche: Parser[String] =
      cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
   190 
   191     def cartouche_context(ctxt: Context): Parser[(String, Context)] =
   192     {
   193       val depth =
   194         ctxt match {
   195           case Finished => 0
   196           case Cartouche(d) => d
   197           case _ => -1
   198         }
   199       if (depth >= 0)
   200         cartouche_depth(depth) ^^
   201           { case (x, 0) => (x, Finished)
   202             case (x, d) => (x, Cartouche(d)) }
   203       else failure("")
   204     }
   205 
    // error recovery: possibly unbalanced cartouche text
    val recover_cartouche: Parser[String] =
      cartouche_depth(0) ^^ (_._1)

    /* Content of a cartouche: strip one opening and one closing delimiter,
       accepting both decoded and encoded symbol forms; error() on malformed
       input. */
    def cartouche_content(source: String): String =
    {
      def err(): Nothing = error("Malformed text cartouche: " + quote(source))
      val source1 =
        Library.try_unprefix(Symbol.open_decoded, source) orElse
          Library.try_unprefix(Symbol.open, source) getOrElse err()
      Library.try_unsuffix(Symbol.close_decoded, source1) orElse
        Library.try_unsuffix(Symbol.close, source1) getOrElse err()
    }
   218 
   219 
   220     /* nested comments */
   221 
    /* Scan ML-style comments (* ... *), starting at nesting depth >= 0;
       returns the scanned text with the resulting depth (0 means balanced). */
    private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      // comment text containing no "(*" or "*)" delimiter
      val comment_text =
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)

      def apply(in: Input) =
      {
        var rest = in
        // apply p at the current position, advancing rest on success
        def try_parse[A](p: Parser[A]): Boolean =
        {
          parse(p ^^^ (), rest) match {
            case Success(_, next) => { rest = next; true }
            case _ => false
          }
        }
        var d = depth
        var finished = false
        while (!finished) {
          // at depth 0 only an opening "(*" may continue the scan
          if (try_parse("(*")) d += 1
          else if (d > 0 && try_parse("*)")) d -= 1
          else if (d == 0 || !try_parse(comment_text)) finished = true
        }
        if (in.offset < rest.offset)
          Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
        else Failure("comment expected", in)
      }
    }.named("comment_depth")

    // a complete, balanced comment
    def comment: Parser[String] =
      comment_depth(0) ^? { case (x, d) if d == 0 => x }
   254 
   255     def comment_context(ctxt: Context): Parser[(String, Context)] =
   256     {
   257       val depth =
   258         ctxt match {
   259           case Finished => 0
   260           case Comment(d) => d
   261           case _ => -1
   262         }
   263       if (depth >= 0)
   264         comment_depth(depth) ^^
   265           { case (x, 0) => (x, Finished)
   266             case (x, d) => (x, Comment(d)) }
   267       else failure("")
   268     }
   269 
    // error recovery: possibly unbalanced comment text
    val recover_comment: Parser[String] =
      comment_depth(0) ^^ (_._1)

    /* Content of a well-formed comment: strip the two-character delimiters.
       Precondition: source parses as comment. */
    def comment_content(source: String): String =
    {
      require(parseAll(comment, source).successful)
      source.substring(2, source.length - 2)
    }
   278 
   279 
   280     /* keyword */
   281 
    /* Longest keyword of the lexicon matching at the current input position;
       fails if no keyword matches. */
    def keyword(lexicon: Lexicon): Parser[String] = new Parser[String]
    {
      def apply(in: Input) =
      {
        val result = lexicon.scan(in)
        if (result.isEmpty) Failure("keyword expected", in)
        else Success(result, in.drop(result.length))
      }
    }.named("keyword")
   291 
   292 
   293     /* outer syntax tokens */
   294 
   295     private def delimited_token: Parser[Token] =
   296     {
   297       val string = quoted("\"") ^^ (x => Token(Token.Kind.STRING, x))
   298       val alt_string = quoted("`") ^^ (x => Token(Token.Kind.ALT_STRING, x))
   299       val verb = verbatim ^^ (x => Token(Token.Kind.VERBATIM, x))
   300       val cart = cartouche ^^ (x => Token(Token.Kind.CARTOUCHE, x))
   301       val cmt = comment ^^ (x => Token(Token.Kind.COMMENT, x))
   302 
   303       string | (alt_string | (verb | (cart | cmt)))
   304     }
   305 
    /* Any token that is not self-delimited: identifiers, variables, numbers,
       symbolic identifiers, keywords/commands, white space -- with error
       recovery for unclosed delimited tokens and otherwise-bad input. */
    private def other_token(lexicon: Lexicon, is_command: String => Boolean)
      : Parser[Token] =
    {
      val letdigs1 = many1(Symbol.is_letdig)
      // subscript marker, in decoded or encoded form
      val sub = one(s => s == Symbol.sub_decoded || s == "\\<^sub>")
      // identifier: letter followed by letter/digit groups, possibly with
      // subscripted continuations
      val id =
        one(Symbol.is_letter) ~
          (rep(letdigs1 | (sub ~ letdigs1 ^^ { case x ~ y => x + y })) ^^ (_.mkString)) ^^
        { case x ~ y => x + y }

      val nat = many1(Symbol.is_digit)
      val natdot = nat ~ "." ~ nat ^^ { case x ~ y ~ z => x + y + z }
      // identifier with optional ".nat" index, e.g. "x.1"
      val id_nat = id ~ opt("." ~ nat) ^^ { case x ~ Some(y ~ z) => x + y + z case x ~ None => x }

      // short identifier, or long (dot-separated) identifier
      val ident = id ~ rep("." ~> id) ^^
        { case x ~ Nil => Token(Token.Kind.IDENT, x)
          case x ~ ys => Token(Token.Kind.LONG_IDENT, (x :: ys).mkString(".")) }

      val var_ = "?" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.VAR, x + y) }
      val type_ident = "'" ~ id ^^ { case x ~ y => Token(Token.Kind.TYPE_IDENT, x + y) }
      val type_var = "?'" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.TYPE_VAR, x + y) }
      val nat_ = nat ^^ (x => Token(Token.Kind.NAT, x))
      // float: "nat.nat" with optional leading minus
      val float =
        ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))

      val sym_ident =
        (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^
        (x => Token(Token.Kind.SYM_IDENT, x))

      // lexicon keyword, classified as command or plain keyword
      val command_keyword =
        keyword(lexicon) ^^
          (x => Token(if (is_command(x)) Token.Kind.COMMAND else Token.Kind.KEYWORD, x))

      val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))

      // unclosed string/verbatim/cartouche/comment becomes an error token
      val recover_delimited =
        (recover_quoted("\"") |
          (recover_quoted("`") |
            (recover_verbatim |
              (recover_cartouche | recover_comment)))) ^^ (x => Token(Token.Kind.ERROR, x))

      val bad = one(_ => true) ^^ (x => Token(Token.Kind.ERROR, x))

      // ||| takes the longer of identifier-like match vs. keyword match
      space | (recover_delimited |
        (((ident | (var_ | (type_ident | (type_var | (float | (nat_ | sym_ident)))))) |||
          command_keyword) | bad))
    }
   353 
    /* Scan a single outer syntax token (complete input). */
    def token(lexicon: Lexicon, is_command: String => Boolean): Parser[Token] =
      delimited_token | other_token(lexicon, is_command)
   356 
    /* Scan a single token incrementally, resuming from -- and returning --
       the context of a partial scan. */
    def token_context(lexicon: Lexicon, is_command: String => Boolean, ctxt: Context)
      : Parser[(Token, Context)] =
    {
      val string =
        quoted_context("\"", ctxt) ^^ { case (x, c) => (Token(Token.Kind.STRING, x), c) }
      val alt_string =
        quoted_context("`", ctxt) ^^ { case (x, c) => (Token(Token.Kind.ALT_STRING, x), c) }
      val verb = verbatim_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.VERBATIM, x), c) }
      val cart = cartouche_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.CARTOUCHE, x), c) }
      val cmt = comment_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.COMMENT, x), c) }
      // any other token always ends in a Finished context
      val other = other_token(lexicon, is_command) ^^ { case x => (x, Finished) }

      string | (alt_string | (verb | (cart | (cmt | other))))
    }
   371   }
   372 
   373 
   374 
   375   /** Lexicon -- position tree **/
   376 
  object Lexicon
  {
    /* representation */

    // Prefix tree: each branch maps the next character to the keyword that
    // ends at that node ("" if none) and the subtree of longer keywords.
    private sealed case class Tree(val branches: Map[Char, (String, Tree)])
    private val empty_tree = Tree(Map())

    val empty: Lexicon = new Lexicon(empty_tree)
    def apply(elems: String*): Lexicon = empty ++ elems
  }
   387 
   388   final class Lexicon private(rep: Lexicon.Tree)
   389   {
   390     /* auxiliary operations */
   391 
    // Collect all complete entries of the given (sub)tree, prepending them
    // to result; a node's string is "" unless an entry ends at that node.
    private def content(tree: Lexicon.Tree, result: List[String]): List[String] =
      (result /: tree.branches.toList) ((res, entry) =>
        entry match { case (_, (s, tr)) =>
          if (s.isEmpty) content(tr, res) else content(tr, s :: res) })
   396 
    /* Follow str through the tree: Some((tip, subtree)) if str is a path in
       the tree, where tip records whether a complete entry ends exactly at
       str; None if str leaves the tree. */
    private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] =
    {
      val len = str.length
      def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] =
      {
        if (i < len) {
          tree.branches.get(str.charAt(i)) match {
            case Some((s, tr)) => look(tr, !s.isEmpty, i + 1)
            case None => None
          }
        } else Some(tip, tree)
      }
      look(rep, false, 0)
    }
   411 
   412     def completions(str: CharSequence): List[String] =
   413       lookup(str) match {
   414         case Some((true, tree)) => content(tree, List(str.toString))
   415         case Some((false, tree)) => content(tree, Nil)
   416         case None => Nil
   417       }
   418 
   419 
    /* pseudo Set methods */

    // all entries in sorted order
    def iterator: Iterator[String] = content(rep, Nil).sorted.iterator

    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")

    def empty: Lexicon = Lexicon.empty
    def isEmpty: Boolean = rep.branches.isEmpty
   428 
   429     def contains(elem: String): Boolean =
   430       lookup(elem) match {
   431         case Some((tip, _)) => tip
   432         case _ => false
   433       }
   434 
   435 
   436     /* add elements */
   437 
    /* Add elem, rebuilding the branch along its characters; the node where
       the element ends stores the full element string. */
    def + (elem: String): Lexicon =
      if (contains(elem)) this
      else {
        val len = elem.length
        def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree =
          if (i < len) {
            val c = elem.charAt(i)
            val end = (i + 1 == len)
            tree.branches.get(c) match {
              case Some((s, tr)) =>
                // existing branch: keep its entry string unless elem ends here
                Lexicon.Tree(tree.branches +
                  (c -> (if (end) elem else s, extend(tr, i + 1))))
              case None =>
                // fresh branch: entry string only if elem ends here
                Lexicon.Tree(tree.branches +
                  (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))
            }
          }
          else tree
        new Lexicon(extend(rep, 0))
      }
   458 
    // add all elements (left fold over +)
    def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _)
   460 
   461 
   462     /* scan */
   463 
    /* Scan the longest lexicon entry that is a prefix of the input at its
       current position; "" if no entry matches. */
    def scan(in: Reader[Char]): String =
    {
      val source = in.source
      val offset = in.offset
      val len = source.length - offset

      // result holds the longest complete entry seen so far
      def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String =
      {
        if (i < len) {
          tree.branches.get(source.charAt(offset + i)) match {
            case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1)
            case None => result
          }
        }
        else result
      }
      scan_tree(rep, "", 0)
    }
   482   }
   483 }