src/Pure/General/scan.scala
author wenzelm
Fri Jul 20 22:29:25 2012 +0200 (2012-07-20)
changeset 48409 0d2114eb412a
parent 48342 4a8f06cbf8bb
child 48743 a72f8ffecf31
permissions -rw-r--r--
more explicit java.io.{File => JFile};
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.{IndexedSeq, TraversableOnce}
    11 import scala.collection.immutable.PagedSeq
    12 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    13 import scala.util.parsing.combinator.RegexParsers
    14 
    15 import java.io.{File => JFile, BufferedInputStream, FileInputStream}
    16 
    17 
    18 object Scan
    19 {
    20   /** context of partial scans **/
    21 
  /* Scan context: records where a previous partial scan stopped, so that
     scanning can resume in the middle of an open token (see *_context parsers). */
  sealed abstract class Context
  case object Finished extends Context                // no token left open
  case class Quoted(quote: String) extends Context    // inside a quoted string with this delimiter
  case object Verbatim extends Context                // inside {* ... *} verbatim text
  case class Comment(depth: Int) extends Context      // inside (* ... *) comment at this nesting depth
    27 
    28 
    29 
    30   /** Lexicon -- position tree **/
    31 
  object Lexicon
  {
    /* representation */

    // Position tree: each branch maps the next character to the pair of
    // (longest keyword completed at that node, or "" if none; subtree of continuations).
    sealed case class Tree(val branches: Map[Char, (String, Tree)])
    private val empty_tree = Tree(Map())  // shared with companion class Lexicon

    val empty: Lexicon = new Lexicon(empty_tree)
    def apply(elems: String*): Lexicon = empty ++ elems
  }
    42 
    43   final class Lexicon private(main_tree: Lexicon.Tree) extends RegexParsers
    44   {
    45     import Lexicon.Tree
    46 
    47 
    48     /* auxiliary operations */
    49 
    // Collect every stored keyword in the given subtree, prepending to result.
    private def content(tree: Tree, result: List[String]): List[String] =
      tree.branches.toList.foldLeft(result) {
        case (acc, (_, (s, tr))) =>
          if (s.isEmpty) content(tr, acc) else content(tr, s :: acc)
      }
    54 
    // Follow str character by character through the position tree: yields
    // (whether str itself is a stored keyword, subtree of continuations),
    // or None if no such path exists.
    private def lookup(str: CharSequence): Option[(Boolean, Tree)] =
    {
      val len = str.length
      def descend(tree: Tree, tip: Boolean, i: Int): Option[(Boolean, Tree)] =
        if (i == len) Some((tip, tree))
        else
          tree.branches.get(str.charAt(i)) match {
            case Some((s, tr)) => descend(tr, s.nonEmpty, i + 1)
            case None => None
          }
      descend(main_tree, false, 0)
    }
    69 
    // All stored keywords that extend str, including str itself if stored.
    def completions(str: CharSequence): List[String] =
      lookup(str) match {
        case None => Nil
        case Some((tip, tree)) =>
          content(tree, if (tip) List(str.toString) else Nil)
      }
    76 
    77 
    /* pseudo Set methods */

    // all keywords in sorted order
    def iterator: Iterator[String] = content(main_tree, Nil).sorted.iterator

    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")

    def empty: Lexicon = Lexicon.empty
    def isEmpty: Boolean = main_tree.branches.isEmpty

    // membership: the lookup path must end on a completed keyword tip
    def contains(elem: String): Boolean = lookup(elem).exists(_._1)
    92 
    93 
    /* add elements */

    // Insert elem into the position tree; unchanged lexicon if already present.
    def + (elem: String): Lexicon =
      if (contains(elem)) this
      else {
        val len = elem.length
        def extend(tree: Tree, i: Int): Tree =
          if (i == len) tree
          else {
            val c = elem.charAt(i)
            val last = (i + 1 == len)
            // existing branch keeps its stored keyword, fresh branch starts empty
            val (stored, subtree) = tree.branches.getOrElse(c, ("", Lexicon.empty_tree))
            val entry = (if (last) elem else stored, extend(subtree, i + 1))
            Tree(tree.branches + (c -> entry))
          }
        new Lexicon(extend(main_tree, 0))
      }

    def ++ (elems: TraversableOnce[String]): Lexicon = elems.foldLeft(this)(_ + _)
   118 
   119 
   120 
    /** RegexParsers methods **/

    // disable automatic whitespace skipping: token parsers handle blanks themselves
    override val whiteSpace = "".r


    /* optional termination */

    // parse p, or succeed with None exactly at the end of input ("""\z""")
    def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
      p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
   130 
   131 
    /* keywords from lexicon */

    // longest-match scan of a lexicon keyword at the current input position
    def keyword: Parser[String] = new Parser[String]
    {
      def apply(in: Input) =
      {
        val source = in.source
        val offset = in.offset
        val len = source.length - offset

        // walk the position tree; result is the longest keyword completed so far
        def scan(tree: Tree, result: String, i: Int): String =
        {
          if (i < len) {
            tree.branches.get(source.charAt(offset + i)) match {
              case Some((s, tr)) => scan(tr, if (s.isEmpty) result else s, i + 1)
              case None => result
            }
          }
          else result
        }
        val result = scan(main_tree, "", 0)
        if (result.isEmpty) Failure("keyword expected", in)
        else Success(result, in.drop(result.length))
      }
    }.named("keyword")
   157 
   158 
    /* symbol range */

    // scan between min_count and max_count adjacent symbols satisfying pred
    def symbol_range(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
      new Parser[String]
      {
        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length
          val matcher = new Symbol.Matcher(in.source)

          var i = start
          var count = 0
          var finished = false
          while (!finished) {
            if (i < end && count < max_count) {
              // assumes matcher(i, end) yields the char length of the symbol at i
              // (symbols may span several chars) -- see Symbol.Matcher
              val n = matcher(i, end)
              val sym = in.source.subSequence(i, i + n).toString
              if (pred(sym)) { i += n; count += 1 }
              else finished = true
            }
            else finished = true
          }
          if (count < min_count) Failure("bad input", in)
          else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
        }
      }.named("symbol_range")

    // exactly one symbol
    def one(pred: Symbol.Symbol => Boolean): Parser[String] =
      symbol_range(pred, 1, 1)

    // zero or more symbols
    def many(pred: Symbol.Symbol => Boolean): Parser[String] =
      symbol_range(pred, 0, Integer.MAX_VALUE)

    // one or more symbols
    def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
      symbol_range(pred, 1, Integer.MAX_VALUE)
   195 
   196 
    /* quoted strings */

    // body of a quoted string: runs of plain symbols, escaped quote,
    // escaped backslash, or 3-digit decimal escapes with value <= 255
    private def quoted_body(quote: Symbol.Symbol): Parser[String] =
    {
      rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
        (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    }

    // complete quoted string including both delimiters
    private def quoted(quote: Symbol.Symbol): Parser[String] =
    {
      quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
    }.named("quoted")

    // strip delimiters and decode escapes; source must be a valid quoted string
    def quoted_content(quote: Symbol.Symbol, source: String): String =
    {
      require(parseAll(quoted(quote), source).successful)
      val body = source.substring(1, source.length - 1)
      if (body.exists(_ == '\\')) {
        // re-parse the body, replacing each escape by the character it denotes
        val content =
          rep(many1(sym => sym != quote && sym != "\\") |
              "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
        parseAll(content ^^ (_.mkString), body).get
      }
      else body
    }
   222 
    // scan a (partial) quoted string relative to a previous scan context:
    // either open a new string, or continue one already open with this quote
    def quoted_context(quote: Symbol.Symbol, ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          quote ~ quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)   // closed within this input
              case x ~ y ~ None => (x + y, Quoted(quote)) }   // still open at end of input
        case Quoted(q) if q == quote =>
          quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, ctxt) }
        case _ => failure("")
      }
    }.named("quoted_context")
   237 
   238 
    /* verbatim text */

    // body of verbatim text: anything except the closing "*}"
    private def verbatim_body: Parser[String] =
      rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)

    // complete verbatim text including the {* *} delimiters
    private def verbatim: Parser[String] =
    {
      "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
    }.named("verbatim")

    // strip the {* *} delimiters; source must be valid verbatim text
    def verbatim_content(source: String): String =
    {
      require(parseAll(verbatim, source).successful)
      source.substring(2, source.length - 2)
    }
   254 
    // scan (partial) verbatim text relative to a previous scan context:
    // either open new verbatim text, or continue text already open
    def verbatim_context(ctxt: Context): Parser[(String, Context)] =
    {
      ctxt match {
        case Finished =>
          "{*" ~ verbatim_body ~ opt_term("*}") ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)   // closed within this input
              case x ~ y ~ None => (x + y, Verbatim) }        // still open at end of input
        case Verbatim =>
          verbatim_body ~ opt_term("*}") ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, Verbatim) }
        case _ => failure("")
      }
    }.named("verbatim_context")
   269 
   270 
    /* nested comments */

    // scan a (possibly unbalanced) nested comment, starting at the given
    // nesting depth; result is the consumed text with the final depth
    private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      // plain comment text: neither a "*)" nor a "(*" delimiter
      val comment_text =
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)

      def apply(in: Input) =
      {
        var rest = in
        // apply p at the current position, advancing rest on success
        def try_parse[A](p: Parser[A]): Boolean =
        {
          parse(p ^^^ (), rest) match {
            case Success(_, next) => { rest = next; true }
            case _ => false
          }
        }
        // consume openers/closers/text until depth 0 or no progress;
        // at depth 0 only an opening "(*" is accepted
        var d = depth
        var finished = false
        while (!finished) {
          if (try_parse("(*")) d += 1
          else if (d > 0 && try_parse("*)")) d -= 1
          else if (d == 0 || !try_parse(comment_text)) finished = true
        }
        if (in.offset < rest.offset)
          Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
        else Failure("comment expected", in)
      }
    }.named("comment_depth")
   302 
    // complete comment: balanced back to depth 0
    def comment: Parser[String] =
      comment_depth(0) ^? { case (x, d) if d == 0 => x }

    // scan a (partial) comment relative to a previous scan context
    def comment_context(ctxt: Context): Parser[(String, Context)] =
    {
      val depth =
        ctxt match {
          case Finished => 0
          case Comment(d) => d
          case _ => -1  // wrong kind of context: fail below
        }
      if (depth >= 0)
        comment_depth(depth) ^^
          { case (x, 0) => (x, Finished)
            case (x, d) => (x, Comment(d)) }
      else failure("")
    }

    // strip the outermost (* *) delimiters; source must be a balanced comment
    def comment_content(source: String): String =
    {
      require(parseAll(comment, source).successful)
      source.substring(2, source.length - 2)
    }
   326 
   327 
    /* outer syntax tokens */

    // tokens with explicit delimiters: strings, verbatim text, comments
    private def delimited_token: Parser[Token] =
    {
      val string = quoted("\"") ^^ (x => Token(Token.Kind.STRING, x))
      val alt_string = quoted("`") ^^ (x => Token(Token.Kind.ALT_STRING, x))
      val verb = verbatim ^^ (x => Token(Token.Kind.VERBATIM, x))
      val cmt = comment ^^ (x => Token(Token.Kind.COMMENT, x))

      string | (alt_string | (verb | cmt))
    }
   339 
    // non-delimited tokens: identifiers, variables, numerals, symbolic
    // identifiers, whitespace, keywords/commands, with unparsed junk as
    // last resort
    private def other_token(is_command: String => Boolean)
      : Parser[Token] =
    {
      /* basic character classes */
      val id = one(Symbol.is_letter) ~ many(Symbol.is_letdig) ^^ { case x ~ y => x + y }
      val nat = many1(Symbol.is_digit)
      val natdot = nat ~ "." ~ nat ^^ { case x ~ y ~ z => x + y + z }
      val id_nat = id ~ opt("." ~ nat) ^^ { case x ~ Some(y ~ z) => x + y + z case x ~ None => x }

      // plain vs. dot-separated long identifier
      val ident = id ~ rep("." ~> id) ^^
        { case x ~ Nil => Token(Token.Kind.IDENT, x)
          case x ~ ys => Token(Token.Kind.LONG_IDENT, (x :: ys).mkString(".")) }

      val var_ = "?" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.VAR, x + y) }
      val type_ident = "'" ~ id ^^ { case x ~ y => Token(Token.Kind.TYPE_IDENT, x + y) }
      val type_var = "?'" ~ id_nat ^^ { case x ~ y => Token(Token.Kind.TYPE_VAR, x + y) }
      val nat_ = nat ^^ (x => Token(Token.Kind.NAT, x))
      val float =
        ("-" ~ natdot ^^ { case x ~ y => x + y } | natdot) ^^ (x => Token(Token.Kind.FLOAT, x))

      val sym_ident =
        (many1(Symbol.is_symbolic_char) | one(sym => Symbol.is_symbolic(sym))) ^^
        (x => Token(Token.Kind.SYM_IDENT, x))

      val space = many1(Symbol.is_blank) ^^ (x => Token(Token.Kind.SPACE, x))

      // FIXME check
      val junk = many(sym => !(Symbol.is_blank(sym)))
      val junk1 = many1(sym => !(Symbol.is_blank(sym)))

      // a stray opening delimiter swallows everything up to the next blank
      val bad_delimiter =
        ("\"" | "`" | "{*" | "(*") ~ junk ^^ { case x ~ y => Token(Token.Kind.UNPARSED, x + y) }
      val bad = junk1 ^^ (x => Token(Token.Kind.UNPARSED, x))

      val command_keyword =
        keyword ^^ (x => Token(if (is_command(x)) Token.Kind.COMMAND else Token.Kind.KEYWORD, x))

      // note |||: longest match between plain identifiers and lexicon keywords
      space | (bad_delimiter |
        (((ident | (var_ | (type_ident | (type_var | (float | (nat_ | sym_ident)))))) |||
          command_keyword) | bad))
    }
   380 
    // outer syntax token: delimited (string/verbatim/comment) or other
    def token(is_command: String => Boolean): Parser[Token] =
      delimited_token | other_token(is_command)

    // as token, but resuming from / producing a scan context for partial input
    def token_context(is_command: String => Boolean, ctxt: Context): Parser[(Token, Context)] =
    {
      val string =
        quoted_context("\"", ctxt) ^^ { case (x, c) => (Token(Token.Kind.STRING, x), c) }
      val alt_string =
        quoted_context("`", ctxt) ^^ { case (x, c) => (Token(Token.Kind.ALT_STRING, x), c) }
      val verb = verbatim_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.VERBATIM, x), c) }
      val cmt = comment_context(ctxt) ^^ { case (x, c) => (Token(Token.Kind.COMMENT, x), c) }
      val other = other_token(is_command) ^^ { case x => (x, Finished) }  // plain tokens close the context

      string | (alt_string | (verb | (cmt | other)))
    }
   396   }
   397 
   398 
   399 
   400   /** read file without decoding -- enables efficient length operation **/
   401 
   402   private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int)
   403     extends CharSequence
   404   {
   405     def charAt(i: Int): Char =
   406       if (0 <= i && i < length) seq(start + i)
   407       else throw new IndexOutOfBoundsException
   408 
   409     def length: Int = end - start  // avoid potentially expensive seq.length
   410 
   411     def subSequence(i: Int, j: Int): CharSequence =
   412       if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j)
   413       else throw new IndexOutOfBoundsException
   414 
   415     override def toString: String =
   416     {
   417       val buf = new StringBuilder(length)
   418       for (offset <- start until end) buf.append(seq(offset))
   419       buf.toString
   420     }
   421   }
   422 
   423   abstract class Byte_Reader extends Reader[Char] { def close: Unit }
   424 
  // Raw byte reader for the given file: each byte becomes one Char without
  // text decoding, which enables an efficient length operation.
  def byte_reader(file: JFile): Byte_Reader =
  {
    val stream = new BufferedInputStream(new FileInputStream(file))
    // demand-driven paging: fill buf from the stream; -1 signals end of input
    val seq = new PagedSeq(
      (buf: Array[Char], offset: Int, length: Int) =>
        {
          var i = 0
          var c = 0
          var eof = false
          while (!eof && i < length) {
            c = stream.read
            if (c == -1) eof = true
            else { buf(offset + i) = c.toChar; i += 1 }
          }
          if (i > 0) i else -1
        })
    // NOTE(review): file.length.toInt truncates for files >= 2 GB -- confirm acceptable
    val restricted_seq = new Restricted_Seq(seq, 0, file.length.toInt)

    class Paged_Reader(override val offset: Int) extends Byte_Reader
    {
      override lazy val source: CharSequence = restricted_seq
      // '\032' (^Z) serves as pseudo character past end of input
      def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\032'
      def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this
      def pos: InputPosition = new OffsetPosition(source, offset)
      def atEnd: Boolean = !seq.isDefinedAt(offset)
      override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n)
      def close { stream.close }
    }
    new Paged_Reader(0)
  }
   455 }