src/Pure/General/scan.scala
author wenzelm
Mon Dec 01 15:21:49 2014 +0100 (2014-12-01)
changeset 59073 dcecfcc56dce
parent 58505 d1fe47fe5401
child 59319 677615cba30d
permissions -rw-r--r--
more merge operations;
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.annotation.tailrec
    11 import scala.collection.{IndexedSeq, TraversableOnce}
    12 import scala.collection.immutable.PagedSeq
    13 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    14 import scala.util.parsing.combinator.RegexParsers
    15 
    16 import java.io.{File => JFile, BufferedInputStream, FileInputStream, InputStream}
    17 import java.net.URL
    18 
    19 
    20 object Scan
    21 {
    22   /** context of partial line-oriented scans **/
    23 
  /* Scanner state at the end of a partially scanned line: either every
     delimited construct is closed (Finished), or the line ends inside a
     quoted string, verbatim text, cartouche, or comment. */
  abstract class Line_Context
  case object Finished extends Line_Context
  case class Quoted(quote: String) extends Line_Context  // inside a quoted string, with its opening delimiter
  case object Verbatim extends Line_Context  // inside {* ... *}
  case class Cartouche(depth: Int) extends Line_Context  // inside nested cartouches at the given depth
  case class Comment(depth: Int) extends Line_Context  // inside nested (* ... *) comments at the given depth
    30 
    31 
    32 
    33   /** parser combinators **/
    34 
    35   object Parsers extends Parsers
    36 
    37   trait Parsers extends RegexParsers
    38   {
    // disable the implicit whitespace skipping of RegexParsers: tokens handle their own layout
    override val whiteSpace = "".r
    40 
    41 
    42     /* optional termination */
    43 
    /* Optional termination: the terminator itself, or None at end of input. */
    def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
      p ^^ (Some(_)) | """\z""".r ^^^ None
    46 
    47 
    48     /* repeated symbols */
    49 
    /* Scan between min_count and max_count consecutive symbols satisfying
       pred, returning the matched text.  Symbols are delimited via
       Symbol.Matcher, so one symbol may span several characters. */
    def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
      new Parser[String]
      {
        def apply(in: Input) =
        {
          val start = in.offset
          val end = in.source.length
          val matcher = new Symbol.Matcher(in.source)

          var i = start
          var count = 0
          var finished = false
          while (!finished && i < end && count < max_count) {
            val n = matcher(i, end)  // extent of the symbol starting at i
            val sym = in.source.subSequence(i, i + n).toString
            if (pred(sym)) { i += n; count += 1 }
            else finished = true
          }
          if (count < min_count) Failure("bad input", in)
          else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
        }
      }.named("repeated")
    72 
    // exactly one symbol satisfying pred
    def one(pred: Symbol.Symbol => Boolean): Parser[String] =
      repeated(pred, 1, 1)

    // zero or more symbols satisfying pred (never fails)
    def many(pred: Symbol.Symbol => Boolean): Parser[String] =
      repeated(pred, 0, Integer.MAX_VALUE)

    // one or more symbols satisfying pred
    def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
      repeated(pred, 1, Integer.MAX_VALUE)
    81 
    82 
    83     /* character */
    84 
    /* Lift a character predicate to a symbol predicate: holds exactly for
       single-character symbols whose character satisfies pred.
       (Fixed garbled type path "Symbol. Symbol" -> "Symbol.Symbol".) */
    def character(pred: Char => Boolean): Symbol.Symbol => Boolean =
      (s: Symbol.Symbol) => s.length == 1 && pred(s.charAt(0))
    87 
    88 
    89     /* quoted strings */
    90 
    /* Body of a quoted string: ordinary symbols (neither the quote nor
       backslash), escaped quotes, escaped backslashes, or 3-digit decimal
       character escapes with value <= 255. */
    private def quoted_body(quote: Symbol.Symbol): Parser[String] =
    {
      rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
        (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    }
    96 
    /* Complete quoted string, including both delimiters. */
    def quoted(quote: Symbol.Symbol): Parser[String] =
    {
      quote ~ quoted_body(quote) ~ quote ^^
        { case open ~ body ~ close => open + body + close }
    }.named("quoted")
   101 
    /* Content of a well-formed quoted string: strip the delimiters and undo
       backslash escapes.  The source must be accepted by quoted(quote).
       NOTE(review): substring(1, length - 1) assumes a single-character
       delimiter — confirm for multi-character (encoded) quote symbols. */
    def quoted_content(quote: Symbol.Symbol, source: String): String =
    {
      require(parseAll(quoted(quote), source).successful)
      val body = source.substring(1, source.length - 1)
      if (body.exists(_ == '\\')) {
        val content =
          rep(many1(sym => sym != quote && sym != "\\") |
              "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
        parseAll(content ^^ (_.mkString), body).get
      }
      else body  // no escapes: body is already the content
    }
   114 
    /* Scan a quoted string across line boundaries: either start a new string
       (ctxt = Finished) or continue one left open on an earlier line
       (ctxt = Quoted(quote)).  The result pairs text with the new context. */
    def quoted_line(quote: Symbol.Symbol, ctxt: Line_Context): Parser[(String, Line_Context)] =
    {
      ctxt match {
        case Finished =>
          quote ~ quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)  // closed on this line
              case x ~ y ~ None => (x + y, Quoted(quote)) }  // still open at end of input
        case Quoted(q) if q == quote =>
          quoted_body(quote) ~ opt_term(quote) ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, ctxt) }
        case _ => failure("")  // not applicable in this context
      }
    }.named("quoted_line")
   129 
   130     def recover_quoted(quote: Symbol.Symbol): Parser[String] =
   131       quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
   132 
   133 
   134     /* verbatim text */
   135 
    // body of verbatim text: any symbols, where "*" is allowed unless followed by "}"
    private def verbatim_body: Parser[String] =
      rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)
   138 
   139     def verbatim: Parser[String] =
   140     {
   141       "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
   142     }.named("verbatim")
   143 
   144     def verbatim_content(source: String): String =
   145     {
   146       require(parseAll(verbatim, source).successful)
   147       source.substring(2, source.length - 2)
   148     }
   149 
    /* Scan verbatim text across line boundaries: either open "{*" afresh
       (ctxt = Finished) or continue from an earlier line (ctxt = Verbatim). */
    def verbatim_line(ctxt: Line_Context): Parser[(String, Line_Context)] =
    {
      ctxt match {
        case Finished =>
          "{*" ~ verbatim_body ~ opt_term("*}") ^^
            { case x ~ y ~ Some(z) => (x + y + z, Finished)  // closed on this line
              case x ~ y ~ None => (x + y, Verbatim) }  // still open at end of input
        case Verbatim =>
          verbatim_body ~ opt_term("*}") ^^
            { case x ~ Some(y) => (x + y, Finished)
              case x ~ None => (x, Verbatim) }
        case _ => failure("")  // not applicable in this context
      }
    }.named("verbatim_line")
   164 
   165     val recover_verbatim: Parser[String] =
   166       "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
   167 
   168 
   169     /* nested text cartouches */
   170 
    /* Scan text cartouches starting at nesting depth `depth`.  The result
       pairs the matched text with the remaining depth: 0 means the outermost
       cartouche was closed on this input.  With depth = 0 the scan must begin
       with an opening symbol; otherwise it continues an open cartouche. */
    private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      def apply(in: Input) =
      {
        val start = in.offset
        val end = in.source.length
        val matcher = new Symbol.Matcher(in.source)

        var i = start
        var d = depth
        var finished = false
        while (!finished && i < end) {
          val n = matcher(i, end)
          val sym = in.source.subSequence(i, i + n).toString
          if (Symbol.is_open(sym)) { i += n; d += 1 }
          else if (Symbol.is_close(sym) && d > 0) { i += n; d -= 1; if (d == 0) finished = true }
          else if (d > 0) i += n  // ordinary symbol inside a cartouche
          else finished = true  // depth 0 and no opening symbol: nothing to scan
        }
        if (i == start) Failure("bad input", in)
        else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start))
      }
    }.named("cartouche_depth")
   196 
   197     def cartouche: Parser[String] =
   198       cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
   199 
   200     def cartouche_line(ctxt: Line_Context): Parser[(String, Line_Context)] =
   201     {
   202       val depth =
   203         ctxt match {
   204           case Finished => 0
   205           case Cartouche(d) => d
   206           case _ => -1
   207         }
   208       if (depth >= 0)
   209         cartouche_depth(depth) ^^
   210           { case (x, 0) => (x, Finished)
   211             case (x, d) => (x, Cartouche(d)) }
   212       else failure("")
   213     }
   214 
   215     val recover_cartouche: Parser[String] =
   216       cartouche_depth(0) ^^ (_._1)
   217 
    /* Content of a text cartouche: strip one opening and one closing
       delimiter, accepting either the decoded Isabelle symbol or its encoded
       form; error on malformed input. */
    def cartouche_content(source: String): String =
    {
      def err(): Nothing = error("Malformed text cartouche: " + quote(source))
      val source1 =
        Library.try_unprefix(Symbol.open_decoded, source) orElse
          Library.try_unprefix(Symbol.open, source) getOrElse err()
      Library.try_unsuffix(Symbol.close_decoded, source1) orElse
        Library.try_unsuffix(Symbol.close, source1) getOrElse err()
    }
   227 
   228 
   229     /* nested comments */
   230 
    /* Scan ML-style comments "(* ... *)" starting at nesting depth `depth`.
       The result pairs the matched text with the remaining depth: 0 means
       the outermost comment was closed on this input. */
    private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
    {
      require(depth >= 0)

      // text containing neither "(*" nor "*)"
      val comment_text =
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)

      def apply(in: Input) =
      {
        var rest = in
        // apply p at the current position; on success advance rest and report true
        def try_parse[A](p: Parser[A]): Boolean =
        {
          parse(p ^^^ (()), rest) match {
            case Success(_, next) => { rest = next; true }
            case _ => false
          }
        }
        var d = depth
        var finished = false
        while (!finished) {
          if (try_parse("(*")) d += 1
          else if (d > 0 && try_parse("*)")) { d -= 1; if (d == 0) finished = true }
          else if (d == 0 || !try_parse(comment_text)) finished = true
        }
        if (in.offset < rest.offset)
          Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
        else Failure("comment expected", in)
      }
    }.named("comment_depth")
   260 
   261     def comment: Parser[String] =
   262       comment_depth(0) ^? { case (x, d) if d == 0 => x }
   263 
   264     def comment_line(ctxt: Line_Context): Parser[(String, Line_Context)] =
   265     {
   266       val depth =
   267         ctxt match {
   268           case Finished => 0
   269           case Comment(d) => d
   270           case _ => -1
   271         }
   272       if (depth >= 0)
   273         comment_depth(depth) ^^
   274           { case (x, 0) => (x, Finished)
   275             case (x, d) => (x, Comment(d)) }
   276       else failure("")
   277     }
   278 
   279     val recover_comment: Parser[String] =
   280       comment_depth(0) ^^ (_._1)
   281 
   282     def comment_content(source: String): String =
   283     {
   284       require(parseAll(comment, source).successful)
   285       source.substring(2, source.length - 2)
   286     }
   287 
   288 
   289     /* keyword */
   290 
   291     def literal(lexicon: Lexicon): Parser[String] = new Parser[String]
   292     {
   293       def apply(in: Input) =
   294       {
   295         val result = lexicon.scan(in)
   296         if (result.isEmpty) Failure("keyword expected", in)
   297         else Success(result, in.drop(result.length))
   298       }
   299     }.named("keyword")
   300   }
   301 
   302 
   303 
   304   /** Lexicon -- position tree **/
   305 
  object Lexicon
  {
    /* representation: a trie, mapping each next character to the complete
       keyword ending there ("" if none) together with the subtree */

    private sealed case class Tree(val branches: Map[Char, (String, Tree)])
    private val empty_tree = Tree(Map())

    val empty: Lexicon = new Lexicon(empty_tree)
    def apply(elems: String*): Lexicon = empty ++ elems
  }
   316 
   317   final class Lexicon private(rep: Lexicon.Tree)
   318   {
   319     /* auxiliary operations */
   320 
   321     private def dest(tree: Lexicon.Tree, result: List[String]): List[String] =
   322       (result /: tree.branches.toList) ((res, entry) =>
   323         entry match { case (_, (s, tr)) =>
   324           if (s.isEmpty) dest(tr, res) else dest(tr, s :: res) })
   325 
   326     private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] =
   327     {
   328       val len = str.length
   329       @tailrec def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] =
   330       {
   331         if (i < len) {
   332           tree.branches.get(str.charAt(i)) match {
   333             case Some((s, tr)) => look(tr, !s.isEmpty, i + 1)
   334             case None => None
   335           }
   336         }
   337         else Some(tip, tree)
   338       }
   339       look(rep, false, 0)
   340     }
   341 
   342     def completions(str: CharSequence): List[String] =
   343       lookup(str) match {
   344         case Some((true, tree)) => dest(tree, List(str.toString))
   345         case Some((false, tree)) => dest(tree, Nil)
   346         case None => Nil
   347       }
   348 
   349 
   350     /* pseudo Set methods */
   351 
    // all entries in unspecified (trie traversal) order
    def raw_iterator: Iterator[String] = dest(rep, Nil).iterator
    // all entries in sorted order
    def iterator: Iterator[String] = dest(rep, Nil).sorted.iterator

    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")

    def is_empty: Boolean = rep.branches.isEmpty
   358 
   359     def contains(elem: String): Boolean =
   360       lookup(elem) match {
   361         case Some((tip, _)) => tip
   362         case _ => false
   363       }
   364 
   365 
   366     /* build lexicon */
   367 
    /* Add one keyword (no-op if already present), rebuilding the trie nodes
       along the path of elem. */
    def + (elem: String): Lexicon =
      if (contains(elem)) this
      else {
        val len = elem.length
        def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree =
          if (i < len) {
            val c = elem.charAt(i)
            val end = (i + 1 == len)
            tree.branches.get(c) match {
              case Some((s, tr)) =>  // existing branch: mark elem complete at its last character
                Lexicon.Tree(tree.branches +
                  (c -> (if (end) elem else s, extend(tr, i + 1))))
              case None =>  // fresh branch
                Lexicon.Tree(tree.branches +
                  (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))
            }
          }
          else tree
        new Lexicon(extend(rep, 0))
      }
   388 
   389     def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _)
   390 
    /* Merge two lexica, reusing `this` or `other` unchanged when possible. */
    def ++ (other: Lexicon): Lexicon =
      if (this eq other) this
      else if (is_empty) other
      else this ++ other.raw_iterator
   395 
   396 
   397     /* scan */
   398 
    /* Scan the longest keyword at the current reader position; "" if none.
       Recursion depth is bounded by the length of the longest keyword. */
    def scan(in: Reader[Char]): String =
    {
      val source = in.source
      val offset = in.offset
      val len = source.length - offset

      // result holds the longest complete keyword seen so far
      def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String =
      {
        if (i < len) {
          tree.branches.get(source.charAt(offset + i)) match {
            case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1)
            case None => result
          }
        }
        else result
      }
      scan_tree(rep, "", 0)
    }
   417   }
   418 
   419 
   420 
   421   /** read stream without decoding: efficient length operation **/
   422 
  /* View of seq restricted to [start, end) as a CharSequence, so that length
     is a cheap subtraction rather than a potentially expensive seq.length
     (relevant for lazily filled sequences such as PagedSeq). */
  private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int)
    extends CharSequence
  {
    def charAt(i: Int): Char =
      if (0 <= i && i < length) seq(start + i)
      else throw new IndexOutOfBoundsException

    def length: Int = end - start  // avoid expensive seq.length

    def subSequence(i: Int, j: Int): CharSequence =
      if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j)
      else throw new IndexOutOfBoundsException

    override def toString: String =
    {
      val buf = new StringBuilder(length)
      for (offset <- start until end) buf.append(seq(offset))
      buf.toString
    }
  }
   443 
  // a character reader that additionally owns a closeable underlying stream
  abstract class Byte_Reader extends Reader[Char] { def close: Unit }
   445 
  /* Read the stream lazily page by page without charset decoding: each byte
     becomes one Char.  stream_length bounds the visible portion, making the
     reader's length operation cheap via Restricted_Seq. */
  private def make_byte_reader(stream: InputStream, stream_length: Int): Byte_Reader =
  {
    val buffered_stream = new BufferedInputStream(stream)
    val seq = new PagedSeq(
      (buf: Array[Char], offset: Int, length: Int) =>
        {
          // fill buf with up to `length` bytes; result -1 signals end of input
          var i = 0
          var c = 0
          var eof = false
          while (!eof && i < length) {
            c = buffered_stream.read
            if (c == -1) eof = true
            else { buf(offset + i) = c.toChar; i += 1 }
          }
          if (i > 0) i else -1
        })
    val restricted_seq = new Restricted_Seq(seq, 0, stream_length)

    class Paged_Reader(override val offset: Int) extends Byte_Reader
    {
      override lazy val source: CharSequence = restricted_seq
      def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\u001a'  // SUB at end of input
      def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this
      def pos: InputPosition = new OffsetPosition(source, offset)
      def atEnd: Boolean = !seq.isDefinedAt(offset)
      override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n)
      def close { buffered_stream.close }
    }
    new Paged_Reader(0)
  }
   476 
  /* Byte reader over a file; the file length determines the reader length. */
  def byte_reader(file: JFile): Byte_Reader =
    make_byte_reader(new FileInputStream(file), file.length.toInt)
   479 
   480   def byte_reader(url: URL): Byte_Reader =
   481   {
   482     val connection = url.openConnection
   483     val stream = connection.getInputStream
   484     val stream_length = connection.getContentLength
   485     make_byte_reader(stream, stream_length)
   486   }
   487 }