src/Pure/General/scan.scala
author wenzelm
Fri Feb 14 16:25:30 2014 +0100 (2014-02-14)
changeset 55494 009b71c1ed23
parent 55492 28d4db6c6e79
child 55497 c0f8aebfb43d
permissions -rw-r--r--
tuned signature (in accordance to ML version);
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.{IndexedSeq, TraversableOnce}
    11 import scala.collection.immutable.PagedSeq
    12 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    13 import scala.util.parsing.combinator.RegexParsers
    14 
    15 import java.io.{File => JFile, BufferedInputStream, FileInputStream}
    16 
    17 
    18 object Scan
    19 {
    20   /** context of partial scans **/
    21 
    22   sealed abstract class Context
    23   case object Finished extends Context
    24   case class Quoted(quote: String) extends Context
    25   case object Verbatim extends Context
    26   case class Cartouche(depth: Int) extends Context
    27   case class Comment(depth: Int) extends Context
    28 
    29 
    30 
    31   /** parser combinators **/
    32 
    33   object Parsers extends Parsers
    34 
    35   trait Parsers extends RegexParsers
    36   {
    37     override val whiteSpace = "".r
    38 
    39 
    40     /* optional termination */
    41 
    42     def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
    43       p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
    44 
    45 
    46     /* symbol range */
    47 
    48     def symbol_range(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
    49       new Parser[String]
    50       {
    51         def apply(in: Input) =
    52         {
    53           val start = in.offset
    54           val end = in.source.length
    55           val matcher = new Symbol.Matcher(in.source)
    56 
    57           var i = start
    58           var count = 0
    59           var finished = false
    60           while (!finished && i < end && count < max_count) {
    61             val n = matcher(i, end)
    62             val sym = in.source.subSequence(i, i + n).toString
    63             if (pred(sym)) { i += n; count += 1 }
    64             else finished = true
    65           }
    66           if (count < min_count) Failure("bad input", in)
    67           else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
    68         }
    69       }.named("symbol_range")
    70 
    71     def one(pred: Symbol.Symbol => Boolean): Parser[String] =
    72       symbol_range(pred, 1, 1)
    73 
    74     def many(pred: Symbol.Symbol => Boolean): Parser[String] =
    75       symbol_range(pred, 0, Integer.MAX_VALUE)
    76 
    77     def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
    78       symbol_range(pred, 1, Integer.MAX_VALUE)
    79 
    80 
    81     /* quoted strings */
    82 
    83     private def quoted_body(quote: Symbol.Symbol): Parser[String] =
    84     {
    85       rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
    86         (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    87     }
    88 
    89     def quoted(quote: Symbol.Symbol): Parser[String] =
    90     {
    91       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
    92     }.named("quoted")
    93 
    94     def quoted_content(quote: Symbol.Symbol, source: String): String =
    95     {
    96       require(parseAll(quoted(quote), source).successful)
    97       val body = source.substring(1, source.length - 1)
    98       if (body.exists(_ == '\\')) {
    99         val content =
   100           rep(many1(sym => sym != quote && sym != "\\") |
   101               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
   102         parseAll(content ^^ (_.mkString), body).get
   103       }
   104       else body
   105     }
   106 
   107     def quoted_context(quote: Symbol.Symbol, ctxt: Context): Parser[(String, Context)] =
   108     {
   109       ctxt match {
   110         case Finished =>
   111           quote ~ quoted_body(quote) ~ opt_term(quote) ^^
   112             { case x ~ y ~ Some(z) => (x + y + z, Finished)
   113               case x ~ y ~ None => (x + y, Quoted(quote)) }
   114         case Quoted(q) if q == quote =>
   115           quoted_body(quote) ~ opt_term(quote) ^^
   116             { case x ~ Some(y) => (x + y, Finished)
   117               case x ~ None => (x, ctxt) }
   118         case _ => failure("")
   119       }
   120     }.named("quoted_context")
   121 
   122     def recover_quoted(quote: Symbol.Symbol): Parser[String] =
   123       quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
   124 
   125 
   126     /* verbatim text */
   127 
   128     private def verbatim_body: Parser[String] =
   129       rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)
   130 
   131     def verbatim: Parser[String] =
   132     {
   133       "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
   134     }.named("verbatim")
   135 
   136     def verbatim_content(source: String): String =
   137     {
   138       require(parseAll(verbatim, source).successful)
   139       source.substring(2, source.length - 2)
   140     }
   141 
   142     def verbatim_context(ctxt: Context): Parser[(String, Context)] =
   143     {
   144       ctxt match {
   145         case Finished =>
   146           "{*" ~ verbatim_body ~ opt_term("*}") ^^
   147             { case x ~ y ~ Some(z) => (x + y + z, Finished)
   148               case x ~ y ~ None => (x + y, Verbatim) }
   149         case Verbatim =>
   150           verbatim_body ~ opt_term("*}") ^^
   151             { case x ~ Some(y) => (x + y, Finished)
   152               case x ~ None => (x, Verbatim) }
   153         case _ => failure("")
   154       }
   155     }.named("verbatim_context")
   156 
   157     val recover_verbatim: Parser[String] =
   158       "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
   159 
   160 
   161     /* nested text cartouches */
   162 
   163     private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
   164     {
   165       require(depth >= 0)
   166 
   167       def apply(in: Input) =
   168       {
   169         val start = in.offset
   170         val end = in.source.length
   171         val matcher = new Symbol.Matcher(in.source)
   172 
   173         var i = start
   174         var d = depth
   175         var finished = false
   176         while (!finished && i < end) {
   177           val n = matcher(i, end)
   178           val sym = in.source.subSequence(i, i + n).toString
   179           if (Symbol.is_open(sym)) { i += n; d += 1 }
   180           else if (d > 0) { i += n; if (Symbol.is_close(sym)) d -= 1 }
   181           else finished = true
   182         }
   183         if (i == start) Failure("bad input", in)
   184         else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start))
   185       }
   186     }.named("cartouche_depth")
   187 
   188     def cartouche: Parser[String] =
   189       cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
   190 
   191     def cartouche_context(ctxt: Context): Parser[(String, Context)] =
   192     {
   193       val depth =
   194         ctxt match {
   195           case Finished => 0
   196           case Cartouche(d) => d
   197           case _ => -1
   198         }
   199       if (depth >= 0)
   200         cartouche_depth(depth) ^^
   201           { case (x, 0) => (x, Finished)
   202             case (x, d) => (x, Cartouche(d)) }
   203       else failure("")
   204     }
   205 
   206     val recover_cartouche: Parser[String] =
   207       cartouche_depth(0) ^^ (_._1)
   208 
   209     def cartouche_content(source: String): String =
   210     {
   211       def err(): Nothing = error("Malformed text cartouche: " + quote(source))
   212       val source1 =
   213         Library.try_unprefix(Symbol.open_decoded, source) orElse
   214           Library.try_unprefix(Symbol.open, source) getOrElse err()
   215       Library.try_unsuffix(Symbol.close_decoded, source1) orElse
   216         Library.try_unsuffix(Symbol.close, source1) getOrElse err()
   217     }
   218 
   219 
   220     /* nested comments */
   221 
   222     private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
   223     {
   224       require(depth >= 0)
   225 
   226       val comment_text =
   227         rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
   228 
   229       def apply(in: Input) =
   230       {
   231         var rest = in
   232         def try_parse[A](p: Parser[A]): Boolean =
   233         {
   234           parse(p ^^^ (), rest) match {
   235             case Success(_, next) => { rest = next; true }
   236             case _ => false
   237           }
   238         }
   239         var d = depth
   240         var finished = false
   241         while (!finished) {
   242           if (try_parse("(*")) d += 1
   243           else if (d > 0 && try_parse("*)")) d -= 1
   244           else if (d == 0 || !try_parse(comment_text)) finished = true
   245         }
   246         if (in.offset < rest.offset)
   247           Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
   248         else Failure("comment expected", in)
   249       }
   250     }.named("comment_depth")
   251 
   252     def comment: Parser[String] =
   253       comment_depth(0) ^? { case (x, d) if d == 0 => x }
   254 
   255     def comment_context(ctxt: Context): Parser[(String, Context)] =
   256     {
   257       val depth =
   258         ctxt match {
   259           case Finished => 0
   260           case Comment(d) => d
   261           case _ => -1
   262         }
   263       if (depth >= 0)
   264         comment_depth(depth) ^^
   265           { case (x, 0) => (x, Finished)
   266             case (x, d) => (x, Comment(d)) }
   267       else failure("")
   268     }
   269 
   270     val recover_comment: Parser[String] =
   271       comment_depth(0) ^^ (_._1)
   272 
   273     def comment_content(source: String): String =
   274     {
   275       require(parseAll(comment, source).successful)
   276       source.substring(2, source.length - 2)
   277     }
   278 
   279 
   280     /* keyword */
   281 
   282     def keyword(lexicon: Lexicon): Parser[String] = new Parser[String]
   283     {
   284       def apply(in: Input) =
   285       {
   286         val result = lexicon.scan(in)
   287         if (result.isEmpty) Failure("keyword expected", in)
   288         else Success(result, in.drop(result.length))
   289       }
   290     }.named("keyword")
   291   }
   292 
   293 
   294 
   295   /** Lexicon -- position tree **/
   296 
   297   object Lexicon
   298   {
   299     /* representation */
   300 
   301     private sealed case class Tree(val branches: Map[Char, (String, Tree)])
   302     private val empty_tree = Tree(Map())
   303 
   304     val empty: Lexicon = new Lexicon(empty_tree)
   305     def apply(elems: String*): Lexicon = empty ++ elems
   306   }
   307 
   308   final class Lexicon private(rep: Lexicon.Tree)
   309   {
   310     /* auxiliary operations */
   311 
   312     private def content(tree: Lexicon.Tree, result: List[String]): List[String] =
   313       (result /: tree.branches.toList) ((res, entry) =>
   314         entry match { case (_, (s, tr)) =>
   315           if (s.isEmpty) content(tr, res) else content(tr, s :: res) })
   316 
   317     private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] =
   318     {
   319       val len = str.length
   320       def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] =
   321       {
   322         if (i < len) {
   323           tree.branches.get(str.charAt(i)) match {
   324             case Some((s, tr)) => look(tr, !s.isEmpty, i + 1)
   325             case None => None
   326           }
   327         } else Some(tip, tree)
   328       }
   329       look(rep, false, 0)
   330     }
   331 
   332     def completions(str: CharSequence): List[String] =
   333       lookup(str) match {
   334         case Some((true, tree)) => content(tree, List(str.toString))
   335         case Some((false, tree)) => content(tree, Nil)
   336         case None => Nil
   337       }
   338 
   339 
   340     /* pseudo Set methods */
   341 
   342     def iterator: Iterator[String] = content(rep, Nil).sorted.iterator
   343 
   344     override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
   345 
   346     def empty: Lexicon = Lexicon.empty
   347     def isEmpty: Boolean = rep.branches.isEmpty
   348 
   349     def contains(elem: String): Boolean =
   350       lookup(elem) match {
   351         case Some((tip, _)) => tip
   352         case _ => false
   353       }
   354 
   355 
   356     /* add elements */
   357 
   358     def + (elem: String): Lexicon =
   359       if (contains(elem)) this
   360       else {
   361         val len = elem.length
   362         def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree =
   363           if (i < len) {
   364             val c = elem.charAt(i)
   365             val end = (i + 1 == len)
   366             tree.branches.get(c) match {
   367               case Some((s, tr)) =>
   368                 Lexicon.Tree(tree.branches +
   369                   (c -> (if (end) elem else s, extend(tr, i + 1))))
   370               case None =>
   371                 Lexicon.Tree(tree.branches +
   372                   (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))
   373             }
   374           }
   375           else tree
   376         new Lexicon(extend(rep, 0))
   377       }
   378 
   379     def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _)
   380 
   381 
   382     /* scan */
   383 
   384     def scan(in: Reader[Char]): String =
   385     {
   386       val source = in.source
   387       val offset = in.offset
   388       val len = source.length - offset
   389 
   390       def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String =
   391       {
   392         if (i < len) {
   393           tree.branches.get(source.charAt(offset + i)) match {
   394             case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1)
   395             case None => result
   396           }
   397         }
   398         else result
   399       }
   400       scan_tree(rep, "", 0)
   401     }
   402   }
   403 }