src/Pure/General/scan.scala
author wenzelm
Fri Feb 14 21:06:20 2014 +0100 (2014-02-14)
changeset 55497 c0f8aebfb43d
parent 55494 009b71c1ed23
child 55499 2581fbee5b95
permissions -rw-r--r--
lexical syntax for SML (in Scala);
tuned;
     1 /*  Title:      Pure/General/scan.scala
     2     Author:     Makarius
     3 
     4 Efficient scanning of keywords and tokens.
     5 */
     6 
     7 package isabelle
     8 
     9 
    10 import scala.collection.{IndexedSeq, TraversableOnce}
    11 import scala.collection.immutable.PagedSeq
    12 import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
    13 import scala.util.parsing.combinator.RegexParsers
    14 
    15 import java.io.{File => JFile, BufferedInputStream, FileInputStream}
    16 
    17 
    18 object Scan
    19 {
    20   /** context of partial scans **/
    21 
    22   sealed abstract class Context
    23   case object Finished extends Context
    24   case class Quoted(quote: String) extends Context
    25   case object Verbatim extends Context
    26   case class Cartouche(depth: Int) extends Context
    27   case class Comment(depth: Int) extends Context
    28 
    29 
    30 
    31   /** parser combinators **/
    32 
    33   object Parsers extends Parsers
    34 
    35   trait Parsers extends RegexParsers
    36   {
    37     override val whiteSpace = "".r
    38 
    39 
    40     /* optional termination */
    41 
    42     def opt_term[T](p: => Parser[T]): Parser[Option[T]] =
    43       p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)
    44 
    45 
    46     /* repeated symbols */
    47 
    48     def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =
    49       new Parser[String]
    50       {
    51         def apply(in: Input) =
    52         {
    53           val start = in.offset
    54           val end = in.source.length
    55           val matcher = new Symbol.Matcher(in.source)
    56 
    57           var i = start
    58           var count = 0
    59           var finished = false
    60           while (!finished && i < end && count < max_count) {
    61             val n = matcher(i, end)
    62             val sym = in.source.subSequence(i, i + n).toString
    63             if (pred(sym)) { i += n; count += 1 }
    64             else finished = true
    65           }
    66           if (count < min_count) Failure("bad input", in)
    67           else Success(in.source.subSequence(start, i).toString, in.drop(i - start))
    68         }
    69       }.named("repeated")
    70 
    71     def one(pred: Symbol.Symbol => Boolean): Parser[String] =
    72       repeated(pred, 1, 1)
    73 
    74     def many(pred: Symbol.Symbol => Boolean): Parser[String] =
    75       repeated(pred, 0, Integer.MAX_VALUE)
    76 
    77     def many1(pred: Symbol.Symbol => Boolean): Parser[String] =
    78       repeated(pred, 1, Integer.MAX_VALUE)
    79 
    80 
    81     /* character */
    82 
    83     def character(pred: Char => Boolean): Symbol.Symbol => Boolean =
    84       (s: Symbol. Symbol) => s.length == 1 && pred(s.charAt(0))
    85 
    86 
    87     /* quoted strings */
    88 
    89     private def quoted_body(quote: Symbol.Symbol): Parser[String] =
    90     {
    91       rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |
    92         (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
    93     }
    94 
    95     def quoted(quote: Symbol.Symbol): Parser[String] =
    96     {
    97       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
    98     }.named("quoted")
    99 
   100     def quoted_content(quote: Symbol.Symbol, source: String): String =
   101     {
   102       require(parseAll(quoted(quote), source).successful)
   103       val body = source.substring(1, source.length - 1)
   104       if (body.exists(_ == '\\')) {
   105         val content =
   106           rep(many1(sym => sym != quote && sym != "\\") |
   107               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
   108         parseAll(content ^^ (_.mkString), body).get
   109       }
   110       else body
   111     }
   112 
   113     def quoted_context(quote: Symbol.Symbol, ctxt: Context): Parser[(String, Context)] =
   114     {
   115       ctxt match {
   116         case Finished =>
   117           quote ~ quoted_body(quote) ~ opt_term(quote) ^^
   118             { case x ~ y ~ Some(z) => (x + y + z, Finished)
   119               case x ~ y ~ None => (x + y, Quoted(quote)) }
   120         case Quoted(q) if q == quote =>
   121           quoted_body(quote) ~ opt_term(quote) ^^
   122             { case x ~ Some(y) => (x + y, Finished)
   123               case x ~ None => (x, ctxt) }
   124         case _ => failure("")
   125       }
   126     }.named("quoted_context")
   127 
   128     def recover_quoted(quote: Symbol.Symbol): Parser[String] =
   129       quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
   130 
   131 
   132     /* verbatim text */
   133 
   134     private def verbatim_body: Parser[String] =
   135       rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)
   136 
   137     def verbatim: Parser[String] =
   138     {
   139       "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
   140     }.named("verbatim")
   141 
   142     def verbatim_content(source: String): String =
   143     {
   144       require(parseAll(verbatim, source).successful)
   145       source.substring(2, source.length - 2)
   146     }
   147 
   148     def verbatim_context(ctxt: Context): Parser[(String, Context)] =
   149     {
   150       ctxt match {
   151         case Finished =>
   152           "{*" ~ verbatim_body ~ opt_term("*}") ^^
   153             { case x ~ y ~ Some(z) => (x + y + z, Finished)
   154               case x ~ y ~ None => (x + y, Verbatim) }
   155         case Verbatim =>
   156           verbatim_body ~ opt_term("*}") ^^
   157             { case x ~ Some(y) => (x + y, Finished)
   158               case x ~ None => (x, Verbatim) }
   159         case _ => failure("")
   160       }
   161     }.named("verbatim_context")
   162 
   163     val recover_verbatim: Parser[String] =
   164       "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
   165 
   166 
   167     /* nested text cartouches */
   168 
   169     private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
   170     {
   171       require(depth >= 0)
   172 
   173       def apply(in: Input) =
   174       {
   175         val start = in.offset
   176         val end = in.source.length
   177         val matcher = new Symbol.Matcher(in.source)
   178 
   179         var i = start
   180         var d = depth
   181         var finished = false
   182         while (!finished && i < end) {
   183           val n = matcher(i, end)
   184           val sym = in.source.subSequence(i, i + n).toString
   185           if (Symbol.is_open(sym)) { i += n; d += 1 }
   186           else if (d > 0) { i += n; if (Symbol.is_close(sym)) d -= 1 }
   187           else finished = true
   188         }
   189         if (i == start) Failure("bad input", in)
   190         else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start))
   191       }
   192     }.named("cartouche_depth")
   193 
   194     def cartouche: Parser[String] =
   195       cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
   196 
   197     def cartouche_context(ctxt: Context): Parser[(String, Context)] =
   198     {
   199       val depth =
   200         ctxt match {
   201           case Finished => 0
   202           case Cartouche(d) => d
   203           case _ => -1
   204         }
   205       if (depth >= 0)
   206         cartouche_depth(depth) ^^
   207           { case (x, 0) => (x, Finished)
   208             case (x, d) => (x, Cartouche(d)) }
   209       else failure("")
   210     }
   211 
   212     val recover_cartouche: Parser[String] =
   213       cartouche_depth(0) ^^ (_._1)
   214 
   215     def cartouche_content(source: String): String =
   216     {
   217       def err(): Nothing = error("Malformed text cartouche: " + quote(source))
   218       val source1 =
   219         Library.try_unprefix(Symbol.open_decoded, source) orElse
   220           Library.try_unprefix(Symbol.open, source) getOrElse err()
   221       Library.try_unsuffix(Symbol.close_decoded, source1) orElse
   222         Library.try_unsuffix(Symbol.close, source1) getOrElse err()
   223     }
   224 
   225 
   226     /* nested comments */
   227 
   228     private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]
   229     {
   230       require(depth >= 0)
   231 
   232       val comment_text =
   233         rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
   234 
   235       def apply(in: Input) =
   236       {
   237         var rest = in
   238         def try_parse[A](p: Parser[A]): Boolean =
   239         {
   240           parse(p ^^^ (), rest) match {
   241             case Success(_, next) => { rest = next; true }
   242             case _ => false
   243           }
   244         }
   245         var d = depth
   246         var finished = false
   247         while (!finished) {
   248           if (try_parse("(*")) d += 1
   249           else if (d > 0 && try_parse("*)")) d -= 1
   250           else if (d == 0 || !try_parse(comment_text)) finished = true
   251         }
   252         if (in.offset < rest.offset)
   253           Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)
   254         else Failure("comment expected", in)
   255       }
   256     }.named("comment_depth")
   257 
   258     def comment: Parser[String] =
   259       comment_depth(0) ^? { case (x, d) if d == 0 => x }
   260 
   261     def comment_context(ctxt: Context): Parser[(String, Context)] =
   262     {
   263       val depth =
   264         ctxt match {
   265           case Finished => 0
   266           case Comment(d) => d
   267           case _ => -1
   268         }
   269       if (depth >= 0)
   270         comment_depth(depth) ^^
   271           { case (x, 0) => (x, Finished)
   272             case (x, d) => (x, Comment(d)) }
   273       else failure("")
   274     }
   275 
   276     val recover_comment: Parser[String] =
   277       comment_depth(0) ^^ (_._1)
   278 
   279     def comment_content(source: String): String =
   280     {
   281       require(parseAll(comment, source).successful)
   282       source.substring(2, source.length - 2)
   283     }
   284 
   285 
   286     /* keyword */
   287 
   288     def literal(lexicon: Lexicon): Parser[String] = new Parser[String]
   289     {
   290       def apply(in: Input) =
   291       {
   292         val result = lexicon.scan(in)
   293         if (result.isEmpty) Failure("keyword expected", in)
   294         else Success(result, in.drop(result.length))
   295       }
   296     }.named("keyword")
   297   }
   298 
   299 
   300 
   301   /** Lexicon -- position tree **/
   302 
   303   object Lexicon
   304   {
   305     /* representation */
   306 
   307     private sealed case class Tree(val branches: Map[Char, (String, Tree)])
   308     private val empty_tree = Tree(Map())
   309 
   310     val empty: Lexicon = new Lexicon(empty_tree)
   311     def apply(elems: String*): Lexicon = empty ++ elems
   312   }
   313 
   314   final class Lexicon private(rep: Lexicon.Tree)
   315   {
   316     /* auxiliary operations */
   317 
   318     private def content(tree: Lexicon.Tree, result: List[String]): List[String] =
   319       (result /: tree.branches.toList) ((res, entry) =>
   320         entry match { case (_, (s, tr)) =>
   321           if (s.isEmpty) content(tr, res) else content(tr, s :: res) })
   322 
   323     private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] =
   324     {
   325       val len = str.length
   326       def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] =
   327       {
   328         if (i < len) {
   329           tree.branches.get(str.charAt(i)) match {
   330             case Some((s, tr)) => look(tr, !s.isEmpty, i + 1)
   331             case None => None
   332           }
   333         } else Some(tip, tree)
   334       }
   335       look(rep, false, 0)
   336     }
   337 
   338     def completions(str: CharSequence): List[String] =
   339       lookup(str) match {
   340         case Some((true, tree)) => content(tree, List(str.toString))
   341         case Some((false, tree)) => content(tree, Nil)
   342         case None => Nil
   343       }
   344 
   345 
   346     /* pseudo Set methods */
   347 
   348     def iterator: Iterator[String] = content(rep, Nil).sorted.iterator
   349 
   350     override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
   351 
   352     def empty: Lexicon = Lexicon.empty
   353     def isEmpty: Boolean = rep.branches.isEmpty
   354 
   355     def contains(elem: String): Boolean =
   356       lookup(elem) match {
   357         case Some((tip, _)) => tip
   358         case _ => false
   359       }
   360 
   361 
   362     /* add elements */
   363 
   364     def + (elem: String): Lexicon =
   365       if (contains(elem)) this
   366       else {
   367         val len = elem.length
   368         def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree =
   369           if (i < len) {
   370             val c = elem.charAt(i)
   371             val end = (i + 1 == len)
   372             tree.branches.get(c) match {
   373               case Some((s, tr)) =>
   374                 Lexicon.Tree(tree.branches +
   375                   (c -> (if (end) elem else s, extend(tr, i + 1))))
   376               case None =>
   377                 Lexicon.Tree(tree.branches +
   378                   (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))
   379             }
   380           }
   381           else tree
   382         new Lexicon(extend(rep, 0))
   383       }
   384 
   385     def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _)
   386 
   387 
   388     /* scan */
   389 
   390     def scan(in: Reader[Char]): String =
   391     {
   392       val source = in.source
   393       val offset = in.offset
   394       val len = source.length - offset
   395 
   396       def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String =
   397       {
   398         if (i < len) {
   399           tree.branches.get(source.charAt(offset + i)) match {
   400             case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1)
   401             case None => result
   402           }
   403         }
   404         else result
   405       }
   406       scan_tree(rep, "", 0)
   407     }
   408   }
   409 }