| author | wenzelm | 
| Sat, 01 Mar 2014 18:33:49 +0100 | |
| changeset 55824 | 22bc50a19afa | 
| parent 55510 | 1585a65aad64 | 
| child 55980 | 36fd4981c119 | 
| permissions | -rw-r--r-- | 
| 31648 | 1 | /* Title: Pure/General/scan.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 40523 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 wenzelm parents: 
40290diff
changeset | 4 | Efficient scanning of keywords and tokens. | 
| 31648 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 9 | |
| 46627 
fbe2cb05bdb3
avoid trait Addable, which is deprecated in scala-2.9.x;
 wenzelm parents: 
45900diff
changeset | 10 | import scala.collection.{IndexedSeq, TraversableOnce}
 | 
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 11 | import scala.collection.immutable.PagedSeq | 
| 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 12 | import scala.util.parsing.input.{OffsetPosition, Position => InputPosition, Reader}
 | 
| 31648 | 13 | import scala.util.parsing.combinator.RegexParsers | 
| 14 | ||
| 48409 | 15 | import java.io.{File => JFile, BufferedInputStream, FileInputStream}
 | 
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 16 | |
| 31648 | 17 | |
| 18 | object Scan | |
| 19 | {
 | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 20 | /** context of partial line-oriented scans **/ | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 21 | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 22 | abstract class Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 23 | case object Finished extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 24 | case class Quoted(quote: String) extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 25 | case object Verbatim extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 26 | case class Cartouche(depth: Int) extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 27 | case class Comment(depth: Int) extends Line_Context | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 28 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 29 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 30 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 31 | /** parser combinators **/ | 
| 31650 | 32 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 33 | object Parsers extends Parsers | 
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 34 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 35 | trait Parsers extends RegexParsers | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 36 |   {
 | 
| 31649 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 37 | override val whiteSpace = "".r | 
| 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 38 | |
| 34137 | 39 | |
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 40 | /* optional termination */ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 41 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 42 | def opt_term[T](p: => Parser[T]): Parser[Option[T]] = | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 43 | p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None) | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 44 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 45 | |
| 55497 | 46 | /* repeated symbols */ | 
| 34137 | 47 | |
| 55497 | 48 | def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] = | 
| 34140 | 49 | new Parser[String] | 
| 34135 | 50 |       {
 | 
| 34137 | 51 | def apply(in: Input) = | 
| 52 |         {
 | |
| 53 | val start = in.offset | |
| 54 | val end = in.source.length | |
| 55 | val matcher = new Symbol.Matcher(in.source) | |
| 56 | ||
| 57 | var i = start | |
| 58 | var count = 0 | |
| 59 | var finished = false | |
| 55034 | 60 |           while (!finished && i < end && count < max_count) {
 | 
| 61 | val n = matcher(i, end) | |
| 62 | val sym = in.source.subSequence(i, i + n).toString | |
| 63 |             if (pred(sym)) { i += n; count += 1 }
 | |
| 34137 | 64 | else finished = true | 
| 65 | } | |
| 34140 | 66 |           if (count < min_count) Failure("bad input", in)
 | 
| 67 | else Success(in.source.subSequence(start, i).toString, in.drop(i - start)) | |
| 34137 | 68 | } | 
| 55497 | 69 |       }.named("repeated")
 | 
| 34137 | 70 | |
| 43696 | 71 | def one(pred: Symbol.Symbol => Boolean): Parser[String] = | 
| 55497 | 72 | repeated(pred, 1, 1) | 
| 43696 | 73 | |
| 74 | def many(pred: Symbol.Symbol => Boolean): Parser[String] = | |
| 55497 | 75 | repeated(pred, 0, Integer.MAX_VALUE) | 
| 43696 | 76 | |
| 77 | def many1(pred: Symbol.Symbol => Boolean): Parser[String] = | |
| 55497 | 78 | repeated(pred, 1, Integer.MAX_VALUE) | 
| 79 | ||
| 80 | ||
| 81 | /* character */ | |
| 82 | ||
| 83 | def character(pred: Char => Boolean): Symbol.Symbol => Boolean = | |
| 84 | (s: Symbol. Symbol) => s.length == 1 && pred(s.charAt(0)) | |
| 34140 | 85 | |
| 86 | ||
| 34143 | 87 | /* quoted strings */ | 
| 34140 | 88 | |
| 43696 | 89 | private def quoted_body(quote: Symbol.Symbol): Parser[String] = | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 90 |     {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 91 | rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" | | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 92 |         (("""\\\d\d\d""".r) ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 93 | } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 94 | |
| 55494 | 95 | def quoted(quote: Symbol.Symbol): Parser[String] = | 
| 34143 | 96 |     {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 97 |       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 98 |     }.named("quoted")
 | 
| 34140 | 99 | |
| 43696 | 100 | def quoted_content(quote: Symbol.Symbol, source: String): String = | 
| 34140 | 101 |     {
 | 
| 102 | require(parseAll(quoted(quote), source).successful) | |
| 34189 | 103 | val body = source.substring(1, source.length - 1) | 
| 104 |       if (body.exists(_ == '\\')) {
 | |
| 105 | val content = | |
| 40523 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 wenzelm parents: 
40290diff
changeset | 106 | rep(many1(sym => sym != quote && sym != "\\") | | 
| 34189 | 107 |               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
 | 
| 108 | parseAll(content ^^ (_.mkString), body).get | |
| 109 | } | |
| 110 | else body | |
| 34140 | 111 | } | 
| 112 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 113 | def quoted_line(quote: Symbol.Symbol, ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 114 |     {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 115 |       ctxt match {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 116 | case Finished => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 117 | quote ~ quoted_body(quote) ~ opt_term(quote) ^^ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 118 |             { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 119 | case x ~ y ~ None => (x + y, Quoted(quote)) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 120 | case Quoted(q) if q == quote => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 121 | quoted_body(quote) ~ opt_term(quote) ^^ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 122 |             { case x ~ Some(y) => (x + y, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 123 | case x ~ None => (x, ctxt) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 124 |         case _ => failure("")
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 125 | } | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 126 |     }.named("quoted_line")
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 127 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 128 | def recover_quoted(quote: Symbol.Symbol): Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 129 |       quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
 | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
48409diff
changeset | 130 | |
| 34140 | 131 | |
| 34143 | 132 | /* verbatim text */ | 
| 133 | ||
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 134 | private def verbatim_body: Parser[String] = | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 135 | rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString) | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 136 | |
| 55494 | 137 | def verbatim: Parser[String] = | 
| 34143 | 138 |     {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 139 |       "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 140 |     }.named("verbatim")
 | 
| 34140 | 141 | |
| 142 | def verbatim_content(source: String): String = | |
| 143 |     {
 | |
| 144 | require(parseAll(verbatim, source).successful) | |
| 145 | source.substring(2, source.length - 2) | |
| 146 | } | |
| 147 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 148 | def verbatim_line(ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 149 |     {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 150 |       ctxt match {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 151 | case Finished => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 152 |           "{*" ~ verbatim_body ~ opt_term("*}") ^^
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 153 |             { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 154 | case x ~ y ~ None => (x + y, Verbatim) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 155 | case Verbatim => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 156 |           verbatim_body ~ opt_term("*}") ^^
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 157 |             { case x ~ Some(y) => (x + y, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 158 | case x ~ None => (x, Verbatim) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 159 |         case _ => failure("")
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 160 | } | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 161 |     }.named("verbatim_line")
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 162 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 163 | val recover_verbatim: Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 164 |       "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
 | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
48409diff
changeset | 165 | |
| 34140 | 166 | |
| 55033 | 167 | /* nested text cartouches */ | 
| 168 | ||
| 169 | private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] | |
| 170 |     {
 | |
| 171 | require(depth >= 0) | |
| 172 | ||
| 173 | def apply(in: Input) = | |
| 174 |       {
 | |
| 175 | val start = in.offset | |
| 176 | val end = in.source.length | |
| 177 | val matcher = new Symbol.Matcher(in.source) | |
| 178 | ||
| 179 | var i = start | |
| 180 | var d = depth | |
| 181 | var finished = false | |
| 55034 | 182 |         while (!finished && i < end) {
 | 
| 183 | val n = matcher(i, end) | |
| 184 | val sym = in.source.subSequence(i, i + n).toString | |
| 185 |           if (Symbol.is_open(sym)) { i += n; d += 1 }
 | |
| 186 |           else if (d > 0) { i += n; if (Symbol.is_close(sym)) d -= 1 }
 | |
| 55033 | 187 | else finished = true | 
| 188 | } | |
| 189 |         if (i == start) Failure("bad input", in)
 | |
| 190 | else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start)) | |
| 191 | } | |
| 192 |     }.named("cartouche_depth")
 | |
| 193 | ||
| 194 | def cartouche: Parser[String] = | |
| 195 |       cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
 | |
| 196 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 197 | def cartouche_line(ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 55033 | 198 |     {
 | 
| 199 | val depth = | |
| 200 |         ctxt match {
 | |
| 201 | case Finished => 0 | |
| 202 | case Cartouche(d) => d | |
| 203 | case _ => -1 | |
| 204 | } | |
| 205 | if (depth >= 0) | |
| 206 | cartouche_depth(depth) ^^ | |
| 207 |           { case (x, 0) => (x, Finished)
 | |
| 208 | case (x, d) => (x, Cartouche(d)) } | |
| 209 |       else failure("")
 | |
| 210 | } | |
| 211 | ||
| 212 | val recover_cartouche: Parser[String] = | |
| 213 | cartouche_depth(0) ^^ (_._1) | |
| 214 | ||
| 215 | def cartouche_content(source: String): String = | |
| 216 |     {
 | |
| 217 |       def err(): Nothing = error("Malformed text cartouche: " + quote(source))
 | |
| 218 | val source1 = | |
| 219 | Library.try_unprefix(Symbol.open_decoded, source) orElse | |
| 220 | Library.try_unprefix(Symbol.open, source) getOrElse err() | |
| 221 | Library.try_unsuffix(Symbol.close_decoded, source1) orElse | |
| 222 | Library.try_unsuffix(Symbol.close, source1) getOrElse err() | |
| 223 | } | |
| 224 | ||
| 225 | ||
| 34143 | 226 | /* nested comments */ | 
| 227 | ||
| 43412 | 228 | private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] | 
| 34143 | 229 |     {
 | 
| 43412 | 230 | require(depth >= 0) | 
| 231 | ||
| 34143 | 232 | val comment_text = | 
| 42441 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 wenzelm parents: 
40523diff
changeset | 233 |         rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
 | 
| 34143 | 234 | |
| 235 | def apply(in: Input) = | |
| 236 |       {
 | |
| 237 | var rest = in | |
| 42441 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 wenzelm parents: 
40523diff
changeset | 238 | def try_parse[A](p: Parser[A]): Boolean = | 
| 34143 | 239 |         {
 | 
| 42441 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 wenzelm parents: 
40523diff
changeset | 240 |           parse(p ^^^ (), rest) match {
 | 
| 34143 | 241 |             case Success(_, next) => { rest = next; true }
 | 
| 242 | case _ => false | |
| 243 | } | |
| 244 | } | |
| 43412 | 245 | var d = depth | 
| 34143 | 246 | var finished = false | 
| 247 |         while (!finished) {
 | |
| 43412 | 248 |           if (try_parse("(*")) d += 1
 | 
| 249 |           else if (d > 0 && try_parse("*)")) d -= 1
 | |
| 250 | else if (d == 0 || !try_parse(comment_text)) finished = true | |
| 34143 | 251 | } | 
| 43412 | 252 | if (in.offset < rest.offset) | 
| 253 | Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest) | |
| 34143 | 254 |         else Failure("comment expected", in)
 | 
| 255 | } | |
| 43412 | 256 |     }.named("comment_depth")
 | 
| 257 | ||
| 258 | def comment: Parser[String] = | |
| 259 |       comment_depth(0) ^? { case (x, d) if d == 0 => x }
 | |
| 260 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 261 | def comment_line(ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 43412 | 262 |     {
 | 
| 263 | val depth = | |
| 264 |         ctxt match {
 | |
| 265 | case Finished => 0 | |
| 266 | case Comment(d) => d | |
| 267 | case _ => -1 | |
| 268 | } | |
| 269 | if (depth >= 0) | |
| 270 | comment_depth(depth) ^^ | |
| 271 |           { case (x, 0) => (x, Finished)
 | |
| 272 | case (x, d) => (x, Comment(d)) } | |
| 273 |       else failure("")
 | |
| 274 | } | |
| 34143 | 275 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 276 | val recover_comment: Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 277 | comment_depth(0) ^^ (_._1) | 
| 34143 | 278 | |
| 55033 | 279 | def comment_content(source: String): String = | 
| 280 |     {
 | |
| 281 | require(parseAll(comment, source).successful) | |
| 282 | source.substring(2, source.length - 2) | |
| 283 | } | |
| 284 | ||
| 34143 | 285 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 286 | /* keyword */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 287 | |
| 55497 | 288 | def literal(lexicon: Lexicon): Parser[String] = new Parser[String] | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 289 |     {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 290 | def apply(in: Input) = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 291 |       {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 292 | val result = lexicon.scan(in) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 293 |         if (result.isEmpty) Failure("keyword expected", in)
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 294 | else Success(result, in.drop(result.length)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 295 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 296 |     }.named("keyword")
 | 
| 31649 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 297 | } | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 298 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 299 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 300 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 301 | /** Lexicon -- position tree **/ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 302 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 303 | object Lexicon | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 304 |   {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 305 | /* representation */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 306 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 307 | private sealed case class Tree(val branches: Map[Char, (String, Tree)]) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 308 | private val empty_tree = Tree(Map()) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 309 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 310 | val empty: Lexicon = new Lexicon(empty_tree) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 311 | def apply(elems: String*): Lexicon = empty ++ elems | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 312 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 313 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 314 | final class Lexicon private(rep: Lexicon.Tree) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 315 |   {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 316 | /* auxiliary operations */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 317 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 318 | private def content(tree: Lexicon.Tree, result: List[String]): List[String] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 319 | (result /: tree.branches.toList) ((res, entry) => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 320 |         entry match { case (_, (s, tr)) =>
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 321 | if (s.isEmpty) content(tr, res) else content(tr, s :: res) }) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 322 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 323 | private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 324 |     {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 325 | val len = str.length | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 326 | def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 327 |       {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 328 |         if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 329 |           tree.branches.get(str.charAt(i)) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 330 | case Some((s, tr)) => look(tr, !s.isEmpty, i + 1) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 331 | case None => None | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 332 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 333 | } else Some(tip, tree) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 334 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 335 | look(rep, false, 0) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 336 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 337 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 338 | def completions(str: CharSequence): List[String] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 339 |       lookup(str) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 340 | case Some((true, tree)) => content(tree, List(str.toString)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 341 | case Some((false, tree)) => content(tree, Nil) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 342 | case None => Nil | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 343 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 344 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 345 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 346 | /* pseudo Set methods */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 347 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 348 | def iterator: Iterator[String] = content(rep, Nil).sorted.iterator | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 349 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 350 |     override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 351 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 352 | def empty: Lexicon = Lexicon.empty | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 353 | def isEmpty: Boolean = rep.branches.isEmpty | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 354 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 355 | def contains(elem: String): Boolean = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 356 |       lookup(elem) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 357 | case Some((tip, _)) => tip | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 358 | case _ => false | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 359 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 360 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 361 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 362 | /* add elements */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 363 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 364 | def + (elem: String): Lexicon = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 365 | if (contains(elem)) this | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 366 |       else {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 367 | val len = elem.length | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 368 | def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 369 |           if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 370 | val c = elem.charAt(i) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 371 | val end = (i + 1 == len) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 372 |             tree.branches.get(c) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 373 | case Some((s, tr)) => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 374 | Lexicon.Tree(tree.branches + | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 375 | (c -> (if (end) elem else s, extend(tr, i + 1)))) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 376 | case None => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 377 | Lexicon.Tree(tree.branches + | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 378 | (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1)))) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 379 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 380 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 381 | else tree | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 382 | new Lexicon(extend(rep, 0)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 383 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 384 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 385 | def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 386 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 387 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 388 | /* scan */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 389 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 390 | def scan(in: Reader[Char]): String = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 391 |     {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 392 | val source = in.source | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 393 | val offset = in.offset | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 394 | val len = source.length - offset | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 395 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 396 | def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 397 |       {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 398 |         if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 399 |           tree.branches.get(source.charAt(offset + i)) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 400 | case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 401 | case None => result | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 402 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 403 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 404 | else result | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 405 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 406 | scan_tree(rep, "", 0) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 407 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 408 | } | 
| 31648 | 409 | } |