| author | Lars Hupel <lars.hupel@mytum.de> | 
| Tue, 14 Nov 2017 16:09:59 +0100 | |
| changeset 67076 | fc877448602e | 
| parent 66922 | 5a476a87a535 | 
| child 67364 | f74672cf83c6 | 
| permissions | -rw-r--r-- | 
| 31648 | 1  | 
/* Title: Pure/General/scan.scala  | 
2  | 
Author: Makarius  | 
|
3  | 
||
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40290 
diff
changeset
 | 
4  | 
Efficient scanning of keywords and tokens.  | 
| 31648 | 5  | 
*/  | 
6  | 
||
7  | 
package isabelle  | 
|
8  | 
||
| 
34187
 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 
wenzelm 
parents: 
34157 
diff
changeset
 | 
9  | 
|
| 55980 | 10  | 
import scala.annotation.tailrec  | 
| 63579 | 11  | 
import scala.collection.{IndexedSeq, Traversable, TraversableOnce}
 | 
| 
34187
 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 
wenzelm 
parents: 
34157 
diff
changeset
 | 
12  | 
import scala.collection.immutable.PagedSeq  | 
| 64824 | 13  | 
import scala.util.parsing.input.{OffsetPosition, Position => InputPosition,
 | 
14  | 
Reader, CharSequenceReader}  | 
|
| 31648 | 15  | 
import scala.util.parsing.combinator.RegexParsers  | 
16  | 
||
| 56822 | 17  | 
import java.io.{File => JFile, BufferedInputStream, FileInputStream, InputStream}
 | 
18  | 
import java.net.URL  | 
|
| 
34187
 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 
wenzelm 
parents: 
34157 
diff
changeset
 | 
19  | 
|
| 31648 | 20  | 
|
21  | 
object Scan  | 
|
22  | 
{
 | 
|
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
23  | 
/** context of partial line-oriented scans **/  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
24  | 
|
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
25  | 
abstract class Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
26  | 
case object Finished extends Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
27  | 
case class Quoted(quote: String) extends Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
28  | 
case object Verbatim extends Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
29  | 
case class Cartouche(depth: Int) extends Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
30  | 
case class Comment(depth: Int) extends Line_Context  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
31  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
32  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
33  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
34  | 
/** parser combinators **/  | 
| 31650 | 35  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
36  | 
object Parsers extends Parsers  | 
| 
36011
 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 
wenzelm 
parents: 
34316 
diff
changeset
 | 
37  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
38  | 
trait Parsers extends RegexParsers  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
39  | 
  {
 | 
| 
31649
 
a11ea667d676
reorganized and abstracted version, via Set trait;
 
wenzelm 
parents: 
31648 
diff
changeset
 | 
40  | 
override val whiteSpace = "".r  | 
| 
 
a11ea667d676
reorganized and abstracted version, via Set trait;
 
wenzelm 
parents: 
31648 
diff
changeset
 | 
41  | 
|
| 34137 | 42  | 
|
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
43  | 
/* optional termination */  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
44  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
45  | 
def opt_term[T](p: => Parser[T]): Parser[Option[T]] =  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
46  | 
p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
47  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
48  | 
|
| 55497 | 49  | 
/* repeated symbols */  | 
| 34137 | 50  | 
|
| 55497 | 51  | 
def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =  | 
| 34140 | 52  | 
new Parser[String]  | 
| 34135 | 53  | 
      {
 | 
| 34137 | 54  | 
def apply(in: Input) =  | 
55  | 
        {
 | 
|
56  | 
val start = in.offset  | 
|
57  | 
val end = in.source.length  | 
|
58  | 
val matcher = new Symbol.Matcher(in.source)  | 
|
59  | 
||
60  | 
var i = start  | 
|
61  | 
var count = 0  | 
|
62  | 
var finished = false  | 
|
| 55034 | 63  | 
          while (!finished && i < end && count < max_count) {
 | 
64  | 
val n = matcher(i, end)  | 
|
65  | 
val sym = in.source.subSequence(i, i + n).toString  | 
|
66  | 
            if (pred(sym)) { i += n; count += 1 }
 | 
|
| 34137 | 67  | 
else finished = true  | 
68  | 
}  | 
|
| 34140 | 69  | 
          if (count < min_count) Failure("bad input", in)
 | 
70  | 
else Success(in.source.subSequence(start, i).toString, in.drop(i - start))  | 
|
| 34137 | 71  | 
}  | 
| 55497 | 72  | 
      }.named("repeated")
 | 
| 34137 | 73  | 
|
| 43696 | 74  | 
def one(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
| 55497 | 75  | 
repeated(pred, 1, 1)  | 
| 43696 | 76  | 
|
| 66922 | 77  | 
def maybe(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
78  | 
repeated(pred, 0, 1)  | 
|
79  | 
||
| 43696 | 80  | 
def many(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
| 55497 | 81  | 
repeated(pred, 0, Integer.MAX_VALUE)  | 
| 43696 | 82  | 
|
83  | 
def many1(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
|
| 55497 | 84  | 
repeated(pred, 1, Integer.MAX_VALUE)  | 
85  | 
||
86  | 
||
87  | 
/* character */  | 
|
88  | 
||
89  | 
def character(pred: Char => Boolean): Symbol.Symbol => Boolean =  | 
|
90  | 
(s: Symbol. Symbol) => s.length == 1 && pred(s.charAt(0))  | 
|
| 34140 | 91  | 
|
92  | 
||
| 34143 | 93  | 
/* quoted strings */  | 
| 34140 | 94  | 
|
| 43696 | 95  | 
private def quoted_body(quote: Symbol.Symbol): Parser[String] =  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
96  | 
    {
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
97  | 
rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |  | 
| 60215 | 98  | 
        ("""\\\d\d\d""".r ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
99  | 
}  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
100  | 
|
| 55494 | 101  | 
def quoted(quote: Symbol.Symbol): Parser[String] =  | 
| 34143 | 102  | 
    {
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
103  | 
      quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 104  | 
    }.named("quoted")
 | 
| 34140 | 105  | 
|
| 43696 | 106  | 
def quoted_content(quote: Symbol.Symbol, source: String): String =  | 
| 34140 | 107  | 
    {
 | 
108  | 
require(parseAll(quoted(quote), source).successful)  | 
|
| 34189 | 109  | 
val body = source.substring(1, source.length - 1)  | 
110  | 
      if (body.exists(_ == '\\')) {
 | 
|
111  | 
val content =  | 
|
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40290 
diff
changeset
 | 
112  | 
rep(many1(sym => sym != quote && sym != "\\") |  | 
| 34189 | 113  | 
              "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
 | 
114  | 
parseAll(content ^^ (_.mkString), body).get  | 
|
115  | 
}  | 
|
116  | 
else body  | 
|
| 34140 | 117  | 
}  | 
118  | 
||
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
119  | 
def quoted_line(quote: Symbol.Symbol, ctxt: Line_Context): Parser[(String, Line_Context)] =  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
120  | 
    {
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
121  | 
      ctxt match {
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
122  | 
case Finished =>  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
123  | 
quote ~ quoted_body(quote) ~ opt_term(quote) ^^  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
124  | 
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
125  | 
case x ~ y ~ None => (x + y, Quoted(quote)) }  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
126  | 
case Quoted(q) if q == quote =>  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
127  | 
quoted_body(quote) ~ opt_term(quote) ^^  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
128  | 
            { case x ~ Some(y) => (x + y, Finished)
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
129  | 
case x ~ None => (x, ctxt) }  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
130  | 
        case _ => failure("")
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
131  | 
}  | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
132  | 
    }.named("quoted_line")
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
133  | 
|
| 
48763
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
134  | 
def recover_quoted(quote: Symbol.Symbol): Parser[String] =  | 
| 
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
135  | 
      quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
 | 
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
48409 
diff
changeset
 | 
136  | 
|
| 34140 | 137  | 
|
| 34143 | 138  | 
/* verbatim text */  | 
139  | 
||
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
140  | 
private def verbatim_body: Parser[String] =  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
141  | 
rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString)  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
142  | 
|
| 55494 | 143  | 
def verbatim: Parser[String] =  | 
| 34143 | 144  | 
    {
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
145  | 
      "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 146  | 
    }.named("verbatim")
 | 
| 34140 | 147  | 
|
148  | 
def verbatim_content(source: String): String =  | 
|
149  | 
    {
 | 
|
150  | 
require(parseAll(verbatim, source).successful)  | 
|
151  | 
source.substring(2, source.length - 2)  | 
|
152  | 
}  | 
|
153  | 
||
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
154  | 
def verbatim_line(ctxt: Line_Context): Parser[(String, Line_Context)] =  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
155  | 
    {
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
156  | 
      ctxt match {
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
157  | 
case Finished =>  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
158  | 
          "{*" ~ verbatim_body ~ opt_term("*}") ^^
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
159  | 
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
160  | 
case x ~ y ~ None => (x + y, Verbatim) }  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
161  | 
case Verbatim =>  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
162  | 
          verbatim_body ~ opt_term("*}") ^^
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
163  | 
            { case x ~ Some(y) => (x + y, Finished)
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
164  | 
case x ~ None => (x, Verbatim) }  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
165  | 
        case _ => failure("")
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
166  | 
}  | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
167  | 
    }.named("verbatim_line")
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
168  | 
|
| 
48763
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
169  | 
val recover_verbatim: Parser[String] =  | 
| 
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
170  | 
      "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
 | 
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
48409 
diff
changeset
 | 
171  | 
|
| 34140 | 172  | 
|
| 55033 | 173  | 
/* nested text cartouches */  | 
174  | 
||
175  | 
private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]  | 
|
176  | 
    {
 | 
|
177  | 
require(depth >= 0)  | 
|
178  | 
||
179  | 
def apply(in: Input) =  | 
|
180  | 
      {
 | 
|
181  | 
val start = in.offset  | 
|
182  | 
val end = in.source.length  | 
|
183  | 
val matcher = new Symbol.Matcher(in.source)  | 
|
184  | 
||
185  | 
var i = start  | 
|
186  | 
var d = depth  | 
|
187  | 
var finished = false  | 
|
| 55034 | 188  | 
        while (!finished && i < end) {
 | 
189  | 
val n = matcher(i, end)  | 
|
190  | 
val sym = in.source.subSequence(i, i + n).toString  | 
|
191  | 
          if (Symbol.is_open(sym)) { i += n; d += 1 }
 | 
|
| 
58505
 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 
wenzelm 
parents: 
56827 
diff
changeset
 | 
192  | 
          else if (Symbol.is_close(sym) && d > 0) { i += n; d -= 1; if (d == 0) finished = true }
 | 
| 
 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 
wenzelm 
parents: 
56827 
diff
changeset
 | 
193  | 
else if (d > 0) i += n  | 
| 55033 | 194  | 
else finished = true  | 
195  | 
}  | 
|
196  | 
        if (i == start) Failure("bad input", in)
 | 
|
197  | 
else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start))  | 
|
198  | 
}  | 
|
199  | 
    }.named("cartouche_depth")
 | 
|
200  | 
||
201  | 
def cartouche: Parser[String] =  | 
|
202  | 
      cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
 | 
|
203  | 
||
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
204  | 
def cartouche_line(ctxt: Line_Context): Parser[(String, Line_Context)] =  | 
| 55033 | 205  | 
    {
 | 
206  | 
val depth =  | 
|
207  | 
        ctxt match {
 | 
|
208  | 
case Finished => 0  | 
|
209  | 
case Cartouche(d) => d  | 
|
210  | 
case _ => -1  | 
|
211  | 
}  | 
|
212  | 
if (depth >= 0)  | 
|
213  | 
cartouche_depth(depth) ^^  | 
|
214  | 
          { case (x, 0) => (x, Finished)
 | 
|
215  | 
case (x, d) => (x, Cartouche(d)) }  | 
|
216  | 
      else failure("")
 | 
|
217  | 
}  | 
|
218  | 
||
219  | 
val recover_cartouche: Parser[String] =  | 
|
220  | 
cartouche_depth(0) ^^ (_._1)  | 
|
221  | 
||
222  | 
def cartouche_content(source: String): String =  | 
|
223  | 
    {
 | 
|
224  | 
      def err(): Nothing = error("Malformed text cartouche: " + quote(source))
 | 
|
225  | 
val source1 =  | 
|
226  | 
Library.try_unprefix(Symbol.open_decoded, source) orElse  | 
|
227  | 
Library.try_unprefix(Symbol.open, source) getOrElse err()  | 
|
228  | 
Library.try_unsuffix(Symbol.close_decoded, source1) orElse  | 
|
229  | 
Library.try_unsuffix(Symbol.close, source1) getOrElse err()  | 
|
230  | 
}  | 
|
231  | 
||
232  | 
||
| 34143 | 233  | 
/* nested comments */  | 
234  | 
||
| 43412 | 235  | 
private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)]  | 
| 34143 | 236  | 
    {
 | 
| 43412 | 237  | 
require(depth >= 0)  | 
238  | 
||
| 34143 | 239  | 
val comment_text =  | 
| 
42441
 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
240  | 
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
 | 
| 34143 | 241  | 
|
242  | 
def apply(in: Input) =  | 
|
243  | 
      {
 | 
|
244  | 
var rest = in  | 
|
| 
42441
 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
245  | 
def try_parse[A](p: Parser[A]): Boolean =  | 
| 34143 | 246  | 
        {
 | 
| 
56663
 
2d09b437c168
avoid "Adaptation of argument list by inserting ()" -- deprecated in scala-2.11.0;
 
wenzelm 
parents: 
55980 
diff
changeset
 | 
247  | 
          parse(p ^^^ (()), rest) match {
 | 
| 34143 | 248  | 
            case Success(_, next) => { rest = next; true }
 | 
249  | 
case _ => false  | 
|
250  | 
}  | 
|
251  | 
}  | 
|
| 43412 | 252  | 
var d = depth  | 
| 34143 | 253  | 
var finished = false  | 
254  | 
        while (!finished) {
 | 
|
| 43412 | 255  | 
          if (try_parse("(*")) d += 1
 | 
| 
58505
 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 
wenzelm 
parents: 
56827 
diff
changeset
 | 
256  | 
          else if (d > 0 && try_parse("*)")) { d -= 1; if (d == 0) finished = true }
 | 
| 43412 | 257  | 
else if (d == 0 || !try_parse(comment_text)) finished = true  | 
| 34143 | 258  | 
}  | 
| 43412 | 259  | 
if (in.offset < rest.offset)  | 
260  | 
Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)  | 
|
| 34143 | 261  | 
        else Failure("comment expected", in)
 | 
262  | 
}  | 
|
| 43412 | 263  | 
    }.named("comment_depth")
 | 
264  | 
||
265  | 
def comment: Parser[String] =  | 
|
266  | 
      comment_depth(0) ^? { case (x, d) if d == 0 => x }
 | 
|
267  | 
||
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
268  | 
def comment_line(ctxt: Line_Context): Parser[(String, Line_Context)] =  | 
| 43412 | 269  | 
    {
 | 
270  | 
val depth =  | 
|
271  | 
        ctxt match {
 | 
|
272  | 
case Finished => 0  | 
|
273  | 
case Comment(d) => d  | 
|
274  | 
case _ => -1  | 
|
275  | 
}  | 
|
276  | 
if (depth >= 0)  | 
|
277  | 
comment_depth(depth) ^^  | 
|
278  | 
          { case (x, 0) => (x, Finished)
 | 
|
279  | 
case (x, d) => (x, Comment(d)) }  | 
|
280  | 
      else failure("")
 | 
|
281  | 
}  | 
|
| 34143 | 282  | 
|
| 
48763
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
283  | 
val recover_comment: Parser[String] =  | 
| 
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
284  | 
comment_depth(0) ^^ (_._1)  | 
| 34143 | 285  | 
|
| 55033 | 286  | 
def comment_content(source: String): String =  | 
287  | 
    {
 | 
|
288  | 
require(parseAll(comment, source).successful)  | 
|
289  | 
source.substring(2, source.length - 2)  | 
|
290  | 
}  | 
|
291  | 
||
| 34143 | 292  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
293  | 
/* keyword */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
294  | 
|
| 55497 | 295  | 
def literal(lexicon: Lexicon): Parser[String] = new Parser[String]  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
296  | 
    {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
297  | 
def apply(in: Input) =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
298  | 
      {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
299  | 
val result = lexicon.scan(in)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
300  | 
        if (result.isEmpty) Failure("keyword expected", in)
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
301  | 
else Success(result, in.drop(result.length))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
302  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
303  | 
    }.named("keyword")
 | 
| 
31649
 
a11ea667d676
reorganized and abstracted version, via Set trait;
 
wenzelm 
parents: 
31648 
diff
changeset
 | 
304  | 
}  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
305  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
306  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
307  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
308  | 
/** Lexicon -- position tree **/  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
309  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
310  | 
object Lexicon  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
311  | 
  {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
312  | 
/* representation */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
313  | 
|
| 60215 | 314  | 
private sealed case class Tree(branches: Map[Char, (String, Tree)])  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
315  | 
private val empty_tree = Tree(Map())  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
316  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
317  | 
val empty: Lexicon = new Lexicon(empty_tree)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
318  | 
def apply(elems: String*): Lexicon = empty ++ elems  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
319  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
320  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
321  | 
final class Lexicon private(rep: Lexicon.Tree)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
322  | 
  {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
323  | 
/* auxiliary operations */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
324  | 
|
| 59073 | 325  | 
private def dest(tree: Lexicon.Tree, result: List[String]): List[String] =  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
326  | 
(result /: tree.branches.toList) ((res, entry) =>  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
327  | 
        entry match { case (_, (s, tr)) =>
 | 
| 59073 | 328  | 
if (s.isEmpty) dest(tr, res) else dest(tr, s :: res) })  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
329  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
330  | 
private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
331  | 
    {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
332  | 
val len = str.length  | 
| 55980 | 333  | 
@tailrec def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] =  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
334  | 
      {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
335  | 
        if (i < len) {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
336  | 
          tree.branches.get(str.charAt(i)) match {
 | 
| 59319 | 337  | 
case Some((s, tr)) => look(tr, s.nonEmpty, i + 1)  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
338  | 
case None => None  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
339  | 
}  | 
| 55980 | 340  | 
}  | 
341  | 
else Some(tip, tree)  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
342  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
343  | 
look(rep, false, 0)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
344  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
345  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
346  | 
def completions(str: CharSequence): List[String] =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
347  | 
      lookup(str) match {
 | 
| 59073 | 348  | 
case Some((true, tree)) => dest(tree, List(str.toString))  | 
349  | 
case Some((false, tree)) => dest(tree, Nil)  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
350  | 
case None => Nil  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
351  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
352  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
353  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
354  | 
/* pseudo Set methods */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
355  | 
|
| 59073 | 356  | 
def raw_iterator: Iterator[String] = dest(rep, Nil).iterator  | 
357  | 
def iterator: Iterator[String] = dest(rep, Nil).sorted.iterator  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
358  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
359  | 
    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
360  | 
|
| 59073 | 361  | 
def is_empty: Boolean = rep.branches.isEmpty  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
362  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
363  | 
def contains(elem: String): Boolean =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
364  | 
      lookup(elem) match {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
365  | 
case Some((tip, _)) => tip  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
366  | 
case _ => false  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
367  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
368  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
369  | 
|
| 59073 | 370  | 
/* build lexicon */  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
371  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
372  | 
def + (elem: String): Lexicon =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
373  | 
if (contains(elem)) this  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
374  | 
      else {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
375  | 
val len = elem.length  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
376  | 
def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
377  | 
          if (i < len) {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
378  | 
val c = elem.charAt(i)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
379  | 
val end = (i + 1 == len)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
380  | 
            tree.branches.get(c) match {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
381  | 
case Some((s, tr)) =>  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
382  | 
Lexicon.Tree(tree.branches +  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
383  | 
(c -> (if (end) elem else s, extend(tr, i + 1))))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
384  | 
case None =>  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
385  | 
Lexicon.Tree(tree.branches +  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
386  | 
(c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
387  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
388  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
389  | 
else tree  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
390  | 
new Lexicon(extend(rep, 0))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
391  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
392  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
393  | 
def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
394  | 
|
| 59073 | 395  | 
def ++ (other: Lexicon): Lexicon =  | 
396  | 
if (this eq other) this  | 
|
397  | 
else if (is_empty) other  | 
|
398  | 
else this ++ other.raw_iterator  | 
|
399  | 
||
| 63579 | 400  | 
def -- (remove: Traversable[String]): Lexicon =  | 
401  | 
if (remove.exists(contains(_)))  | 
|
402  | 
Lexicon.empty ++ iterator.filterNot(a => remove.exists(b => a == b))  | 
|
403  | 
else this  | 
|
404  | 
||
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
405  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
406  | 
/* scan */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
407  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
408  | 
def scan(in: Reader[Char]): String =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
409  | 
    {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
410  | 
val source = in.source  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
411  | 
val offset = in.offset  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
412  | 
val len = source.length - offset  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
413  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
414  | 
def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
415  | 
      {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
416  | 
        if (i < len) {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
417  | 
          tree.branches.get(source.charAt(offset + i)) match {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
418  | 
case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
419  | 
case None => result  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
420  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
421  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
422  | 
else result  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
423  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
424  | 
scan_tree(rep, "", 0)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
425  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
426  | 
}  | 
| 56821 | 427  | 
|
428  | 
||
429  | 
||
| 56822 | 430  | 
/** read stream without decoding: efficient length operation **/  | 
| 56821 | 431  | 
|
432  | 
private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int)  | 
|
433  | 
extends CharSequence  | 
|
434  | 
  {
 | 
|
435  | 
def charAt(i: Int): Char =  | 
|
436  | 
if (0 <= i && i < length) seq(start + i)  | 
|
437  | 
else throw new IndexOutOfBoundsException  | 
|
438  | 
||
| 56822 | 439  | 
def length: Int = end - start // avoid expensive seq.length  | 
| 56821 | 440  | 
|
441  | 
def subSequence(i: Int, j: Int): CharSequence =  | 
|
442  | 
if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j)  | 
|
443  | 
else throw new IndexOutOfBoundsException  | 
|
444  | 
||
445  | 
override def toString: String =  | 
|
446  | 
    {
 | 
|
447  | 
val buf = new StringBuilder(length)  | 
|
448  | 
for (offset <- start until end) buf.append(seq(offset))  | 
|
449  | 
buf.toString  | 
|
450  | 
}  | 
|
451  | 
}  | 
|
452  | 
||
453  | 
  abstract class Byte_Reader extends Reader[Char] { def close: Unit }
 | 
|
454  | 
||
| 56822 | 455  | 
private def make_byte_reader(stream: InputStream, stream_length: Int): Byte_Reader =  | 
| 56821 | 456  | 
  {
 | 
| 56822 | 457  | 
val buffered_stream = new BufferedInputStream(stream)  | 
| 56821 | 458  | 
val seq = new PagedSeq(  | 
459  | 
(buf: Array[Char], offset: Int, length: Int) =>  | 
|
460  | 
        {
 | 
|
461  | 
var i = 0  | 
|
462  | 
var c = 0  | 
|
463  | 
var eof = false  | 
|
464  | 
          while (!eof && i < length) {
 | 
|
| 56822 | 465  | 
c = buffered_stream.read  | 
| 56821 | 466  | 
if (c == -1) eof = true  | 
467  | 
            else { buf(offset + i) = c.toChar; i += 1 }
 | 
|
468  | 
}  | 
|
469  | 
if (i > 0) i else -1  | 
|
470  | 
})  | 
|
| 56822 | 471  | 
val restricted_seq = new Restricted_Seq(seq, 0, stream_length)  | 
| 56821 | 472  | 
|
473  | 
class Paged_Reader(override val offset: Int) extends Byte_Reader  | 
|
474  | 
    {
 | 
|
475  | 
override lazy val source: CharSequence = restricted_seq  | 
|
| 56827 | 476  | 
def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\u001a'  | 
| 56821 | 477  | 
def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this  | 
478  | 
def pos: InputPosition = new OffsetPosition(source, offset)  | 
|
479  | 
def atEnd: Boolean = !seq.isDefinedAt(offset)  | 
|
480  | 
override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n)  | 
|
| 56822 | 481  | 
      def close { buffered_stream.close }
 | 
| 56821 | 482  | 
}  | 
483  | 
new Paged_Reader(0)  | 
|
484  | 
}  | 
|
| 56822 | 485  | 
|
486  | 
def byte_reader(file: JFile): Byte_Reader =  | 
|
487  | 
make_byte_reader(new FileInputStream(file), file.length.toInt)  | 
|
488  | 
||
489  | 
def byte_reader(url: URL): Byte_Reader =  | 
|
490  | 
  {
 | 
|
491  | 
val connection = url.openConnection  | 
|
492  | 
val stream = connection.getInputStream  | 
|
493  | 
val stream_length = connection.getContentLength  | 
|
494  | 
make_byte_reader(stream, stream_length)  | 
|
495  | 
}  | 
|
| 64824 | 496  | 
|
| 
66918
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
497  | 
def reader_is_utf8(reader: Reader[Char]): Boolean =  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
498  | 
reader.isInstanceOf[Byte_Reader]  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
499  | 
|
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
500  | 
def reader_decode_utf8(is_utf8: Boolean, s: String): String =  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
501  | 
if (is_utf8) UTF8.decode_permissive(s) else s  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
502  | 
|
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
503  | 
def reader_decode_utf8(reader: Reader[Char], s: String): String =  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
504  | 
reader_decode_utf8(reader_is_utf8(reader), s)  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
505  | 
|
| 64824 | 506  | 
|
507  | 
/* plain text reader */  | 
|
508  | 
||
509  | 
def char_reader(text: CharSequence): CharSequenceReader =  | 
|
510  | 
new CharSequenceReader(text)  | 
|
| 31648 | 511  | 
}  |