| author | wenzelm | 
| Fri, 01 Apr 2022 23:19:12 +0200 | |
| changeset 75394 | 42267c650205 | 
| parent 75393 | 87ebf5a50283 | 
| child 78243 | 0e221a8128e4 | 
| permissions | -rw-r--r-- | 
| 31648 | 1  | 
/* Title: Pure/General/scan.scala  | 
2  | 
Author: Makarius  | 
|
3  | 
||
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40290 
diff
changeset
 | 
4  | 
Efficient scanning of keywords and tokens.  | 
| 31648 | 5  | 
*/  | 
6  | 
||
7  | 
package isabelle  | 
|
8  | 
||
| 
34187
 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 
wenzelm 
parents: 
34157 
diff
changeset
 | 
9  | 
|
| 55980 | 10  | 
import scala.annotation.tailrec  | 
| 73357 | 11  | 
import scala.collection.IndexedSeq  | 
| 71601 | 12  | 
import scala.util.matching.Regex  | 
| 64824 | 13  | 
import scala.util.parsing.input.{OffsetPosition, Position => InputPosition,
 | 
| 73136 | 14  | 
Reader, CharSequenceReader, PagedSeq}  | 
| 31648 | 15  | 
import scala.util.parsing.combinator.RegexParsers  | 
| 56822 | 16  | 
import java.io.{File => JFile, BufferedInputStream, FileInputStream, InputStream}
 | 
17  | 
import java.net.URL  | 
|
| 
34187
 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 
wenzelm 
parents: 
34157 
diff
changeset
 | 
18  | 
|
| 31648 | 19  | 
|
| 75393 | 20  | 
object Scan {
 | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
21  | 
/** context of partial line-oriented scans **/  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
22  | 
|
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
23  | 
abstract class Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
24  | 
case object Finished extends Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
25  | 
case class Quoted(quote: String) extends Line_Context  | 
| 
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
26  | 
case class Cartouche(depth: Int) extends Line_Context  | 
| 67438 | 27  | 
case class Comment_Prefix(symbol: Symbol.Symbol) extends Line_Context  | 
| 
67365
 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 
wenzelm 
parents: 
67364 
diff
changeset
 | 
28  | 
case class Cartouche_Comment(depth: Int) extends Line_Context  | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
29  | 
case class Comment(depth: Int) extends Line_Context  | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
30  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
31  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
32  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
33  | 
/** parser combinators **/  | 
| 31650 | 34  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
35  | 
object Parsers extends Parsers  | 
| 
36011
 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 
wenzelm 
parents: 
34316 
diff
changeset
 | 
36  | 
|
| 75393 | 37  | 
  trait Parsers extends RegexParsers {
 | 
| 71601 | 38  | 
override val whiteSpace: Regex = "".r  | 
| 
31649
 
a11ea667d676
reorganized and abstracted version, via Set trait;
 
wenzelm 
parents: 
31648 
diff
changeset
 | 
39  | 
|
| 34137 | 40  | 
|
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
41  | 
/* optional termination */  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
42  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
43  | 
def opt_term[T](p: => Parser[T]): Parser[Option[T]] =  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
44  | 
p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None)  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
45  | 
|
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
46  | 
|
| 55497 | 47  | 
/* repeated symbols */  | 
| 34137 | 48  | 
|
| 55497 | 49  | 
def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] =  | 
| 75393 | 50  | 
      new Parser[String] {
 | 
51  | 
        def apply(in: Input) = {
 | 
|
| 34137 | 52  | 
val start = in.offset  | 
53  | 
val end = in.source.length  | 
|
54  | 
val matcher = new Symbol.Matcher(in.source)  | 
|
55  | 
||
56  | 
var i = start  | 
|
57  | 
var count = 0  | 
|
58  | 
var finished = false  | 
|
| 55034 | 59  | 
          while (!finished && i < end && count < max_count) {
 | 
| 
75237
 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 
wenzelm 
parents: 
74887 
diff
changeset
 | 
60  | 
val sym = matcher.match_symbol(i)  | 
| 
 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 
wenzelm 
parents: 
74887 
diff
changeset
 | 
61  | 
            if (pred(sym)) { i += sym.length; count += 1 }
 | 
| 34137 | 62  | 
else finished = true  | 
63  | 
}  | 
|
| 34140 | 64  | 
          if (count < min_count) Failure("bad input", in)
 | 
65  | 
else Success(in.source.subSequence(start, i).toString, in.drop(i - start))  | 
|
| 34137 | 66  | 
}  | 
| 55497 | 67  | 
      }.named("repeated")
 | 
| 34137 | 68  | 
|
| 43696 | 69  | 
def one(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
| 55497 | 70  | 
repeated(pred, 1, 1)  | 
| 43696 | 71  | 
|
| 66922 | 72  | 
def maybe(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
73  | 
repeated(pred, 0, 1)  | 
|
74  | 
||
| 43696 | 75  | 
def many(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
| 55497 | 76  | 
repeated(pred, 0, Integer.MAX_VALUE)  | 
| 43696 | 77  | 
|
78  | 
def many1(pred: Symbol.Symbol => Boolean): Parser[String] =  | 
|
| 55497 | 79  | 
repeated(pred, 1, Integer.MAX_VALUE)  | 
80  | 
||
81  | 
||
82  | 
/* character */  | 
|
83  | 
||
84  | 
def character(pred: Char => Boolean): Symbol.Symbol => Boolean =  | 
|
85  | 
(s: Symbol. Symbol) => s.length == 1 && pred(s.charAt(0))  | 
|
| 34140 | 86  | 
|
87  | 
||
| 34143 | 88  | 
/* quoted strings */  | 
| 34140 | 89  | 
|
| 75393 | 90  | 
    private def quoted_body(quote: Symbol.Symbol): Parser[String] = {
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
91  | 
rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" |  | 
| 60215 | 92  | 
        ("""\\\d\d\d""".r ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
93  | 
}  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
94  | 
|
| 75393 | 95  | 
    def quoted(quote: Symbol.Symbol): Parser[String] = {
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
96  | 
      quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 97  | 
    }.named("quoted")
 | 
| 34140 | 98  | 
|
| 75393 | 99  | 
    def quoted_content(quote: Symbol.Symbol, source: String): String = {
 | 
| 
73120
 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 
wenzelm 
parents: 
71601 
diff
changeset
 | 
100  | 
require(parseAll(quoted(quote), source).successful, "no quoted text")  | 
| 34189 | 101  | 
val body = source.substring(1, source.length - 1)  | 
102  | 
      if (body.exists(_ == '\\')) {
 | 
|
103  | 
val content =  | 
|
| 
40523
 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 
wenzelm 
parents: 
40290 
diff
changeset
 | 
104  | 
rep(many1(sym => sym != quote && sym != "\\") |  | 
| 34189 | 105  | 
              "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
 | 
106  | 
parseAll(content ^^ (_.mkString), body).get  | 
|
107  | 
}  | 
|
108  | 
else body  | 
|
| 34140 | 109  | 
}  | 
110  | 
||
| 75393 | 111  | 
    def quoted_line(quote: Symbol.Symbol, ctxt: Line_Context): Parser[(String, Line_Context)] = {
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
112  | 
      ctxt match {
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
113  | 
case Finished =>  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
114  | 
quote ~ quoted_body(quote) ~ opt_term(quote) ^^  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
115  | 
            { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
116  | 
case x ~ y ~ None => (x + y, Quoted(quote)) }  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
117  | 
case Quoted(q) if q == quote =>  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
118  | 
quoted_body(quote) ~ opt_term(quote) ^^  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
119  | 
            { case x ~ Some(y) => (x + y, Finished)
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
120  | 
case x ~ None => (x, ctxt) }  | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
121  | 
        case _ => failure("")
 | 
| 
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
122  | 
}  | 
| 
55510
 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 
wenzelm 
parents: 
55499 
diff
changeset
 | 
123  | 
    }.named("quoted_line")
 | 
| 
43411
 
0206466ee473
some support for partial scans with explicit context;
 
wenzelm 
parents: 
42718 
diff
changeset
 | 
124  | 
|
| 
48763
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
125  | 
def recover_quoted(quote: Symbol.Symbol): Parser[String] =  | 
| 
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
126  | 
      quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
 | 
| 
48743
 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 
wenzelm 
parents: 
48409 
diff
changeset
 | 
127  | 
|
| 34140 | 128  | 
|
| 55033 | 129  | 
/* nested text cartouches */  | 
130  | 
||
| 75393 | 131  | 
    def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] {
 | 
| 
73120
 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 
wenzelm 
parents: 
71601 
diff
changeset
 | 
132  | 
require(depth >= 0, "bad cartouche depth")  | 
| 55033 | 133  | 
|
| 75393 | 134  | 
      def apply(in: Input) = {
 | 
| 55033 | 135  | 
val start = in.offset  | 
136  | 
val end = in.source.length  | 
|
137  | 
val matcher = new Symbol.Matcher(in.source)  | 
|
138  | 
||
139  | 
var i = start  | 
|
140  | 
var d = depth  | 
|
141  | 
var finished = false  | 
|
| 55034 | 142  | 
        while (!finished && i < end) {
 | 
| 
75237
 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 
wenzelm 
parents: 
74887 
diff
changeset
 | 
143  | 
val sym = matcher.match_symbol(i)  | 
| 
 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 
wenzelm 
parents: 
74887 
diff
changeset
 | 
144  | 
val n = sym.length  | 
| 55034 | 145  | 
          if (Symbol.is_open(sym)) { i += n; d += 1 }
 | 
| 
58505
 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 
wenzelm 
parents: 
56827 
diff
changeset
 | 
146  | 
          else if (Symbol.is_close(sym) && d > 0) { i += n; d -= 1; if (d == 0) finished = true }
 | 
| 
 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 
wenzelm 
parents: 
56827 
diff
changeset
 | 
147  | 
else if (d > 0) i += n  | 
| 55033 | 148  | 
else finished = true  | 
149  | 
}  | 
|
150  | 
        if (i == start) Failure("bad input", in)
 | 
|
151  | 
else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start))  | 
|
152  | 
}  | 
|
153  | 
    }.named("cartouche_depth")
 | 
|
154  | 
||
155  | 
def cartouche: Parser[String] =  | 
|
156  | 
      cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
 | 
|
157  | 
||
| 75393 | 158  | 
    def cartouche_line(ctxt: Line_Context): Parser[(String, Line_Context)] = {
 | 
| 67364 | 159  | 
def cartouche_context(d: Int): Line_Context =  | 
160  | 
if (d == 0) Finished else Cartouche(d)  | 
|
161  | 
||
162  | 
      ctxt match {
 | 
|
163  | 
case Finished =>  | 
|
164  | 
          cartouche_depth(0) ^^ { case (c, d) => (c, cartouche_context(d)) }
 | 
|
165  | 
case Cartouche(depth) =>  | 
|
166  | 
          cartouche_depth(depth) ^^ { case (c, d) => (c, cartouche_context(d)) }
 | 
|
167  | 
        case _ => failure("")
 | 
|
168  | 
}  | 
|
| 55033 | 169  | 
}  | 
170  | 
||
171  | 
val recover_cartouche: Parser[String] =  | 
|
172  | 
cartouche_depth(0) ^^ (_._1)  | 
|
173  | 
||
| 75393 | 174  | 
    def cartouche_content(source: String): String = {
 | 
| 55033 | 175  | 
      def err(): Nothing = error("Malformed text cartouche: " + quote(source))
 | 
176  | 
val source1 =  | 
|
177  | 
Library.try_unprefix(Symbol.open_decoded, source) orElse  | 
|
178  | 
Library.try_unprefix(Symbol.open, source) getOrElse err()  | 
|
179  | 
Library.try_unsuffix(Symbol.close_decoded, source1) orElse  | 
|
180  | 
Library.try_unsuffix(Symbol.close, source1) getOrElse err()  | 
|
181  | 
}  | 
|
182  | 
||
183  | 
||
| 34143 | 184  | 
/* nested comments */  | 
185  | 
||
| 75393 | 186  | 
    private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] {
 | 
| 
73120
 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 
wenzelm 
parents: 
71601 
diff
changeset
 | 
187  | 
require(depth >= 0, "bad comment depth")  | 
| 43412 | 188  | 
|
| 71601 | 189  | 
val comment_text: Parser[List[String]] =  | 
| 
42441
 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 
wenzelm 
parents: 
40523 
diff
changeset
 | 
190  | 
        rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
 | 
| 34143 | 191  | 
|
| 75393 | 192  | 
      def apply(in: Input) = {
 | 
| 34143 | 193  | 
var rest = in  | 
| 75393 | 194  | 
        def try_parse[A](p: Parser[A]): Boolean = {
 | 
| 
56663
 
2d09b437c168
avoid "Adaptation of argument list by inserting ()" -- deprecated in scala-2.11.0;
 
wenzelm 
parents: 
55980 
diff
changeset
 | 
195  | 
          parse(p ^^^ (()), rest) match {
 | 
| 34143 | 196  | 
            case Success(_, next) => { rest = next; true }
 | 
197  | 
case _ => false  | 
|
198  | 
}  | 
|
199  | 
}  | 
|
| 43412 | 200  | 
var d = depth  | 
| 34143 | 201  | 
var finished = false  | 
202  | 
        while (!finished) {
 | 
|
| 43412 | 203  | 
          if (try_parse("(*")) d += 1
 | 
| 
58505
 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 
wenzelm 
parents: 
56827 
diff
changeset
 | 
204  | 
          else if (d > 0 && try_parse("*)")) { d -= 1; if (d == 0) finished = true }
 | 
| 43412 | 205  | 
else if (d == 0 || !try_parse(comment_text)) finished = true  | 
| 34143 | 206  | 
}  | 
| 43412 | 207  | 
if (in.offset < rest.offset)  | 
208  | 
Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest)  | 
|
| 34143 | 209  | 
        else Failure("comment expected", in)
 | 
210  | 
}  | 
|
| 43412 | 211  | 
    }.named("comment_depth")
 | 
212  | 
||
213  | 
def comment: Parser[String] =  | 
|
214  | 
      comment_depth(0) ^? { case (x, d) if d == 0 => x }
 | 
|
215  | 
||
| 75393 | 216  | 
    def comment_line(ctxt: Line_Context): Parser[(String, Line_Context)] = {
 | 
| 43412 | 217  | 
val depth =  | 
218  | 
        ctxt match {
 | 
|
219  | 
case Finished => 0  | 
|
220  | 
case Comment(d) => d  | 
|
221  | 
case _ => -1  | 
|
222  | 
}  | 
|
223  | 
if (depth >= 0)  | 
|
224  | 
comment_depth(depth) ^^  | 
|
225  | 
          { case (x, 0) => (x, Finished)
 | 
|
226  | 
case (x, d) => (x, Comment(d)) }  | 
|
227  | 
      else failure("")
 | 
|
228  | 
}  | 
|
| 34143 | 229  | 
|
| 
48763
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
230  | 
val recover_comment: Parser[String] =  | 
| 
 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 
wenzelm 
parents: 
48754 
diff
changeset
 | 
231  | 
comment_depth(0) ^^ (_._1)  | 
| 34143 | 232  | 
|
| 75393 | 233  | 
    def comment_content(source: String): String = {
 | 
| 
73120
 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 
wenzelm 
parents: 
71601 
diff
changeset
 | 
234  | 
require(parseAll(comment, source).successful, "no comment")  | 
| 55033 | 235  | 
source.substring(2, source.length - 2)  | 
236  | 
}  | 
|
237  | 
||
| 34143 | 238  | 
|
| 
74373
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
239  | 
/* control cartouches */  | 
| 
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
240  | 
|
| 
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
241  | 
val control_symbol: Parser[String] = one(Symbol.is_control)  | 
| 
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
242  | 
|
| 
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
243  | 
    val control_cartouche: Parser[String] = control_symbol ~ cartouche ^^ { case a ~ b => a + b }
 | 
| 
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
244  | 
|
| 
 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 
wenzelm 
parents: 
73367 
diff
changeset
 | 
245  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
246  | 
/* keyword */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
247  | 
|
| 75393 | 248  | 
    def literal(lexicon: Lexicon): Parser[String] = new Parser[String] {
 | 
249  | 
      def apply(in: Input) = {
 | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
250  | 
val result = lexicon.scan(in)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
251  | 
        if (result.isEmpty) Failure("keyword expected", in)
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
252  | 
else Success(result, in.drop(result.length))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
253  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
254  | 
    }.named("keyword")
 | 
| 
31649
 
a11ea667d676
reorganized and abstracted version, via Set trait;
 
wenzelm 
parents: 
31648 
diff
changeset
 | 
255  | 
}  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
256  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
257  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
258  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
259  | 
/** Lexicon -- position tree **/  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
260  | 
|
| 75393 | 261  | 
  object Lexicon {
 | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
262  | 
/* representation */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
263  | 
|
| 60215 | 264  | 
private sealed case class Tree(branches: Map[Char, (String, Tree)])  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
265  | 
private val empty_tree = Tree(Map())  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
266  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
267  | 
val empty: Lexicon = new Lexicon(empty_tree)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
268  | 
def apply(elems: String*): Lexicon = empty ++ elems  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
269  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
270  | 
|
| 75393 | 271  | 
  final class Lexicon private(rep: Lexicon.Tree) {
 | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
272  | 
/* auxiliary operations */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
273  | 
|
| 59073 | 274  | 
private def dest(tree: Lexicon.Tree, result: List[String]): List[String] =  | 
| 73359 | 275  | 
      tree.branches.toList.foldLeft(result) {
 | 
276  | 
case (res, (_, (s, tr))) => if (s.isEmpty) dest(tr, res) else dest(tr, s :: res)  | 
|
277  | 
}  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
278  | 
|
| 75393 | 279  | 
    private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] = {
 | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
280  | 
val len = str.length  | 
| 75393 | 281  | 
@tailrec  | 
282  | 
      def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] = {
 | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
283  | 
        if (i < len) {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
284  | 
          tree.branches.get(str.charAt(i)) match {
 | 
| 59319 | 285  | 
case Some((s, tr)) => look(tr, s.nonEmpty, i + 1)  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
286  | 
case None => None  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
287  | 
}  | 
| 55980 | 288  | 
}  | 
289  | 
else Some(tip, tree)  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
290  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
291  | 
look(rep, false, 0)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
292  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
293  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
294  | 
def completions(str: CharSequence): List[String] =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
295  | 
      lookup(str) match {
 | 
| 59073 | 296  | 
case Some((true, tree)) => dest(tree, List(str.toString))  | 
297  | 
case Some((false, tree)) => dest(tree, Nil)  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
298  | 
case None => Nil  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
299  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
300  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
301  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
302  | 
/* pseudo Set methods */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
303  | 
|
| 59073 | 304  | 
def raw_iterator: Iterator[String] = dest(rep, Nil).iterator  | 
305  | 
def iterator: Iterator[String] = dest(rep, Nil).sorted.iterator  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
306  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
307  | 
    override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
308  | 
|
| 59073 | 309  | 
def is_empty: Boolean = rep.branches.isEmpty  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
310  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
311  | 
def contains(elem: String): Boolean =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
312  | 
      lookup(elem) match {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
313  | 
case Some((tip, _)) => tip  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
314  | 
case _ => false  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
315  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
316  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
317  | 
|
| 59073 | 318  | 
/* build lexicon */  | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
319  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
320  | 
def + (elem: String): Lexicon =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
321  | 
if (contains(elem)) this  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
322  | 
      else {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
323  | 
val len = elem.length  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
324  | 
def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree =  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
325  | 
          if (i < len) {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
326  | 
val c = elem.charAt(i)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
327  | 
val end = (i + 1 == len)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
328  | 
            tree.branches.get(c) match {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
329  | 
case Some((s, tr)) =>  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
330  | 
Lexicon.Tree(tree.branches +  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
331  | 
(c -> (if (end) elem else s, extend(tr, i + 1))))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
332  | 
case None =>  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
333  | 
Lexicon.Tree(tree.branches +  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
334  | 
(c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1))))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
335  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
336  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
337  | 
else tree  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
338  | 
new Lexicon(extend(rep, 0))  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
339  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
340  | 
|
| 73362 | 341  | 
def ++ (elems: IterableOnce[String]): Lexicon =  | 
342  | 
elems.iterator.foldLeft(this)(_ + _)  | 
|
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
343  | 
|
| 59073 | 344  | 
def ++ (other: Lexicon): Lexicon =  | 
345  | 
if (this eq other) this  | 
|
346  | 
else if (is_empty) other  | 
|
347  | 
else this ++ other.raw_iterator  | 
|
348  | 
||
| 73357 | 349  | 
def -- (remove: Iterable[String]): Lexicon =  | 
| 71601 | 350  | 
if (remove.exists(contains))  | 
| 63579 | 351  | 
Lexicon.empty ++ iterator.filterNot(a => remove.exists(b => a == b))  | 
352  | 
else this  | 
|
353  | 
||
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
354  | 
|
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
355  | 
/* scan */  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
356  | 
|
| 75393 | 357  | 
    def scan(in: Reader[Char]): String = {
 | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
358  | 
val source = in.source  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
359  | 
val offset = in.offset  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
360  | 
val len = source.length - offset  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
361  | 
|
| 75393 | 362  | 
      @tailrec def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String = {
 | 
| 
55492
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
363  | 
        if (i < len) {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
364  | 
          tree.branches.get(source.charAt(offset + i)) match {
 | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
365  | 
case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
366  | 
case None => result  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
367  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
368  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
369  | 
else result  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
370  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
371  | 
scan_tree(rep, "", 0)  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
372  | 
}  | 
| 
 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 
wenzelm 
parents: 
55034 
diff
changeset
 | 
373  | 
}  | 
| 56821 | 374  | 
|
375  | 
||
376  | 
||
| 56822 | 377  | 
/** read stream without decoding: efficient length operation **/  | 
| 56821 | 378  | 
|
| 75393 | 379  | 
  private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int) extends CharSequence {
 | 
| 56821 | 380  | 
def charAt(i: Int): Char =  | 
381  | 
if (0 <= i && i < length) seq(start + i)  | 
|
382  | 
else throw new IndexOutOfBoundsException  | 
|
383  | 
||
| 56822 | 384  | 
def length: Int = end - start // avoid expensive seq.length  | 
| 56821 | 385  | 
|
386  | 
def subSequence(i: Int, j: Int): CharSequence =  | 
|
387  | 
if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j)  | 
|
388  | 
else throw new IndexOutOfBoundsException  | 
|
389  | 
||
| 75393 | 390  | 
    override def toString: String = {
 | 
| 56821 | 391  | 
val buf = new StringBuilder(length)  | 
392  | 
for (offset <- start until end) buf.append(seq(offset))  | 
|
393  | 
buf.toString  | 
|
394  | 
}  | 
|
395  | 
}  | 
|
396  | 
||
| 
69393
 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 
wenzelm 
parents: 
67438 
diff
changeset
 | 
397  | 
abstract class Byte_Reader extends Reader[Char] with AutoCloseable  | 
| 56821 | 398  | 
|
| 75393 | 399  | 
  private def make_byte_reader(stream: InputStream, stream_length: Int): Byte_Reader = {
 | 
| 56822 | 400  | 
val buffered_stream = new BufferedInputStream(stream)  | 
| 75394 | 401  | 
val seq =  | 
402  | 
      new PagedSeq({ (buf: Array[Char], offset: Int, length: Int) =>
 | 
|
403  | 
var i = 0  | 
|
404  | 
var c = 0  | 
|
405  | 
var eof = false  | 
|
406  | 
        while (!eof && i < length) {
 | 
|
407  | 
c = buffered_stream.read  | 
|
408  | 
if (c == -1) eof = true  | 
|
409  | 
          else { buf(offset + i) = c.toChar; i += 1 }
 | 
|
410  | 
}  | 
|
411  | 
if (i > 0) i else -1  | 
|
412  | 
})  | 
|
| 56822 | 413  | 
val restricted_seq = new Restricted_Seq(seq, 0, stream_length)  | 
| 56821 | 414  | 
|
| 75393 | 415  | 
    class Paged_Reader(override val offset: Int) extends Byte_Reader {
 | 
| 56821 | 416  | 
override lazy val source: CharSequence = restricted_seq  | 
| 56827 | 417  | 
def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\u001a'  | 
| 56821 | 418  | 
def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this  | 
419  | 
def pos: InputPosition = new OffsetPosition(source, offset)  | 
|
420  | 
def atEnd: Boolean = !seq.isDefinedAt(offset)  | 
|
421  | 
override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n)  | 
|
| 73367 | 422  | 
def close(): Unit = buffered_stream.close()  | 
| 56821 | 423  | 
}  | 
424  | 
new Paged_Reader(0)  | 
|
425  | 
}  | 
|
| 56822 | 426  | 
|
427  | 
def byte_reader(file: JFile): Byte_Reader =  | 
|
428  | 
make_byte_reader(new FileInputStream(file), file.length.toInt)  | 
|
429  | 
||
| 75393 | 430  | 
  def byte_reader(url: URL): Byte_Reader = {
 | 
| 56822 | 431  | 
val connection = url.openConnection  | 
432  | 
val stream = connection.getInputStream  | 
|
433  | 
val stream_length = connection.getContentLength  | 
|
434  | 
make_byte_reader(stream, stream_length)  | 
|
435  | 
}  | 
|
| 64824 | 436  | 
|
| 
66918
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
437  | 
def reader_is_utf8(reader: Reader[Char]): Boolean =  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
438  | 
reader.isInstanceOf[Byte_Reader]  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
439  | 
|
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
440  | 
def reader_decode_utf8(is_utf8: Boolean, s: String): String =  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
441  | 
if (is_utf8) UTF8.decode_permissive(s) else s  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
442  | 
|
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
443  | 
def reader_decode_utf8(reader: Reader[Char], s: String): String =  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
444  | 
reader_decode_utf8(reader_is_utf8(reader), s)  | 
| 
 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 
wenzelm 
parents: 
64824 
diff
changeset
 | 
445  | 
|
| 64824 | 446  | 
|
447  | 
/* plain text reader */  | 
|
448  | 
||
449  | 
def char_reader(text: CharSequence): CharSequenceReader =  | 
|
450  | 
new CharSequenceReader(text)  | 
|
| 31648 | 451  | 
}  |