| author | wenzelm | 
| Tue, 14 Mar 2017 16:20:07 +0100 | |
| changeset 65232 | ca571c8c0788 | 
| parent 64824 | 330ec9bc4b75 | 
| child 66918 | ec2b50aeb0dd | 
| permissions | -rw-r--r-- | 
| 31648 | 1 | /* Title: Pure/General/scan.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 40523 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 wenzelm parents: 
40290diff
changeset | 4 | Efficient scanning of keywords and tokens. | 
| 31648 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 9 | |
| 55980 | 10 | import scala.annotation.tailrec | 
| 63579 | 11 | import scala.collection.{IndexedSeq, Traversable, TraversableOnce}
 | 
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 12 | import scala.collection.immutable.PagedSeq | 
| 64824 | 13 | import scala.util.parsing.input.{OffsetPosition, Position => InputPosition,
 | 
| 14 | Reader, CharSequenceReader} | |
| 31648 | 15 | import scala.util.parsing.combinator.RegexParsers | 
| 16 | ||
| 56822 | 17 | import java.io.{File => JFile, BufferedInputStream, FileInputStream, InputStream}
 | 
| 18 | import java.net.URL | |
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 19 | |
| 31648 | 20 | |
| 21 | object Scan | |
| 22 | {
 | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 23 | /** context of partial line-oriented scans **/ | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 24 | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 25 | abstract class Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 26 | case object Finished extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 27 | case class Quoted(quote: String) extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 28 | case object Verbatim extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 29 | case class Cartouche(depth: Int) extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 30 | case class Comment(depth: Int) extends Line_Context | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 31 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 32 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 33 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 34 | /** parser combinators **/ | 
| 31650 | 35 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 36 | object Parsers extends Parsers | 
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 37 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 38 | trait Parsers extends RegexParsers | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 39 |   {
 | 
| 31649 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 40 | override val whiteSpace = "".r | 
| 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 41 | |
| 34137 | 42 | |
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 43 | /* optional termination */ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 44 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 45 | def opt_term[T](p: => Parser[T]): Parser[Option[T]] = | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 46 | p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None) | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 47 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 48 | |
| 55497 | 49 | /* repeated symbols */ | 
| 34137 | 50 | |
| 55497 | 51 | def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] = | 
| 34140 | 52 | new Parser[String] | 
| 34135 | 53 |       {
 | 
| 34137 | 54 | def apply(in: Input) = | 
| 55 |         {
 | |
| 56 | val start = in.offset | |
| 57 | val end = in.source.length | |
| 58 | val matcher = new Symbol.Matcher(in.source) | |
| 59 | ||
| 60 | var i = start | |
| 61 | var count = 0 | |
| 62 | var finished = false | |
| 55034 | 63 |           while (!finished && i < end && count < max_count) {
 | 
| 64 | val n = matcher(i, end) | |
| 65 | val sym = in.source.subSequence(i, i + n).toString | |
| 66 |             if (pred(sym)) { i += n; count += 1 }
 | |
| 34137 | 67 | else finished = true | 
| 68 | } | |
| 34140 | 69 |           if (count < min_count) Failure("bad input", in)
 | 
| 70 | else Success(in.source.subSequence(start, i).toString, in.drop(i - start)) | |
| 34137 | 71 | } | 
| 55497 | 72 |       }.named("repeated")
 | 
| 34137 | 73 | |
| 43696 | 74 | def one(pred: Symbol.Symbol => Boolean): Parser[String] = | 
| 55497 | 75 | repeated(pred, 1, 1) | 
| 43696 | 76 | |
| 77 | def many(pred: Symbol.Symbol => Boolean): Parser[String] = | |
| 55497 | 78 | repeated(pred, 0, Integer.MAX_VALUE) | 
| 43696 | 79 | |
| 80 | def many1(pred: Symbol.Symbol => Boolean): Parser[String] = | |
| 55497 | 81 | repeated(pred, 1, Integer.MAX_VALUE) | 
| 82 | ||
| 83 | ||
| 84 | /* character */ | |
| 85 | ||
| 86 | def character(pred: Char => Boolean): Symbol.Symbol => Boolean = | |
| 87 | (s: Symbol. Symbol) => s.length == 1 && pred(s.charAt(0)) | |
| 34140 | 88 | |
| 89 | ||
| 34143 | 90 | /* quoted strings */ | 
| 34140 | 91 | |
| 43696 | 92 | private def quoted_body(quote: Symbol.Symbol): Parser[String] = | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 93 |     {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 94 | rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" | | 
| 60215 | 95 |         ("""\\\d\d\d""".r ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 96 | } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 97 | |
| 55494 | 98 | def quoted(quote: Symbol.Symbol): Parser[String] = | 
| 34143 | 99 |     {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 100 |       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 101 |     }.named("quoted")
 | 
| 34140 | 102 | |
| 43696 | 103 | def quoted_content(quote: Symbol.Symbol, source: String): String = | 
| 34140 | 104 |     {
 | 
| 105 | require(parseAll(quoted(quote), source).successful) | |
| 34189 | 106 | val body = source.substring(1, source.length - 1) | 
| 107 |       if (body.exists(_ == '\\')) {
 | |
| 108 | val content = | |
| 40523 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 wenzelm parents: 
40290diff
changeset | 109 | rep(many1(sym => sym != quote && sym != "\\") | | 
| 34189 | 110 |               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
 | 
| 111 | parseAll(content ^^ (_.mkString), body).get | |
| 112 | } | |
| 113 | else body | |
| 34140 | 114 | } | 
| 115 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 116 | def quoted_line(quote: Symbol.Symbol, ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 117 |     {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 118 |       ctxt match {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 119 | case Finished => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 120 | quote ~ quoted_body(quote) ~ opt_term(quote) ^^ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 121 |             { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 122 | case x ~ y ~ None => (x + y, Quoted(quote)) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 123 | case Quoted(q) if q == quote => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 124 | quoted_body(quote) ~ opt_term(quote) ^^ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 125 |             { case x ~ Some(y) => (x + y, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 126 | case x ~ None => (x, ctxt) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 127 |         case _ => failure("")
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 128 | } | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 129 |     }.named("quoted_line")
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 130 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 131 | def recover_quoted(quote: Symbol.Symbol): Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 132 |       quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
 | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
48409diff
changeset | 133 | |
| 34140 | 134 | |
| 34143 | 135 | /* verbatim text */ | 
| 136 | ||
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 137 | private def verbatim_body: Parser[String] = | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 138 | rep(many1(sym => sym != "*") | """\*(?!\})""".r) ^^ (_.mkString) | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 139 | |
| 55494 | 140 | def verbatim: Parser[String] = | 
| 34143 | 141 |     {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 142 |       "{*" ~ verbatim_body ~ "*}" ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 143 |     }.named("verbatim")
 | 
| 34140 | 144 | |
| 145 | def verbatim_content(source: String): String = | |
| 146 |     {
 | |
| 147 | require(parseAll(verbatim, source).successful) | |
| 148 | source.substring(2, source.length - 2) | |
| 149 | } | |
| 150 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 151 | def verbatim_line(ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 152 |     {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 153 |       ctxt match {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 154 | case Finished => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 155 |           "{*" ~ verbatim_body ~ opt_term("*}") ^^
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 156 |             { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 157 | case x ~ y ~ None => (x + y, Verbatim) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 158 | case Verbatim => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 159 |           verbatim_body ~ opt_term("*}") ^^
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 160 |             { case x ~ Some(y) => (x + y, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 161 | case x ~ None => (x, Verbatim) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 162 |         case _ => failure("")
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 163 | } | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 164 |     }.named("verbatim_line")
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 165 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 166 | val recover_verbatim: Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 167 |       "{*" ~ verbatim_body ^^ { case x ~ y => x + y }
 | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
48409diff
changeset | 168 | |
| 34140 | 169 | |
| 55033 | 170 | /* nested text cartouches */ | 
| 171 | ||
| 172 | private def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] | |
| 173 |     {
 | |
| 174 | require(depth >= 0) | |
| 175 | ||
| 176 | def apply(in: Input) = | |
| 177 |       {
 | |
| 178 | val start = in.offset | |
| 179 | val end = in.source.length | |
| 180 | val matcher = new Symbol.Matcher(in.source) | |
| 181 | ||
| 182 | var i = start | |
| 183 | var d = depth | |
| 184 | var finished = false | |
| 55034 | 185 |         while (!finished && i < end) {
 | 
| 186 | val n = matcher(i, end) | |
| 187 | val sym = in.source.subSequence(i, i + n).toString | |
| 188 |           if (Symbol.is_open(sym)) { i += n; d += 1 }
 | |
| 58505 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 wenzelm parents: 
56827diff
changeset | 189 |           else if (Symbol.is_close(sym) && d > 0) { i += n; d -= 1; if (d == 0) finished = true }
 | 
| 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 wenzelm parents: 
56827diff
changeset | 190 | else if (d > 0) i += n | 
| 55033 | 191 | else finished = true | 
| 192 | } | |
| 193 |         if (i == start) Failure("bad input", in)
 | |
| 194 | else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start)) | |
| 195 | } | |
| 196 |     }.named("cartouche_depth")
 | |
| 197 | ||
| 198 | def cartouche: Parser[String] = | |
| 199 |       cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
 | |
| 200 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 201 | def cartouche_line(ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 55033 | 202 |     {
 | 
| 203 | val depth = | |
| 204 |         ctxt match {
 | |
| 205 | case Finished => 0 | |
| 206 | case Cartouche(d) => d | |
| 207 | case _ => -1 | |
| 208 | } | |
| 209 | if (depth >= 0) | |
| 210 | cartouche_depth(depth) ^^ | |
| 211 |           { case (x, 0) => (x, Finished)
 | |
| 212 | case (x, d) => (x, Cartouche(d)) } | |
| 213 |       else failure("")
 | |
| 214 | } | |
| 215 | ||
| 216 | val recover_cartouche: Parser[String] = | |
| 217 | cartouche_depth(0) ^^ (_._1) | |
| 218 | ||
| 219 | def cartouche_content(source: String): String = | |
| 220 |     {
 | |
| 221 |       def err(): Nothing = error("Malformed text cartouche: " + quote(source))
 | |
| 222 | val source1 = | |
| 223 | Library.try_unprefix(Symbol.open_decoded, source) orElse | |
| 224 | Library.try_unprefix(Symbol.open, source) getOrElse err() | |
| 225 | Library.try_unsuffix(Symbol.close_decoded, source1) orElse | |
| 226 | Library.try_unsuffix(Symbol.close, source1) getOrElse err() | |
| 227 | } | |
| 228 | ||
| 229 | ||
| 34143 | 230 | /* nested comments */ | 
| 231 | ||
| 43412 | 232 | private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] | 
| 34143 | 233 |     {
 | 
| 43412 | 234 | require(depth >= 0) | 
| 235 | ||
| 34143 | 236 | val comment_text = | 
| 42441 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 wenzelm parents: 
40523diff
changeset | 237 |         rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
 | 
| 34143 | 238 | |
| 239 | def apply(in: Input) = | |
| 240 |       {
 | |
| 241 | var rest = in | |
| 42441 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 wenzelm parents: 
40523diff
changeset | 242 | def try_parse[A](p: Parser[A]): Boolean = | 
| 34143 | 243 |         {
 | 
| 56663 
2d09b437c168
avoid "Adaptation of argument list by inserting ()" -- deprecated in scala-2.11.0;
 wenzelm parents: 
55980diff
changeset | 244 |           parse(p ^^^ (()), rest) match {
 | 
| 34143 | 245 |             case Success(_, next) => { rest = next; true }
 | 
| 246 | case _ => false | |
| 247 | } | |
| 248 | } | |
| 43412 | 249 | var d = depth | 
| 34143 | 250 | var finished = false | 
| 251 |         while (!finished) {
 | |
| 43412 | 252 |           if (try_parse("(*")) d += 1
 | 
| 58505 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 wenzelm parents: 
56827diff
changeset | 253 |           else if (d > 0 && try_parse("*)")) { d -= 1; if (d == 0) finished = true }
 | 
| 43412 | 254 | else if (d == 0 || !try_parse(comment_text)) finished = true | 
| 34143 | 255 | } | 
| 43412 | 256 | if (in.offset < rest.offset) | 
| 257 | Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest) | |
| 34143 | 258 |         else Failure("comment expected", in)
 | 
| 259 | } | |
| 43412 | 260 |     }.named("comment_depth")
 | 
| 261 | ||
| 262 | def comment: Parser[String] = | |
| 263 |       comment_depth(0) ^? { case (x, d) if d == 0 => x }
 | |
| 264 | ||
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 265 | def comment_line(ctxt: Line_Context): Parser[(String, Line_Context)] = | 
| 43412 | 266 |     {
 | 
| 267 | val depth = | |
| 268 |         ctxt match {
 | |
| 269 | case Finished => 0 | |
| 270 | case Comment(d) => d | |
| 271 | case _ => -1 | |
| 272 | } | |
| 273 | if (depth >= 0) | |
| 274 | comment_depth(depth) ^^ | |
| 275 |           { case (x, 0) => (x, Finished)
 | |
| 276 | case (x, d) => (x, Comment(d)) } | |
| 277 |       else failure("")
 | |
| 278 | } | |
| 34143 | 279 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 280 | val recover_comment: Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 281 | comment_depth(0) ^^ (_._1) | 
| 34143 | 282 | |
| 55033 | 283 | def comment_content(source: String): String = | 
| 284 |     {
 | |
| 285 | require(parseAll(comment, source).successful) | |
| 286 | source.substring(2, source.length - 2) | |
| 287 | } | |
| 288 | ||
| 34143 | 289 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 290 | /* keyword */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 291 | |
| 55497 | 292 | def literal(lexicon: Lexicon): Parser[String] = new Parser[String] | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 293 |     {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 294 | def apply(in: Input) = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 295 |       {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 296 | val result = lexicon.scan(in) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 297 |         if (result.isEmpty) Failure("keyword expected", in)
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 298 | else Success(result, in.drop(result.length)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 299 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 300 |     }.named("keyword")
 | 
| 31649 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 301 | } | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 302 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 303 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 304 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 305 | /** Lexicon -- position tree **/ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 306 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 307 | object Lexicon | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 308 |   {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 309 | /* representation */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 310 | |
| 60215 | 311 | private sealed case class Tree(branches: Map[Char, (String, Tree)]) | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 312 | private val empty_tree = Tree(Map()) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 313 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 314 | val empty: Lexicon = new Lexicon(empty_tree) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 315 | def apply(elems: String*): Lexicon = empty ++ elems | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 316 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 317 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 318 | final class Lexicon private(rep: Lexicon.Tree) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 319 |   {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 320 | /* auxiliary operations */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 321 | |
| 59073 | 322 | private def dest(tree: Lexicon.Tree, result: List[String]): List[String] = | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 323 | (result /: tree.branches.toList) ((res, entry) => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 324 |         entry match { case (_, (s, tr)) =>
 | 
| 59073 | 325 | if (s.isEmpty) dest(tr, res) else dest(tr, s :: res) }) | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 326 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 327 | private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 328 |     {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 329 | val len = str.length | 
| 55980 | 330 | @tailrec def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] = | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 331 |       {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 332 |         if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 333 |           tree.branches.get(str.charAt(i)) match {
 | 
| 59319 | 334 | case Some((s, tr)) => look(tr, s.nonEmpty, i + 1) | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 335 | case None => None | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 336 | } | 
| 55980 | 337 | } | 
| 338 | else Some(tip, tree) | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 339 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 340 | look(rep, false, 0) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 341 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 342 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 343 | def completions(str: CharSequence): List[String] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 344 |       lookup(str) match {
 | 
| 59073 | 345 | case Some((true, tree)) => dest(tree, List(str.toString)) | 
| 346 | case Some((false, tree)) => dest(tree, Nil) | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 347 | case None => Nil | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 348 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 349 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 350 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 351 | /* pseudo Set methods */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 352 | |
| 59073 | 353 | def raw_iterator: Iterator[String] = dest(rep, Nil).iterator | 
| 354 | def iterator: Iterator[String] = dest(rep, Nil).sorted.iterator | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 355 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 356 |     override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 357 | |
| 59073 | 358 | def is_empty: Boolean = rep.branches.isEmpty | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 359 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 360 | def contains(elem: String): Boolean = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 361 |       lookup(elem) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 362 | case Some((tip, _)) => tip | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 363 | case _ => false | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 364 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 365 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 366 | |
| 59073 | 367 | /* build lexicon */ | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 368 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 369 | def + (elem: String): Lexicon = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 370 | if (contains(elem)) this | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 371 |       else {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 372 | val len = elem.length | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 373 | def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 374 |           if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 375 | val c = elem.charAt(i) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 376 | val end = (i + 1 == len) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 377 |             tree.branches.get(c) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 378 | case Some((s, tr)) => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 379 | Lexicon.Tree(tree.branches + | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 380 | (c -> (if (end) elem else s, extend(tr, i + 1)))) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 381 | case None => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 382 | Lexicon.Tree(tree.branches + | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 383 | (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1)))) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 384 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 385 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 386 | else tree | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 387 | new Lexicon(extend(rep, 0)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 388 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 389 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 390 | def ++ (elems: TraversableOnce[String]): Lexicon = (this /: elems)(_ + _) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 391 | |
| 59073 | 392 | def ++ (other: Lexicon): Lexicon = | 
| 393 | if (this eq other) this | |
| 394 | else if (is_empty) other | |
| 395 | else this ++ other.raw_iterator | |
| 396 | ||
| 63579 | 397 | def -- (remove: Traversable[String]): Lexicon = | 
| 398 | if (remove.exists(contains(_))) | |
| 399 | Lexicon.empty ++ iterator.filterNot(a => remove.exists(b => a == b)) | |
| 400 | else this | |
| 401 | ||
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 402 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 403 | /* scan */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 404 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 405 | def scan(in: Reader[Char]): String = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 406 |     {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 407 | val source = in.source | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 408 | val offset = in.offset | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 409 | val len = source.length - offset | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 410 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 411 | def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 412 |       {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 413 |         if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 414 |           tree.branches.get(source.charAt(offset + i)) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 415 | case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 416 | case None => result | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 417 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 418 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 419 | else result | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 420 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 421 | scan_tree(rep, "", 0) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 422 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 423 | } | 
| 56821 | 424 | |
| 425 | ||
| 426 | ||
| 56822 | 427 | /** read stream without decoding: efficient length operation **/ | 
| 56821 | 428 | |
| 429 | private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int) | |
| 430 | extends CharSequence | |
| 431 |   {
 | |
| 432 | def charAt(i: Int): Char = | |
| 433 | if (0 <= i && i < length) seq(start + i) | |
| 434 | else throw new IndexOutOfBoundsException | |
| 435 | ||
| 56822 | 436 | def length: Int = end - start // avoid expensive seq.length | 
| 56821 | 437 | |
| 438 | def subSequence(i: Int, j: Int): CharSequence = | |
| 439 | if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j) | |
| 440 | else throw new IndexOutOfBoundsException | |
| 441 | ||
| 442 | override def toString: String = | |
| 443 |     {
 | |
| 444 | val buf = new StringBuilder(length) | |
| 445 | for (offset <- start until end) buf.append(seq(offset)) | |
| 446 | buf.toString | |
| 447 | } | |
| 448 | } | |
| 449 | ||
| 450 |   abstract class Byte_Reader extends Reader[Char] { def close: Unit }
 | |
| 451 | ||
| 56822 | 452 | private def make_byte_reader(stream: InputStream, stream_length: Int): Byte_Reader = | 
| 56821 | 453 |   {
 | 
| 56822 | 454 | val buffered_stream = new BufferedInputStream(stream) | 
| 56821 | 455 | val seq = new PagedSeq( | 
| 456 | (buf: Array[Char], offset: Int, length: Int) => | |
| 457 |         {
 | |
| 458 | var i = 0 | |
| 459 | var c = 0 | |
| 460 | var eof = false | |
| 461 |           while (!eof && i < length) {
 | |
| 56822 | 462 | c = buffered_stream.read | 
| 56821 | 463 | if (c == -1) eof = true | 
| 464 |             else { buf(offset + i) = c.toChar; i += 1 }
 | |
| 465 | } | |
| 466 | if (i > 0) i else -1 | |
| 467 | }) | |
| 56822 | 468 | val restricted_seq = new Restricted_Seq(seq, 0, stream_length) | 
| 56821 | 469 | |
| 470 | class Paged_Reader(override val offset: Int) extends Byte_Reader | |
| 471 |     {
 | |
| 472 | override lazy val source: CharSequence = restricted_seq | |
| 56827 | 473 | def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\u001a' | 
| 56821 | 474 | def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this | 
| 475 | def pos: InputPosition = new OffsetPosition(source, offset) | |
| 476 | def atEnd: Boolean = !seq.isDefinedAt(offset) | |
| 477 | override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n) | |
| 56822 | 478 |       def close { buffered_stream.close }
 | 
| 56821 | 479 | } | 
| 480 | new Paged_Reader(0) | |
| 481 | } | |
| 56822 | 482 | |
| 483 | def byte_reader(file: JFile): Byte_Reader = | |
| 484 | make_byte_reader(new FileInputStream(file), file.length.toInt) | |
| 485 | ||
| 486 | def byte_reader(url: URL): Byte_Reader = | |
| 487 |   {
 | |
| 488 | val connection = url.openConnection | |
| 489 | val stream = connection.getInputStream | |
| 490 | val stream_length = connection.getContentLength | |
| 491 | make_byte_reader(stream, stream_length) | |
| 492 | } | |
| 64824 | 493 | |
| 494 | ||
| 495 | /* plain text reader */ | |
| 496 | ||
| 497 | def char_reader(text: CharSequence): CharSequenceReader = | |
| 498 | new CharSequenceReader(text) | |
| 31648 | 499 | } |