| author | wenzelm | 
| Tue, 23 Jan 2024 19:56:52 +0100 | |
| changeset 79521 | db2b5c04075d | 
| parent 79510 | d8330439823a | 
| child 80441 | c420429fdf4c | 
| permissions | -rw-r--r-- | 
| 31648 | 1 | /* Title: Pure/General/scan.scala | 
| 2 | Author: Makarius | |
| 3 | ||
| 40523 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 wenzelm parents: 
40290diff
changeset | 4 | Efficient scanning of keywords and tokens. | 
| 31648 | 5 | */ | 
| 6 | ||
| 7 | package isabelle | |
| 8 | ||
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 9 | |
| 55980 | 10 | import scala.annotation.tailrec | 
| 73357 | 11 | import scala.collection.IndexedSeq | 
| 71601 | 12 | import scala.util.matching.Regex | 
| 64824 | 13 | import scala.util.parsing.input.{OffsetPosition, Position => InputPosition,
 | 
| 73136 | 14 | Reader, CharSequenceReader, PagedSeq} | 
| 31648 | 15 | import scala.util.parsing.combinator.RegexParsers | 
| 56822 | 16 | import java.io.{File => JFile, BufferedInputStream, FileInputStream, InputStream}
 | 
| 34187 
7b659c1561f1
added byte_reader, which works without decoding and enables efficient length operation (for scala.util.parsing.input.Reader);
 wenzelm parents: 
34157diff
changeset | 17 | |
| 31648 | 18 | |
| 75393 | 19 | object Scan {
 | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 20 | /** context of partial line-oriented scans **/ | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 21 | |
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 22 | abstract class Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 23 | case object Finished extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 24 | case class Quoted(quote: String) extends Line_Context | 
| 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 25 | case class Cartouche(depth: Int) extends Line_Context | 
| 67438 | 26 | case class Comment_Prefix(symbol: Symbol.Symbol) extends Line_Context | 
| 67365 
fb539f83683a
support for formal comments in ML in Isabelle/Scala;
 wenzelm parents: 
67364diff
changeset | 27 | case class Cartouche_Comment(depth: Int) extends Line_Context | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 28 | case class Comment(depth: Int) extends Line_Context | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 29 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 30 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 31 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 32 | /** parser combinators **/ | 
| 31650 | 33 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 34 | object Parsers extends Parsers | 
| 36011 
3ff725ac13a4
adapted to Scala 2.8.0 Beta1 -- with notable changes to scala.collection;
 wenzelm parents: 
34316diff
changeset | 35 | |
| 75393 | 36 |   trait Parsers extends RegexParsers {
 | 
| 71601 | 37 | override val whiteSpace: Regex = "".r | 
| 31649 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 38 | |
| 34137 | 39 | |
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 40 | /* optional termination */ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 41 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 42 | def opt_term[T](p: => Parser[T]): Parser[Option[T]] = | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 43 | p ^^ (x => Some(x)) | """\z""".r ^^ (_ => None) | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 44 | |
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 45 | |
| 55497 | 46 | /* repeated symbols */ | 
| 34137 | 47 | |
| 55497 | 48 | def repeated(pred: Symbol.Symbol => Boolean, min_count: Int, max_count: Int): Parser[String] = | 
| 75393 | 49 |       new Parser[String] {
 | 
| 50 |         def apply(in: Input) = {
 | |
| 34137 | 51 | val start = in.offset | 
| 52 | val end = in.source.length | |
| 53 | val matcher = new Symbol.Matcher(in.source) | |
| 54 | ||
| 55 | var i = start | |
| 56 | var count = 0 | |
| 57 | var finished = false | |
| 55034 | 58 |           while (!finished && i < end && count < max_count) {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
74887diff
changeset | 59 | val sym = matcher.match_symbol(i) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
74887diff
changeset | 60 |             if (pred(sym)) { i += sym.length; count += 1 }
 | 
| 34137 | 61 | else finished = true | 
| 62 | } | |
| 34140 | 63 |           if (count < min_count) Failure("bad input", in)
 | 
| 64 | else Success(in.source.subSequence(start, i).toString, in.drop(i - start)) | |
| 34137 | 65 | } | 
| 55497 | 66 |       }.named("repeated")
 | 
| 34137 | 67 | |
| 43696 | 68 | def one(pred: Symbol.Symbol => Boolean): Parser[String] = | 
| 55497 | 69 | repeated(pred, 1, 1) | 
| 43696 | 70 | |
| 66922 | 71 | def maybe(pred: Symbol.Symbol => Boolean): Parser[String] = | 
| 72 | repeated(pred, 0, 1) | |
| 73 | ||
| 43696 | 74 | def many(pred: Symbol.Symbol => Boolean): Parser[String] = | 
| 78243 | 75 | repeated(pred, 0, Int.MaxValue) | 
| 43696 | 76 | |
| 77 | def many1(pred: Symbol.Symbol => Boolean): Parser[String] = | |
| 78243 | 78 | repeated(pred, 1, Int.MaxValue) | 
| 55497 | 79 | |
| 80 | ||
| 81 | /* character */ | |
| 82 | ||
| 83 | def character(pred: Char => Boolean): Symbol.Symbol => Boolean = | |
| 84 | (s: Symbol. Symbol) => s.length == 1 && pred(s.charAt(0)) | |
| 34140 | 85 | |
| 86 | ||
| 34143 | 87 | /* quoted strings */ | 
| 34140 | 88 | |
| 75393 | 89 |     private def quoted_body(quote: Symbol.Symbol): Parser[String] = {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 90 | rep(many1(sym => sym != quote && sym != "\\") | "\\" + quote | "\\\\" | | 
| 60215 | 91 |         ("""\\\d\d\d""".r ^? { case x if x.substring(1, 4).toInt <= 255 => x })) ^^ (_.mkString)
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 92 | } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 93 | |
| 75393 | 94 |     def quoted(quote: Symbol.Symbol): Parser[String] = {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 95 |       quote ~ quoted_body(quote) ~ quote ^^ { case x ~ y ~ z => x + y + z }
 | 
| 34143 | 96 |     }.named("quoted")
 | 
| 34140 | 97 | |
| 75393 | 98 |     def quoted_content(quote: Symbol.Symbol, source: String): String = {
 | 
| 73120 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 wenzelm parents: 
71601diff
changeset | 99 | require(parseAll(quoted(quote), source).successful, "no quoted text") | 
| 34189 | 100 | val body = source.substring(1, source.length - 1) | 
| 101 |       if (body.exists(_ == '\\')) {
 | |
| 102 | val content = | |
| 40523 
1050315f6ee2
simplified/robustified treatment of malformed symbols, which are now fully internalized (total Symbol.explode etc.);
 wenzelm parents: 
40290diff
changeset | 103 | rep(many1(sym => sym != quote && sym != "\\") | | 
| 34189 | 104 |               "\\" ~> (quote | "\\" | """\d\d\d""".r ^^ { case x => x.toInt.toChar.toString }))
 | 
| 105 | parseAll(content ^^ (_.mkString), body).get | |
| 106 | } | |
| 107 | else body | |
| 34140 | 108 | } | 
| 109 | ||
| 75393 | 110 |     def quoted_line(quote: Symbol.Symbol, ctxt: Line_Context): Parser[(String, Line_Context)] = {
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 111 |       ctxt match {
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 112 | case Finished => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 113 | quote ~ quoted_body(quote) ~ opt_term(quote) ^^ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 114 |             { case x ~ y ~ Some(z) => (x + y + z, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 115 | case x ~ y ~ None => (x + y, Quoted(quote)) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 116 | case Quoted(q) if q == quote => | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 117 | quoted_body(quote) ~ opt_term(quote) ^^ | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 118 |             { case x ~ Some(y) => (x + y, Finished)
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 119 | case x ~ None => (x, ctxt) } | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 120 |         case _ => failure("")
 | 
| 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 121 | } | 
| 55510 
1585a65aad64
tuned signature -- emphasize line-oriented aspect;
 wenzelm parents: 
55499diff
changeset | 122 |     }.named("quoted_line")
 | 
| 43411 
0206466ee473
some support for partial scans with explicit context;
 wenzelm parents: 
42718diff
changeset | 123 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 124 | def recover_quoted(quote: Symbol.Symbol): Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 125 |       quote ~ quoted_body(quote) ^^ { case x ~ y => x + y }
 | 
| 48743 
a72f8ffecf31
refined recovery of scan errors: longest prefix of delimited token after failure, otherwise just one symbol;
 wenzelm parents: 
48409diff
changeset | 126 | |
| 34140 | 127 | |
| 55033 | 128 | /* nested text cartouches */ | 
| 129 | ||
| 75393 | 130 |     def cartouche_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] {
 | 
| 73120 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 wenzelm parents: 
71601diff
changeset | 131 | require(depth >= 0, "bad cartouche depth") | 
| 55033 | 132 | |
| 75393 | 133 |       def apply(in: Input) = {
 | 
| 55033 | 134 | val start = in.offset | 
| 135 | val end = in.source.length | |
| 136 | val matcher = new Symbol.Matcher(in.source) | |
| 137 | ||
| 138 | var i = start | |
| 139 | var d = depth | |
| 140 | var finished = false | |
| 55034 | 141 |         while (!finished && i < end) {
 | 
| 75237 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
74887diff
changeset | 142 | val sym = matcher.match_symbol(i) | 
| 
90eaac98b3fa
more elementary Symbol.Matcher without detour via Regex (see also Pure/General/symbol_explode.ML);
 wenzelm parents: 
74887diff
changeset | 143 | val n = sym.length | 
| 55034 | 144 |           if (Symbol.is_open(sym)) { i += n; d += 1 }
 | 
| 58505 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 wenzelm parents: 
56827diff
changeset | 145 |           else if (Symbol.is_close(sym) && d > 0) { i += n; d -= 1; if (d == 0) finished = true }
 | 
| 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 wenzelm parents: 
56827diff
changeset | 146 | else if (d > 0) i += n | 
| 55033 | 147 | else finished = true | 
| 148 | } | |
| 149 |         if (i == start) Failure("bad input", in)
 | |
| 150 | else Success((in.source.subSequence(start, i).toString, d), in.drop(i - start)) | |
| 151 | } | |
| 152 |     }.named("cartouche_depth")
 | |
| 153 | ||
| 154 | def cartouche: Parser[String] = | |
| 155 |       cartouche_depth(0) ^? { case (x, d) if d == 0 => x }
 | |
| 156 | ||
| 75393 | 157 |     def cartouche_line(ctxt: Line_Context): Parser[(String, Line_Context)] = {
 | 
| 67364 | 158 | def cartouche_context(d: Int): Line_Context = | 
| 159 | if (d == 0) Finished else Cartouche(d) | |
| 160 | ||
| 161 |       ctxt match {
 | |
| 162 | case Finished => | |
| 163 |           cartouche_depth(0) ^^ { case (c, d) => (c, cartouche_context(d)) }
 | |
| 164 | case Cartouche(depth) => | |
| 165 |           cartouche_depth(depth) ^^ { case (c, d) => (c, cartouche_context(d)) }
 | |
| 166 |         case _ => failure("")
 | |
| 167 | } | |
| 55033 | 168 | } | 
| 169 | ||
| 170 | val recover_cartouche: Parser[String] = | |
| 171 | cartouche_depth(0) ^^ (_._1) | |
| 172 | ||
| 75393 | 173 |     def cartouche_content(source: String): String = {
 | 
| 55033 | 174 |       def err(): Nothing = error("Malformed text cartouche: " + quote(source))
 | 
| 175 | val source1 = | |
| 176 | Library.try_unprefix(Symbol.open_decoded, source) orElse | |
| 177 | Library.try_unprefix(Symbol.open, source) getOrElse err() | |
| 178 | Library.try_unsuffix(Symbol.close_decoded, source1) orElse | |
| 179 | Library.try_unsuffix(Symbol.close, source1) getOrElse err() | |
| 180 | } | |
| 181 | ||
| 182 | ||
| 34143 | 183 | /* nested comments */ | 
| 184 | ||
| 75393 | 185 |     private def comment_depth(depth: Int): Parser[(String, Int)] = new Parser[(String, Int)] {
 | 
| 73120 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 wenzelm parents: 
71601diff
changeset | 186 | require(depth >= 0, "bad comment depth") | 
| 43412 | 187 | |
| 71601 | 188 | val comment_text: Parser[List[String]] = | 
| 42441 
781c622af16a
more robust scanning of iterated comments, such as "(* (**) (**) *)";
 wenzelm parents: 
40523diff
changeset | 189 |         rep1(many1(sym => sym != "*" && sym != "(") | """\*(?!\))|\((?!\*)""".r)
 | 
| 34143 | 190 | |
| 75393 | 191 |       def apply(in: Input) = {
 | 
| 34143 | 192 | var rest = in | 
| 75393 | 193 |         def try_parse[A](p: Parser[A]): Boolean = {
 | 
| 56663 
2d09b437c168
avoid "Adaptation of argument list by inserting ()" -- deprecated in scala-2.11.0;
 wenzelm parents: 
55980diff
changeset | 194 |           parse(p ^^^ (()), rest) match {
 | 
| 34143 | 195 |             case Success(_, next) => { rest = next; true }
 | 
| 196 | case _ => false | |
| 197 | } | |
| 198 | } | |
| 43412 | 199 | var d = depth | 
| 34143 | 200 | var finished = false | 
| 201 |         while (!finished) {
 | |
| 43412 | 202 |           if (try_parse("(*")) d += 1
 | 
| 58505 
d1fe47fe5401
actually finish after closing, e.g. relevant for consecutive (**)(**);
 wenzelm parents: 
56827diff
changeset | 203 |           else if (d > 0 && try_parse("*)")) { d -= 1; if (d == 0) finished = true }
 | 
| 43412 | 204 | else if (d == 0 || !try_parse(comment_text)) finished = true | 
| 34143 | 205 | } | 
| 43412 | 206 | if (in.offset < rest.offset) | 
| 207 | Success((in.source.subSequence(in.offset, rest.offset).toString, d), rest) | |
| 34143 | 208 |         else Failure("comment expected", in)
 | 
| 209 | } | |
| 43412 | 210 |     }.named("comment_depth")
 | 
| 211 | ||
| 212 | def comment: Parser[String] = | |
| 213 |       comment_depth(0) ^? { case (x, d) if d == 0 => x }
 | |
| 214 | ||
| 75393 | 215 |     def comment_line(ctxt: Line_Context): Parser[(String, Line_Context)] = {
 | 
| 43412 | 216 | val depth = | 
| 217 |         ctxt match {
 | |
| 218 | case Finished => 0 | |
| 219 | case Comment(d) => d | |
| 220 | case _ => -1 | |
| 221 | } | |
| 222 | if (depth >= 0) | |
| 223 | comment_depth(depth) ^^ | |
| 224 |           { case (x, 0) => (x, Finished)
 | |
| 225 | case (x, d) => (x, Comment(d)) } | |
| 226 |       else failure("")
 | |
| 227 | } | |
| 34143 | 228 | |
| 48763 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 229 | val recover_comment: Parser[String] = | 
| 
de68fc11c245
more precise recover_quoted, recover_verbatim, recover_comment (cf. ML version) -- NB: context parsers expect explicit termination;
 wenzelm parents: 
48754diff
changeset | 230 | comment_depth(0) ^^ (_._1) | 
| 34143 | 231 | |
| 75393 | 232 |     def comment_content(source: String): String = {
 | 
| 73120 
c3589f2dff31
more informative errors: simplify diagnosis of spurious failures reported by users;
 wenzelm parents: 
71601diff
changeset | 233 | require(parseAll(comment, source).successful, "no comment") | 
| 55033 | 234 | source.substring(2, source.length - 2) | 
| 235 | } | |
| 236 | ||
| 34143 | 237 | |
| 74373 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 238 | /* control cartouches */ | 
| 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 239 | |
| 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 240 | val control_symbol: Parser[String] = one(Symbol.is_control) | 
| 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 241 | |
| 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 242 |     val control_cartouche: Parser[String] = control_symbol ~ cartouche ^^ { case a ~ b => a + b }
 | 
| 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 243 | |
| 
6e4093927dbb
outer syntax: support for control-cartouche tokens;
 wenzelm parents: 
73367diff
changeset | 244 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 245 | /* keyword */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 246 | |
| 75393 | 247 |     def literal(lexicon: Lexicon): Parser[String] = new Parser[String] {
 | 
| 248 |       def apply(in: Input) = {
 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 249 | val result = lexicon.scan(in) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 250 |         if (result.isEmpty) Failure("keyword expected", in)
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 251 | else Success(result, in.drop(result.length)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 252 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 253 |     }.named("keyword")
 | 
| 31649 
a11ea667d676
reorganized and abstracted version, via Set trait;
 wenzelm parents: 
31648diff
changeset | 254 | } | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 255 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 256 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 257 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 258 | /** Lexicon -- position tree **/ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 259 | |
| 75393 | 260 |   object Lexicon {
 | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 261 | /* representation */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 262 | |
| 60215 | 263 | private sealed case class Tree(branches: Map[Char, (String, Tree)]) | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 264 | private val empty_tree = Tree(Map()) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 265 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 266 | val empty: Lexicon = new Lexicon(empty_tree) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 267 | def apply(elems: String*): Lexicon = empty ++ elems | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 268 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 269 | |
| 75393 | 270 |   final class Lexicon private(rep: Lexicon.Tree) {
 | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 271 | /* auxiliary operations */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 272 | |
| 59073 | 273 | private def dest(tree: Lexicon.Tree, result: List[String]): List[String] = | 
| 73359 | 274 |       tree.branches.toList.foldLeft(result) {
 | 
| 275 | case (res, (_, (s, tr))) => if (s.isEmpty) dest(tr, res) else dest(tr, s :: res) | |
| 276 | } | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 277 | |
| 75393 | 278 |     private def lookup(str: CharSequence): Option[(Boolean, Lexicon.Tree)] = {
 | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 279 | val len = str.length | 
| 75393 | 280 | @tailrec | 
| 281 |       def look(tree: Lexicon.Tree, tip: Boolean, i: Int): Option[(Boolean, Lexicon.Tree)] = {
 | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 282 |         if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 283 |           tree.branches.get(str.charAt(i)) match {
 | 
| 59319 | 284 | case Some((s, tr)) => look(tr, s.nonEmpty, i + 1) | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 285 | case None => None | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 286 | } | 
| 55980 | 287 | } | 
| 288 | else Some(tip, tree) | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 289 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 290 | look(rep, false, 0) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 291 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 292 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 293 | def completions(str: CharSequence): List[String] = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 294 |       lookup(str) match {
 | 
| 59073 | 295 | case Some((true, tree)) => dest(tree, List(str.toString)) | 
| 296 | case Some((false, tree)) => dest(tree, Nil) | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 297 | case None => Nil | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 298 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 299 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 300 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 301 | /* pseudo Set methods */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 302 | |
| 59073 | 303 | def raw_iterator: Iterator[String] = dest(rep, Nil).iterator | 
| 304 | def iterator: Iterator[String] = dest(rep, Nil).sorted.iterator | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 305 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 306 |     override def toString: String = iterator.mkString("Lexicon(", ", ", ")")
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 307 | |
| 59073 | 308 | def is_empty: Boolean = rep.branches.isEmpty | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 309 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 310 | def contains(elem: String): Boolean = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 311 |       lookup(elem) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 312 | case Some((tip, _)) => tip | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 313 | case _ => false | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 314 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 315 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 316 | |
| 59073 | 317 | /* build lexicon */ | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 318 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 319 | def + (elem: String): Lexicon = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 320 | if (contains(elem)) this | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 321 |       else {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 322 | val len = elem.length | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 323 | def extend(tree: Lexicon.Tree, i: Int): Lexicon.Tree = | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 324 |           if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 325 | val c = elem.charAt(i) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 326 | val end = (i + 1 == len) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 327 |             tree.branches.get(c) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 328 | case Some((s, tr)) => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 329 | Lexicon.Tree(tree.branches + | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 330 | (c -> (if (end) elem else s, extend(tr, i + 1)))) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 331 | case None => | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 332 | Lexicon.Tree(tree.branches + | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 333 | (c -> (if (end) elem else "", extend(Lexicon.empty_tree, i + 1)))) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 334 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 335 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 336 | else tree | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 337 | new Lexicon(extend(rep, 0)) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 338 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 339 | |
| 73362 | 340 | def ++ (elems: IterableOnce[String]): Lexicon = | 
| 341 | elems.iterator.foldLeft(this)(_ + _) | |
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 342 | |
| 59073 | 343 | def ++ (other: Lexicon): Lexicon = | 
| 344 | if (this eq other) this | |
| 345 | else if (is_empty) other | |
| 346 | else this ++ other.raw_iterator | |
| 347 | ||
| 73357 | 348 | def -- (remove: Iterable[String]): Lexicon = | 
| 71601 | 349 | if (remove.exists(contains)) | 
| 63579 | 350 | Lexicon.empty ++ iterator.filterNot(a => remove.exists(b => a == b)) | 
| 351 | else this | |
| 352 | ||
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 353 | |
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 354 | /* scan */ | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 355 | |
| 75393 | 356 |     def scan(in: Reader[Char]): String = {
 | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 357 | val source = in.source | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 358 | val offset = in.offset | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 359 | val len = source.length - offset | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 360 | |
| 75393 | 361 |       @tailrec def scan_tree(tree: Lexicon.Tree, result: String, i: Int): String = {
 | 
| 55492 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 362 |         if (i < len) {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 363 |           tree.branches.get(source.charAt(offset + i)) match {
 | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 364 | case Some((s, tr)) => scan_tree(tr, if (s.isEmpty) result else s, i + 1) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 365 | case None => result | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 366 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 367 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 368 | else result | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 369 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 370 | scan_tree(rep, "", 0) | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 371 | } | 
| 
28d4db6c6e79
tuned signature -- separate Lexicon from Parsers (in accordance to ML version);
 wenzelm parents: 
55034diff
changeset | 372 | } | 
| 56821 | 373 | |
| 374 | ||
| 375 | ||
| 56822 | 376 | /** read stream without decoding: efficient length operation **/ | 
| 56821 | 377 | |
| 75393 | 378 |   private class Restricted_Seq(seq: IndexedSeq[Char], start: Int, end: Int) extends CharSequence {
 | 
| 56821 | 379 | def charAt(i: Int): Char = | 
| 380 | if (0 <= i && i < length) seq(start + i) | |
| 381 | else throw new IndexOutOfBoundsException | |
| 382 | ||
| 56822 | 383 | def length: Int = end - start // avoid expensive seq.length | 
| 56821 | 384 | |
| 385 | def subSequence(i: Int, j: Int): CharSequence = | |
| 386 | if (0 <= i && i <= j && j <= length) new Restricted_Seq(seq, start + i, start + j) | |
| 387 | else throw new IndexOutOfBoundsException | |
| 388 | ||
| 75393 | 389 |     override def toString: String = {
 | 
| 56821 | 390 | val buf = new StringBuilder(length) | 
| 391 | for (offset <- start until end) buf.append(seq(offset)) | |
| 392 | buf.toString | |
| 393 | } | |
| 394 | } | |
| 395 | ||
| 69393 
ed0824ef337e
static type for Library.using: avoid Java 11 warnings on "Illegal reflective access";
 wenzelm parents: 
67438diff
changeset | 396 | abstract class Byte_Reader extends Reader[Char] with AutoCloseable | 
| 56821 | 397 | |
| 75393 | 398 |   private def make_byte_reader(stream: InputStream, stream_length: Int): Byte_Reader = {
 | 
| 56822 | 399 | val buffered_stream = new BufferedInputStream(stream) | 
| 75394 | 400 | val seq = | 
| 401 |       new PagedSeq({ (buf: Array[Char], offset: Int, length: Int) =>
 | |
| 402 | var i = 0 | |
| 403 | var c = 0 | |
| 404 | var eof = false | |
| 405 |         while (!eof && i < length) {
 | |
| 406 | c = buffered_stream.read | |
| 407 | if (c == -1) eof = true | |
| 408 |           else { buf(offset + i) = c.toChar; i += 1 }
 | |
| 409 | } | |
| 410 | if (i > 0) i else -1 | |
| 411 | }) | |
| 56822 | 412 | val restricted_seq = new Restricted_Seq(seq, 0, stream_length) | 
| 56821 | 413 | |
| 75393 | 414 |     class Paged_Reader(override val offset: Int) extends Byte_Reader {
 | 
| 56821 | 415 | override lazy val source: CharSequence = restricted_seq | 
| 56827 | 416 | def first: Char = if (seq.isDefinedAt(offset)) seq(offset) else '\u001a' | 
| 56821 | 417 | def rest: Paged_Reader = if (seq.isDefinedAt(offset)) new Paged_Reader(offset + 1) else this | 
| 418 | def pos: InputPosition = new OffsetPosition(source, offset) | |
| 419 | def atEnd: Boolean = !seq.isDefinedAt(offset) | |
| 420 | override def drop(n: Int): Paged_Reader = new Paged_Reader(offset + n) | |
| 73367 | 421 | def close(): Unit = buffered_stream.close() | 
| 56821 | 422 | } | 
| 423 | new Paged_Reader(0) | |
| 424 | } | |
| 56822 | 425 | |
| 426 | def byte_reader(file: JFile): Byte_Reader = | |
| 427 | make_byte_reader(new FileInputStream(file), file.length.toInt) | |
| 428 | ||
| 79510 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
78243diff
changeset | 429 |   def byte_reader(url: Url): Byte_Reader = {
 | 
| 
d8330439823a
clarified signature: explicit type isabelle.Url to avoid oddities of java.net.URL (e.g. its "equals" method);
 wenzelm parents: 
78243diff
changeset | 430 | val connection = url.open_connection() | 
| 56822 | 431 | val stream = connection.getInputStream | 
| 432 | val stream_length = connection.getContentLength | |
| 433 | make_byte_reader(stream, stream_length) | |
| 434 | } | |
| 64824 | 435 | |
| 66918 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 436 | def reader_is_utf8(reader: Reader[Char]): Boolean = | 
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 437 | reader.isInstanceOf[Byte_Reader] | 
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 438 | |
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 439 | def reader_decode_utf8(is_utf8: Boolean, s: String): String = | 
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 440 | if (is_utf8) UTF8.decode_permissive(s) else s | 
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 441 | |
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 442 | def reader_decode_utf8(reader: Reader[Char], s: String): String = | 
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 443 | reader_decode_utf8(reader_is_utf8(reader), s) | 
| 
ec2b50aeb0dd
more robust treatment of UTF8 in raw byte sources;
 wenzelm parents: 
64824diff
changeset | 444 | |
| 64824 | 445 | |
| 446 | /* plain text reader */ | |
| 447 | ||
| 448 | def char_reader(text: CharSequence): CharSequenceReader = | |
| 449 | new CharSequenceReader(text) | |
| 31648 | 450 | } |