src/Tools/jEdit/src/spell_checker.scala
author wenzelm
Sun, 13 Apr 2014 15:34:54 +0200
changeset 56557 18d921496aa5
parent 56552 76cf86240cb7
child 56558 05c833d402bc
permissions -rw-r--r--
updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell; simplified dictionary file format; tuned signature;
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     1
/*  Title:      Tools/jEdit/src/spell_checker.scala
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     2
    Author:     Makarius
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     3
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
     4
Spell checker based on JOrtho (see http://sourceforge.net/projects/jortho).
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     5
*/
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     6
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     7
package isabelle.jedit
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     8
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
     9
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    10
import isabelle._
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    11
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    12
import java.lang.Class
56549
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    13
import java.util.Locale
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    14
import java.text.BreakIterator
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    15
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    16
import scala.collection.mutable
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    17
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    18
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    19
object Spell_Checker
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    20
{
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    21
  class Dictionary private [Spell_Checker](path: Path)
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    22
  {
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    23
    val lang = path.split_ext._1.base.implode
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    24
    override def toString: String = lang
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    25
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    26
    val locale: Locale =
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    27
      space_explode('_', lang) match {
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    28
        case a :: _ => Locale.forLanguageTag(a)
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    29
        case Nil => Locale.ENGLISH
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    30
      }
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    31
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    32
    def load_words: List[String] =
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    33
      path.split_ext._2 match {
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    34
        case "gz" => split_lines(File.read_gzip(path))
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    35
        case "" => split_lines(File.read(path))
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    36
        case ext => error("Bad file extension for dictionary " + path)
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    37
      }
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    38
  }
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    39
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    40
  def dictionaries: List[Dictionary] =
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    41
    for {
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    42
      path <- Path.split(Isabelle_System.getenv("JORTHO_DICTIONARIES"))
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    43
      if path.is_file
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    44
    } yield new Dictionary(path)
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    45
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    46
  def apply(dict: Dictionary): Spell_Checker = new Spell_Checker(dict)
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    47
}
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    48
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    49
class Spell_Checker private(dict: Spell_Checker.Dictionary)
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    50
{
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    51
  override def toString: String = dict.toString
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    52
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    53
  private val dictionary =
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    54
  {
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    55
    val factory_class = Class.forName("com.inet.jortho.DictionaryFactory")
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    56
    val factory_cons = factory_class.getConstructor()
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    57
    factory_cons.setAccessible(true)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    58
    val factory = factory_cons.newInstance()
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    59
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    60
    val add = factory_class.getDeclaredMethod("add", classOf[String])
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    61
    add.setAccessible(true)
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    62
    dict.load_words.foreach(add.invoke(factory, _))
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    63
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    64
    val create = factory_class.getDeclaredMethod("create")
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    65
    create.setAccessible(true)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    66
    create.invoke(factory)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    67
  }
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    68
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    69
  def add(word: String)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    70
  {
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    71
    val m = dictionary.getClass.getDeclaredMethod("add", classOf[String])
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    72
    m.setAccessible(true)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    73
    m.invoke(dictionary, word)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    74
  }
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    75
56552
76cf86240cb7 some case-mangling;
wenzelm
parents: 56549
diff changeset
    76
  def contains(word: String): Boolean =
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    77
  {
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    78
    val m = dictionary.getClass.getSuperclass.getDeclaredMethod("exist", classOf[String])
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    79
    m.setAccessible(true)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    80
    m.invoke(dictionary, word).asInstanceOf[java.lang.Boolean].booleanValue
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    81
  }
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    82
56552
76cf86240cb7 some case-mangling;
wenzelm
parents: 56549
diff changeset
    83
  def check(word: String): Boolean =
76cf86240cb7 some case-mangling;
wenzelm
parents: 56549
diff changeset
    84
    contains(word) ||
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    85
    Library.is_all_caps(word) && contains(Library.lowercase(word, dict.locale)) ||
56552
76cf86240cb7 some case-mangling;
wenzelm
parents: 56549
diff changeset
    86
    Library.is_capitalized(word) &&
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    87
      (contains(Library.lowercase(word, dict.locale)) ||
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
    88
       contains(Library.uppercase(word, dict.locale)))
56552
76cf86240cb7 some case-mangling;
wenzelm
parents: 56549
diff changeset
    89
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    90
  def complete(word: String): List[String] =
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    91
  {
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    92
    val m = dictionary.getClass.getSuperclass.
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    93
      getDeclaredMethod("searchSuggestions", classOf[String])
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    94
    m.setAccessible(true)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    95
    m.invoke(dictionary, word).asInstanceOf[java.util.List[AnyRef]].toArray.toList.map(_.toString)
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
    96
  }
56549
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    97
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    98
  def bad_words(text: String): List[Text.Range] =
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
    99
  {
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   100
    val result = new mutable.ListBuffer[Text.Range]
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   101
56557
18d921496aa5 updated to jortho-1.0-1: dictionaries from SCOWL 7.1, with parameters like aspell;
wenzelm
parents: 56552
diff changeset
   102
    val it = BreakIterator.getWordInstance(dict.locale)
56549
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   103
    it.setText(text)
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   104
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   105
    var i = 0
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   106
    var j = it.next
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   107
    while (j != BreakIterator.DONE) {
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   108
      val word = text.substring(i, j)
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   109
      if (word.length >= 2 && Character.isLetter(word(0)) && !check(word))
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   110
        result += Text.Range(i, j)
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   111
      i = j
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   112
      j = it.next
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   113
    }
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   114
    result.toList
7cfc7aa121f3 added bad_words;
wenzelm
parents: 56547
diff changeset
   115
  }
56547
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
   116
}
e9bb73d7b6cf added spell-checker based on jortho-1.0;
wenzelm
parents:
diff changeset
   117