src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML
author blanchet
Tue, 01 Jul 2014 16:47:10 +0200
changeset 57461 29efe682335b
parent 57460 9cc802a8ab06
child 57462 dabd4516450d
permissions -rw-r--r--
removed needless code
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
48380
d4b7c7be3116 renamed ML files
blanchet
parents: 48379
diff changeset
     1
(*  Title:      HOL/Tools/Sledgehammer/sledgehammer_mash.ML
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     2
    Author:     Jasmin Blanchette, TU Muenchen
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
     3
    Author:     Cezary Kaliszyk, University of Innsbruck
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     4
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     5
Sledgehammer's machine-learning-based relevance filter (MaSh).
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     6
*)
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     7
48381
1b7d798460bb renamed ML structures
blanchet
parents: 48380
diff changeset
     8
signature SLEDGEHAMMER_MASH =
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     9
sig
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    10
  type stature = ATP_Problem_Generate.stature
51004
5f2788c38127 distinguish raw and non-raw facts, using raw for 10 000s of facts and non-raw after selection of some hundreds
blanchet
parents: 51003
diff changeset
    11
  type raw_fact = Sledgehammer_Fact.raw_fact
48296
e7f01b7e244e gracefully handle the case of empty theories when going up the accessibility chain
blanchet
parents: 48293
diff changeset
    12
  type fact = Sledgehammer_Fact.fact
e7f01b7e244e gracefully handle the case of empty theories when going up the accessibility chain
blanchet
parents: 48293
diff changeset
    13
  type fact_override = Sledgehammer_Fact.fact_override
55201
1ee776da8da7 renamed ML file
blanchet
parents: 54816
diff changeset
    14
  type params = Sledgehammer_Prover.params
1ee776da8da7 renamed ML file
blanchet
parents: 54816
diff changeset
    15
  type prover_result = Sledgehammer_Prover.prover_result
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    16
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
    17
  val trace : bool Config.T
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
    18
  val duplicates : bool Config.T
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
    19
  val MePoN : string
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
    20
  val MaShN : string
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
    21
  val MeShN : string
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
    22
  val mepoN : string
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
    23
  val mashN : string
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
    24
  val meshN : string
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
    25
  val unlearnN : string
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
    26
  val learn_isarN : string
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
    27
  val learn_proverN : string
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
    28
  val relearn_isarN : string
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
    29
  val relearn_proverN : string
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
    30
  val fact_filters : string list
50826
18ace05656cf start using MaSh hints
blanchet
parents: 50825
diff changeset
    31
  val encode_str : string -> string
18ace05656cf start using MaSh hints
blanchet
parents: 50825
diff changeset
    32
  val encode_strs : string list -> string
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
    33
  val decode_str : string -> string
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
    34
  val decode_strs : string -> string list
50632
12c097ff3241 slightly more elegant naming convention (to keep low-level and high-level APIs separated)
blanchet
parents: 50631
diff changeset
    35
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
    36
  datatype mash_engine =
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
    37
    MaSh_NB
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
    38
  | MaSh_kNN
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
    39
  | MaSh_NB_kNN
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
    40
  | MaSh_NB_Ext
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    41
  | MaSh_kNN_Ext
57106
52e1e424eec1 export more ML functions, for experimentation
blanchet
parents: 57104
diff changeset
    42
57120
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    43
  val is_mash_enabled : unit -> bool
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    44
  val the_mash_engine : unit -> mash_engine
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    45
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
    46
  val mesh_facts : ('a * 'a -> bool) -> int -> (real * (('a * real) list * 'a list)) list -> 'a list
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
    47
  val nickname_of_thm : thm -> string
57006
blanchet
parents: 57005
diff changeset
    48
  val find_suggested_facts : Proof.context -> ('b * thm) list -> string list -> ('b * thm) list
51135
e32114b25551 tuned code
blanchet
parents: 51134
diff changeset
    49
  val crude_thm_ord : thm * thm -> order
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
    50
  val thm_less : thm * thm -> bool
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    51
  val goal_of_thm : theory -> thm -> thm
57006
blanchet
parents: 57005
diff changeset
    52
  val run_prover_for_mash : Proof.context -> params -> string -> string -> fact list -> thm ->
blanchet
parents: 57005
diff changeset
    53
    prover_result
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
    54
  val features_of : Proof.context -> theory -> stature -> term list -> string list
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
    55
  val trim_dependencies : string list -> string list option
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
    56
  val isar_dependencies_of : string Symtab.table * string Symtab.table -> thm -> string list option
57006
blanchet
parents: 57005
diff changeset
    57
  val prover_dependencies_of : Proof.context -> params -> string -> int -> raw_fact list ->
blanchet
parents: 57005
diff changeset
    58
    string Symtab.table * string Symtab.table -> thm -> bool * string list
blanchet
parents: 57005
diff changeset
    59
  val attach_parents_to_facts : ('a * thm) list -> ('a * thm) list ->
blanchet
parents: 57005
diff changeset
    60
    (string list * ('a * thm)) list
53140
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
    61
  val num_extra_feature_facts : int
53141
d27e99a6a679 take chained and proximate facts into consideration when computing MaSh features
blanchet
parents: 53140
diff changeset
    62
  val extra_feature_factor : real
53140
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
    63
  val weight_facts_smoothly : 'a list -> ('a * real) list
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
    64
  val weight_facts_steeply : 'a list -> ('a * real) list
57432
blanchet
parents: 57431
diff changeset
    65
  val find_mash_suggestions : Proof.context -> int -> string list -> ('a * thm) list ->
blanchet
parents: 57431
diff changeset
    66
    ('a * thm) list -> ('a * thm) list -> ('a * thm) list * ('a * thm) list
57407
140e16bc2a40 use right theory name for theorems in evaluation driver
blanchet
parents: 57406
diff changeset
    67
  val mash_suggested_facts : Proof.context -> theory -> params -> int -> term list -> term ->
140e16bc2a40 use right theory name for theorems in evaluation driver
blanchet
parents: 57406
diff changeset
    68
    raw_fact list -> fact list * fact list
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    69
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    70
  val mash_unlearn : unit -> unit
54503
blanchet
parents: 54432
diff changeset
    71
  val mash_learn_proof : Proof.context -> params -> term -> ('a * thm) list -> thm list -> unit
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    72
  val mash_learn_facts : Proof.context -> params -> string -> int -> bool -> Time.time ->
57120
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    73
    raw_fact list -> string
57006
blanchet
parents: 57005
diff changeset
    74
  val mash_learn : Proof.context -> params -> fact_override -> thm list -> bool -> unit
57432
blanchet
parents: 57431
diff changeset
    75
  val mash_can_suggest_facts : Proof.context -> bool
53819
e55d641d0a70 honor MaSh's zero-overhead policy -- no learning if the tool is disabled
blanchet
parents: 53790
diff changeset
    76
57108
dc0b4f50e288 more generous max number of suggestions, for more safety
blanchet
parents: 57107
diff changeset
    77
  val generous_max_suggestions : int -> int
50814
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
    78
  val mepo_weight : real
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
    79
  val mash_weight : real
57006
blanchet
parents: 57005
diff changeset
    80
  val relevant_facts : Proof.context -> params -> string -> int -> fact_override -> term list ->
blanchet
parents: 57005
diff changeset
    81
    term -> raw_fact list -> (string * fact list) list
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    82
  val kill_learners : unit -> unit
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
    83
  val running_learners : unit -> unit
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
    84
end;
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
    85
48381
1b7d798460bb renamed ML structures
blanchet
parents: 48380
diff changeset
    86
structure Sledgehammer_MaSh : SLEDGEHAMMER_MASH =
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
    87
struct
48249
2bd242c56c90 dummy implementation
blanchet
parents: 48248
diff changeset
    88
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    89
open ATP_Util
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    90
open ATP_Problem_Generate
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    91
open Sledgehammer_Util
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    92
open Sledgehammer_Fact
55201
1ee776da8da7 renamed ML file
blanchet
parents: 54816
diff changeset
    93
open Sledgehammer_Prover
55202
824c48a539c9 renamed many Sledgehammer ML files to clarify structure
blanchet
parents: 55201
diff changeset
    94
open Sledgehammer_Prover_Minimize
48381
1b7d798460bb renamed ML structures
blanchet
parents: 48380
diff changeset
    95
open Sledgehammer_MePo
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    96
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
    97
val trace = Attrib.setup_config_bool @{binding sledgehammer_mash_trace} (K false)
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
    98
val duplicates = Attrib.setup_config_bool @{binding sledgehammer_fact_duplicates} (K false)
51032
69da236d7838 added option to use SNoW as machine learning algo
blanchet
parents: 51029
diff changeset
    99
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
   100
fun trace_msg ctxt msg = if Config.get ctxt trace then tracing (msg ()) else ()
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
   101
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
   102
fun gen_eq_thm ctxt = if Config.get ctxt duplicates then Thm.eq_thm_strict else Thm.eq_thm_prop
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
   103
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
   104
val MePoN = "MePo"
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
   105
val MaShN = "MaSh"
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
   106
val MeShN = "MeSh"
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
   107
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
   108
val mepoN = "mepo"
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
   109
val mashN = "mash"
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
   110
val meshN = "mesh"
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
   111
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
   112
val fact_filters = [meshN, mepoN, mashN]
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
   113
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   114
val unlearnN = "unlearn"
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   115
val learn_isarN = "learn_isar"
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   116
val learn_proverN = "learn_prover"
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   117
val relearn_isarN = "relearn_isar"
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   118
val relearn_proverN = "relearn_prover"
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   119
57368
blanchet
parents: 57367
diff changeset
   120
fun map_array_at ary f i = Array.update (ary, i, f (Array.sub (ary, i)))
blanchet
parents: 57367
diff changeset
   121
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   122
type xtab = int * int Symtab.table
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   123
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   124
val empty_xtab = (0, Symtab.empty)
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   125
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   126
fun add_to_xtab key (next, tab) = (next + 1, Symtab.update_new (key, next) tab)
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   127
fun maybe_add_to_xtab key = perhaps (try (add_to_xtab key))
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   128
57386
5980ee29dbf6 always expand all paths
blanchet
parents: 57385
diff changeset
   129
fun mash_state_dir () =
5980ee29dbf6 always expand all paths
blanchet
parents: 57385
diff changeset
   130
  Path.expand (Path.explode "$ISABELLE_HOME_USER/mash" |> tap Isabelle_System.mkdir)
5980ee29dbf6 always expand all paths
blanchet
parents: 57385
diff changeset
   131
50310
b00eeb8e352e proper quoting of paths in MaSh
blanchet
parents: 50229
diff changeset
   132
fun mash_state_file () = Path.append (mash_state_dir ()) (Path.explode "state")
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   133
57007
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   134
fun wipe_out_mash_state_dir () =
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   135
  let val path = mash_state_dir () in
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   136
    try (File.fold_dir (fn file => fn _ => try File.rm (Path.append path (Path.basic file))) path)
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   137
      NONE;
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   138
    ()
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   139
  end
48330
192444b53e86 speed up MaSh queries
blanchet
parents: 48329
diff changeset
   140
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   141
datatype mash_engine =
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   142
  MaSh_NB
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   143
| MaSh_kNN
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   144
| MaSh_NB_kNN
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   145
| MaSh_NB_Ext
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   146
| MaSh_kNN_Ext
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   147
57028
e5466055e94f added Isabelle system option 'mash'
blanchet
parents: 57018
diff changeset
   148
fun mash_engine () =
57089
353652f47974 renamed 'MaSh' option
blanchet
parents: 57076
diff changeset
   149
  let val flag1 = Options.default_string @{system_option MaSh} in
57028
e5466055e94f added Isabelle system option 'mash'
blanchet
parents: 57018
diff changeset
   150
    (case if flag1 <> "none" (* default *) then flag1 else getenv "MASH" of
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   151
      "yes" => SOME MaSh_NB
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   152
    | "sml" => SOME MaSh_NB
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   153
    | "nb" => SOME MaSh_NB
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   154
    | "knn" => SOME MaSh_kNN
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   155
    | "nb_knn" => SOME MaSh_NB_kNN
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   156
    | "nb_ext" => SOME MaSh_NB_Ext
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   157
    | "knn_ext" => SOME MaSh_kNN_Ext
57028
e5466055e94f added Isabelle system option 'mash'
blanchet
parents: 57018
diff changeset
   158
    | _ => NONE)
e5466055e94f added Isabelle system option 'mash'
blanchet
parents: 57018
diff changeset
   159
  end
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
   160
57028
e5466055e94f added Isabelle system option 'mash'
blanchet
parents: 57018
diff changeset
   161
val is_mash_enabled = is_some o mash_engine
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   162
val the_mash_engine = the_default MaSh_NB o mash_engine
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
   163
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   164
fun scaled_avg [] = 0
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   165
  | scaled_avg xs = Real.ceil (100000000.0 * fold (curry (op +)) xs 0.0) div length xs
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   166
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   167
fun avg [] = 0.0
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   168
  | avg xs = fold (curry (op +)) xs 0.0 / Real.fromInt (length xs)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   169
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   170
fun normalize_scores _ [] = []
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   171
  | normalize_scores max_facts xs =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   172
    map (apsnd (curry Real.* (1.0 / avg (map snd (take max_facts xs))))) xs
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   173
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   174
fun mesh_facts fact_eq max_facts [(_, (sels, unks))] =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   175
    distinct fact_eq (map fst (take max_facts sels) @ take (max_facts - length sels) unks)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   176
  | mesh_facts fact_eq max_facts mess =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   177
    let
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   178
      val mess = mess |> map (apsnd (apfst (normalize_scores max_facts)))
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   179
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   180
      fun score_in fact (global_weight, (sels, unks)) =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   181
        let val score_at = try (nth sels) #> Option.map (fn (_, score) => global_weight * score) in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   182
          (case find_index (curry fact_eq fact o fst) sels of
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   183
            ~1 => if member fact_eq unks fact then NONE else SOME 0.0
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   184
          | rank => score_at rank)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   185
        end
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   186
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   187
      fun weight_of fact = mess |> map_filter (score_in fact) |> scaled_avg
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   188
    in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   189
      fold (union fact_eq o map fst o take max_facts o fst o snd) mess []
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   190
      |> map (`weight_of) |> sort (int_ord o pairself fst o swap)
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   191
      |> map snd |> take max_facts
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   192
    end
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   193
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   194
fun smooth_weight_of_fact rank = Math.pow (1.3, 15.5 - 0.2 * Real.fromInt rank) + 15.0 (* FUDGE *)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   195
fun steep_weight_of_fact rank = Math.pow (0.62, log2 (Real.fromInt (rank + 1))) (* FUDGE *)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   196
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   197
fun weight_facts_smoothly facts = facts ~~ map smooth_weight_of_fact (0 upto length facts - 1)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   198
fun weight_facts_steeply facts = facts ~~ map steep_weight_of_fact (0 upto length facts - 1)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   199
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   200
fun rev_sort_array_prefix cmp bnd a =
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   201
  let
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   202
    exception BOTTOM of int
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   203
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   204
    val al = Array.length a
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   205
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   206
    fun maxson l i =
57017
blanchet
parents: 57014
diff changeset
   207
      let val i31 = i + i + i + 1 in
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   208
        if i31 + 2 < l then
57017
blanchet
parents: 57014
diff changeset
   209
          let val x = Unsynchronized.ref i31 in
blanchet
parents: 57014
diff changeset
   210
            if cmp (Array.sub (a, i31), Array.sub (a, i31 + 1)) = LESS then x := i31 + 1 else ();
blanchet
parents: 57014
diff changeset
   211
            if cmp (Array.sub (a, !x), Array.sub (a, i31 + 2)) = LESS then x := i31 + 2 else ();
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   212
            !x
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   213
          end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   214
        else
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   215
          if i31 + 1 < l andalso cmp (Array.sub (a, i31), Array.sub (a, i31 + 1)) = LESS
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   216
          then i31 + 1 else if i31 < l then i31 else raise BOTTOM i
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   217
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   218
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   219
    fun trickledown l i e =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   220
      let val j = maxson l i in
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   221
        if cmp (Array.sub (a, j), e) = GREATER then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   222
          (Array.update (a, i, Array.sub (a, j)); trickledown l j e)
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   223
        else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   224
          Array.update (a, i, e)
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   225
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   226
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   227
    fun trickle l i e = trickledown l i e handle BOTTOM i => Array.update (a, i, e)
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   228
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   229
    fun bubbledown l i =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   230
      let val j = maxson l i in
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   231
        Array.update (a, i, Array.sub (a, j));
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   232
        bubbledown l j
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   233
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   234
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   235
    fun bubble l i = bubbledown l i handle BOTTOM i => i
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   236
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   237
    fun trickleup i e =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   238
      let val father = (i - 1) div 3 in
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   239
        if cmp (Array.sub (a, father), e) = LESS then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   240
          (Array.update (a, i, Array.sub (a, father));
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   241
           if father > 0 then trickleup father e else Array.update (a, 0, e))
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   242
        else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   243
          Array.update (a, i, e)
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   244
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   245
57355
blanchet
parents: 57354
diff changeset
   246
    fun for i = if i < 0 then () else (trickle al i (Array.sub (a, i)); for (i - 1))
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   247
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   248
    fun for2 i =
57355
blanchet
parents: 57354
diff changeset
   249
      if i < Integer.max 2 (al - bnd) then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   250
        ()
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   251
      else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   252
        let val e = Array.sub (a, i) in
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   253
          Array.update (a, i, Array.sub (a, 0));
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   254
          trickleup (bubble i 0) e;
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   255
          for2 (i - 1)
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   256
        end
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   257
  in
57355
blanchet
parents: 57354
diff changeset
   258
    for (((al + 1) div 3) - 1);
blanchet
parents: 57354
diff changeset
   259
    for2 (al - 1);
blanchet
parents: 57354
diff changeset
   260
    if al > 1 then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   261
      let val e = Array.sub (a, 1) in
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   262
        Array.update (a, 1, Array.sub (a, 0));
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   263
        Array.update (a, 0, e)
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   264
      end
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   265
    else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   266
      ()
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   267
  end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   268
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   269
fun rev_sort_list_prefix cmp bnd xs =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   270
  let val ary = Array.fromList xs in
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   271
    rev_sort_array_prefix cmp bnd ary;
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   272
    Array.foldr (op ::) [] ary
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   273
  end
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   274
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   275
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   276
(*** Isabelle-agnostic machine learning ***)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   277
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   278
structure MaSh =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   279
struct
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   280
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   281
fun select_visible_facts big_number recommends =
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   282
  List.app (fn at =>
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   283
    let val (j, ov) = Array.sub (recommends, at) in
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   284
      Array.update (recommends, at, (j, big_number + ov))
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   285
    end)
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   286
57374
blanchet
parents: 57373
diff changeset
   287
fun wider_array_of_vector init vec =
blanchet
parents: 57373
diff changeset
   288
  let val ary = Array.array init in
blanchet
parents: 57373
diff changeset
   289
    Array.copyVec {src = vec, dst = ary, di = 0};
blanchet
parents: 57373
diff changeset
   290
    ary
blanchet
parents: 57373
diff changeset
   291
  end
blanchet
parents: 57373
diff changeset
   292
57132
4ddf5a8f8f38 make SML code closer to Python code when 'nb_kuehlwein_style' is true
blanchet
parents: 57130
diff changeset
   293
val nb_def_prior_weight = 21 (* FUDGE *)
57095
001ec97c3e59 updated naive Bayes
blanchet
parents: 57089
diff changeset
   294
57374
blanchet
parents: 57373
diff changeset
   295
fun learn_facts (tfreq0, sfreq0, dffreq0) num_facts0 num_facts num_feats depss featss =
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   296
  let
57374
blanchet
parents: 57373
diff changeset
   297
    val tfreq = wider_array_of_vector (num_facts, 0) tfreq0
blanchet
parents: 57373
diff changeset
   298
    val sfreq = wider_array_of_vector (num_facts, Inttab.empty) sfreq0
blanchet
parents: 57373
diff changeset
   299
    val dffreq = wider_array_of_vector (num_feats, 0) dffreq0
blanchet
parents: 57373
diff changeset
   300
blanchet
parents: 57373
diff changeset
   301
    fun learn_one th feats deps =
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   302
      let
57355
blanchet
parents: 57354
diff changeset
   303
        fun add_th weight t =
blanchet
parents: 57354
diff changeset
   304
          let
blanchet
parents: 57354
diff changeset
   305
            val im = Array.sub (sfreq, t)
57374
blanchet
parents: 57373
diff changeset
   306
            fun fold_fn s = Inttab.map_default (s, 0) (Integer.add weight)
57355
blanchet
parents: 57354
diff changeset
   307
          in
57368
blanchet
parents: 57367
diff changeset
   308
            map_array_at tfreq (Integer.add weight) t;
57355
blanchet
parents: 57354
diff changeset
   309
            Array.update (sfreq, t, fold fold_fn feats im)
blanchet
parents: 57354
diff changeset
   310
          end
blanchet
parents: 57354
diff changeset
   311
57368
blanchet
parents: 57367
diff changeset
   312
        val add_sym = map_array_at dffreq (Integer.add 1)
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   313
      in
57355
blanchet
parents: 57354
diff changeset
   314
        add_th nb_def_prior_weight th;
blanchet
parents: 57354
diff changeset
   315
        List.app (add_th 1) deps;
blanchet
parents: 57354
diff changeset
   316
        List.app add_sym feats
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   317
      end
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   318
57102
blanchet
parents: 57101
diff changeset
   319
    fun for i =
57383
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   320
      if i = num_facts then ()
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   321
      else (learn_one i (Vector.sub (featss, i)) (Vector.sub (depss, i)); for (i + 1))
57102
blanchet
parents: 57101
diff changeset
   322
  in
57383
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   323
    for num_facts0;
57374
blanchet
parents: 57373
diff changeset
   324
    (Array.vector tfreq, Array.vector sfreq, Array.vector dffreq)
57102
blanchet
parents: 57101
diff changeset
   325
  end
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   326
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   327
fun naive_bayes (tfreq, sfreq, dffreq) num_facts max_suggs visible_facts goal_feats =
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   328
  let
57362
3ae07451a9f5 got rid of a few experimental options
blanchet
parents: 57361
diff changeset
   329
    val tau = 0.05 (* FUDGE *)
3ae07451a9f5 got rid of a few experimental options
blanchet
parents: 57361
diff changeset
   330
    val pos_weight = 10.0 (* FUDGE *)
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   331
    val def_val = ~15.0 (* FUDGE *)
57124
e4c2c792226f make SML naive Bayes closer to Python version
blanchet
parents: 57122
diff changeset
   332
57366
blanchet
parents: 57365
diff changeset
   333
    val ln_afreq = Math.ln (Real.fromInt num_facts)
57374
blanchet
parents: 57373
diff changeset
   334
    val idf = Vector.map (fn i => ln_afreq - Math.ln (Real.fromInt i)) dffreq
57366
blanchet
parents: 57365
diff changeset
   335
57359
dd89d0ec8e41 avoid subscripting array with ~1
blanchet
parents: 57358
diff changeset
   336
    fun tfidf feat = Vector.sub (idf, feat)
57102
blanchet
parents: 57101
diff changeset
   337
blanchet
parents: 57101
diff changeset
   338
    fun log_posterior i =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   339
      let
57374
blanchet
parents: 57373
diff changeset
   340
        val tfreq = Real.fromInt (Vector.sub (tfreq, i))
57097
blanchet
parents: 57096
diff changeset
   341
57401
02f56126b4e4 reintroduced 'extra features' but with lower weight than before (to account for tfidf)
blanchet
parents: 57387
diff changeset
   342
        fun fold_feats (f, fw0) (res, sfh) =
57102
blanchet
parents: 57101
diff changeset
   343
          (case Inttab.lookup sfh f of
blanchet
parents: 57101
diff changeset
   344
            SOME sf =>
57401
02f56126b4e4 reintroduced 'extra features' but with lower weight than before (to account for tfidf)
blanchet
parents: 57387
diff changeset
   345
            (res + fw0 * tfidf f * Math.ln (pos_weight * Real.fromInt sf / tfreq),
57102
blanchet
parents: 57101
diff changeset
   346
             Inttab.delete f sfh)
57409
c9e8cd8ec9e2 correctly take weights into consideration
blanchet
parents: 57407
diff changeset
   347
          | NONE => (res + fw0 * tfidf f * def_val, sfh))
57097
blanchet
parents: 57096
diff changeset
   348
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   349
        val (res, sfh) = fold fold_feats goal_feats (Math.ln tfreq, Vector.sub (sfreq, i))
57097
blanchet
parents: 57096
diff changeset
   350
57409
c9e8cd8ec9e2 correctly take weights into consideration
blanchet
parents: 57407
diff changeset
   351
        fun fold_sfh (f, sf) sow =
c9e8cd8ec9e2 correctly take weights into consideration
blanchet
parents: 57407
diff changeset
   352
          sow + tfidf f * Math.ln (1.0 + (1.0 - Real.fromInt sf) / tfreq)
57097
blanchet
parents: 57096
diff changeset
   353
57102
blanchet
parents: 57101
diff changeset
   354
        val sum_of_weights = Inttab.fold fold_sfh sfh 0.0
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   355
      in
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   356
        res + tau * sum_of_weights
57097
blanchet
parents: 57096
diff changeset
   357
      end
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   358
57356
9816f692b0ca refactoring
blanchet
parents: 57355
diff changeset
   359
    val posterior = Array.tabulate (num_facts, (fn j => (j, log_posterior j)))
57102
blanchet
parents: 57101
diff changeset
   360
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   361
    fun ret at acc =
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   362
      if at = num_facts then acc else ret (at + 1) (Array.sub (posterior, at) :: acc)
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   363
  in
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   364
    select_visible_facts 100000.0 posterior visible_facts;
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   365
    rev_sort_array_prefix (Real.compare o pairself snd) max_suggs posterior;
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   366
    ret (Integer.max 0 (num_facts - max_suggs)) []
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   367
  end
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   368
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   369
val number_of_nearest_neighbors = 10 (* FUDGE *)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   370
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   371
fun k_nearest_neighbors dffreq num_facts num_feats depss featss max_suggs visible_facts goal_feats =
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   372
  let
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   373
    exception EXIT of unit
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   374
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   375
    val ln_afreq = Math.ln (Real.fromInt num_facts)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   376
    fun tfidf feat = ln_afreq - Math.ln (Real.fromInt (Vector.sub (dffreq, feat)))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   377
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   378
    val overlaps_sqr = Array.tabulate (num_facts, rpair 0.0)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   379
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   380
    val feat_facts = Array.array (num_feats, [])
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   381
    val _ = Vector.foldl (fn (feats, fact) =>
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   382
      (List.app (map_array_at feat_facts (cons fact)) feats; fact + 1)) 0 featss
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   383
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   384
    fun do_feat (s, sw0) =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   385
      let
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   386
        val sw = sw0 * tfidf s
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   387
        val w2 = sw * sw
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   388
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   389
        fun inc_overlap j =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   390
          let val (_, ov) = Array.sub (overlaps_sqr, j) in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   391
            Array.update (overlaps_sqr, j, (j, w2 + ov))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   392
          end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   393
      in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   394
        List.app inc_overlap (Array.sub (feat_facts, s))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   395
      end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   396
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   397
    val _ = List.app do_feat goal_feats
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   398
    val _ = rev_sort_array_prefix (Real.compare o pairself snd) num_facts overlaps_sqr
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   399
    val no_recommends = Unsynchronized.ref 0
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   400
    val recommends = Array.tabulate (num_facts, rpair 0.0)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   401
    val age = Unsynchronized.ref 500000000.0
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   402
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   403
    fun inc_recommend v j =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   404
      let val (_, ov) = Array.sub (recommends, j) in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   405
        if ov <= 0.0 then
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   406
          (no_recommends := !no_recommends + 1; Array.update (recommends, j, (j, !age + ov)))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   407
        else if ov < !age + 1000.0 then
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   408
          Array.update (recommends, j, (j, v + ov))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   409
        else
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   410
          ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   411
      end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   412
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   413
    val k = Unsynchronized.ref 0
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   414
    fun do_k k =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   415
      if k >= num_facts then
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   416
        raise EXIT ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   417
      else
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   418
        let
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   419
          val (j, o2) = Array.sub (overlaps_sqr, num_facts - k - 1)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   420
          val o1 = Math.sqrt o2
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   421
          val _ = inc_recommend o1 j
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   422
          val ds = Vector.sub (depss, j)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   423
          val l = Real.fromInt (length ds)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   424
        in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   425
          List.app (inc_recommend (o1 / l)) ds
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   426
        end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   427
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   428
    fun while1 () =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   429
      if !k = number_of_nearest_neighbors then () else (do_k (!k); k := !k + 1; while1 ())
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   430
      handle EXIT () => ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   431
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   432
    fun while2 () =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   433
      if !no_recommends >= max_suggs then ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   434
      else (do_k (!k); k := !k + 1; age := !age - 10000.0; while2 ())
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   435
      handle EXIT () => ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   436
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   437
    fun ret acc at =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   438
      if at = num_facts then acc else ret (Array.sub (recommends, at) :: acc) (at + 1)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   439
  in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   440
    while1 ();
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   441
    while2 ();
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   442
    select_visible_facts 1000000000.0 recommends visible_facts;
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   443
    rev_sort_array_prefix (Real.compare o pairself snd) max_suggs recommends;
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   444
    ret [] (Integer.max 0 (num_facts - max_suggs))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   445
  end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   446
57125
2f620ef839ee added another way of invoking Python code, for experiments
blanchet
parents: 57124
diff changeset
   447
(* experimental *)
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   448
fun external_tool tool max_suggs learns goal_feats =
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   449
  let
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   450
    val ser = string_of_int (serial ()) (* poor person's attempt at thread-safety *)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   451
    val ocs = TextIO.openOut ("adv_syms" ^ ser)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   452
    val ocd = TextIO.openOut ("adv_deps" ^ ser)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   453
    val ocq = TextIO.openOut ("adv_seq" ^ ser)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   454
    val occ = TextIO.openOut ("adv_conj" ^ ser)
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   455
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   456
    fun os oc s = TextIO.output (oc, s)
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   457
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   458
    fun ol _ _ _ [] = ()
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   459
      | ol _ f _ [e] = f e
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   460
      | ol oc f sep (h :: t) = (f h; os oc sep; ol oc f sep t)
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   461
57294
ef9d4e1ceb00 use strings to communicate with external process, to ease debugging
blanchet
parents: 57291
diff changeset
   462
    fun do_learn (name, feats, deps) =
57357
30ee18eb23ac new version of adaptive k-NN with TFIDF
blanchet
parents: 57356
diff changeset
   463
      (os ocs name; os ocs ":"; ol ocs (os ocs o quote) ", " feats; os ocs "\n";
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   464
       os ocd name; os ocd ":"; ol ocd (os ocd) " " deps; os ocd "\n"; os ocq name; os ocq "\n")
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   465
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   466
    fun forkexec no =
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   467
      let
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   468
        val cmd =
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   469
          "~/misc/" ^ tool ^ " adv_syms" ^ ser ^ " adv_deps" ^ ser ^ " " ^ string_of_int no ^
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   470
          " adv_seq" ^ ser ^ " < adv_conj" ^ ser
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   471
      in
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   472
        fst (Isabelle_System.bash_output cmd)
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   473
        |> space_explode " "
57294
ef9d4e1ceb00 use strings to communicate with external process, to ease debugging
blanchet
parents: 57291
diff changeset
   474
        |> filter_out (curry (op =) "")
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   475
      end
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   476
  in
57401
02f56126b4e4 reintroduced 'extra features' but with lower weight than before (to account for tfidf)
blanchet
parents: 57387
diff changeset
   477
    (List.app do_learn learns; ol occ (os occ o quote) ", " (map fst goal_feats);
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   478
     TextIO.closeOut ocs; TextIO.closeOut ocd; TextIO.closeOut ocq; TextIO.closeOut occ;
57294
ef9d4e1ceb00 use strings to communicate with external process, to ease debugging
blanchet
parents: 57291
diff changeset
   479
     forkexec max_suggs)
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   480
  end
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   481
57375
b75438e23925 renamed experimental learning engines
blanchet
parents: 57374
diff changeset
   482
val k_nearest_neighbors_ext =
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   483
  external_tool ("newknn/knn" ^ " " ^ string_of_int number_of_nearest_neighbors)
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   484
val naive_bayes_ext = external_tool "predict/nbayes"
57374
blanchet
parents: 57373
diff changeset
   485
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   486
fun query_external ctxt engine max_suggs learns goal_feats =
57432
blanchet
parents: 57431
diff changeset
   487
  (trace_msg ctxt (fn () => "MaSh query external " ^ commas (map fst goal_feats));
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   488
   (case engine of
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   489
     MaSh_NB_Ext => naive_bayes_ext max_suggs learns goal_feats
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   490
   | MaSh_kNN_Ext => k_nearest_neighbors_ext max_suggs learns goal_feats))
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   491
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   492
fun query_internal ctxt engine num_facts num_feats (fact_names, featss, depss)
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   493
    (freqs as (_, _, dffreq)) visible_facts max_suggs goal_feats int_goal_feats =
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   494
  let
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   495
    fun nb () =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   496
      naive_bayes freqs num_facts max_suggs visible_facts int_goal_feats
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   497
      |> map fst
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   498
    fun knn () =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   499
      k_nearest_neighbors dffreq num_facts num_feats depss featss max_suggs visible_facts
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   500
        int_goal_feats
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   501
      |> map fst
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   502
  in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   503
    (trace_msg ctxt (fn () => "MaSh query internal " ^ commas (map fst goal_feats) ^ " from {" ^
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   504
       elide_string 1000 (space_implode " " (Vector.foldr (op ::) [] fact_names)) ^ "}");
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   505
     (case engine of
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   506
       MaSh_NB => nb ()
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   507
     | MaSh_kNN => knn ()
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   508
     | MaSh_NB_kNN =>
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   509
       let
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   510
         val mess =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   511
           [(0.5 (* FUDGE *), (weight_facts_steeply (nb ()), [])),
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   512
            (0.5 (* FUDGE *), (weight_facts_steeply (knn ()), []))]
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   513
       in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   514
         mesh_facts (op =) max_suggs mess
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   515
       end)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   516
     |> map (curry Vector.sub fact_names))
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   517
   end
57007
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   518
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   519
end;
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   520
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   521
57432
blanchet
parents: 57431
diff changeset
   522
(*** Persistent, stringly-typed state ***)
blanchet
parents: 57431
diff changeset
   523
blanchet
parents: 57431
diff changeset
   524
fun meta_char c =
blanchet
parents: 57431
diff changeset
   525
  if Char.isAlphaNum c orelse c = #"_" orelse c = #"." orelse c = #"(" orelse c = #")" orelse
blanchet
parents: 57431
diff changeset
   526
     c = #"," then
blanchet
parents: 57431
diff changeset
   527
    String.str c
blanchet
parents: 57431
diff changeset
   528
  else
blanchet
parents: 57431
diff changeset
   529
    (* fixed width, in case more digits follow *)
blanchet
parents: 57431
diff changeset
   530
    "%" ^ stringN_of_int 3 (Char.ord c)
blanchet
parents: 57431
diff changeset
   531
blanchet
parents: 57431
diff changeset
   532
fun unmeta_chars accum [] = String.implode (rev accum)
blanchet
parents: 57431
diff changeset
   533
  | unmeta_chars accum (#"%" :: d1 :: d2 :: d3 :: cs) =
blanchet
parents: 57431
diff changeset
   534
    (case Int.fromString (String.implode [d1, d2, d3]) of
blanchet
parents: 57431
diff changeset
   535
      SOME n => unmeta_chars (Char.chr n :: accum) cs
blanchet
parents: 57431
diff changeset
   536
    | NONE => "" (* error *))
blanchet
parents: 57431
diff changeset
   537
  | unmeta_chars _ (#"%" :: _) = "" (* error *)
blanchet
parents: 57431
diff changeset
   538
  | unmeta_chars accum (c :: cs) = unmeta_chars (c :: accum) cs
blanchet
parents: 57431
diff changeset
   539
blanchet
parents: 57431
diff changeset
   540
val encode_str = String.translate meta_char
blanchet
parents: 57431
diff changeset
   541
val decode_str = String.explode #> unmeta_chars []
blanchet
parents: 57431
diff changeset
   542
blanchet
parents: 57431
diff changeset
   543
val encode_strs = map encode_str #> space_implode " "
57461
29efe682335b removed needless code
blanchet
parents: 57460
diff changeset
   544
val decode_strs = space_explode " " #> map decode_str
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   545
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   546
datatype proof_kind = Isar_Proof | Automatic_Proof | Isar_Proof_wegen_Prover_Flop
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   547
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   548
fun str_of_proof_kind Isar_Proof = "i"
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   549
  | str_of_proof_kind Automatic_Proof = "a"
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   550
  | str_of_proof_kind Isar_Proof_wegen_Prover_Flop = "x"
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   551
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   552
fun proof_kind_of_str "a" = Automatic_Proof
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   553
  | proof_kind_of_str "x" = Isar_Proof_wegen_Prover_Flop
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   554
  | proof_kind_of_str _ (* "i" *) = Isar_Proof
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   555
57011
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   556
fun add_edge_to name parent =
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   557
  Graph.default_node (parent, (Isar_Proof, [], []))
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   558
  #> Graph.add_edge (parent, name)
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   559
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   560
fun add_node kind name parents feats deps (access_G, (fact_xtab, feat_xtab), learns) =
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   561
  ((Graph.new_node (name, (kind, feats, deps)) access_G
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   562
    handle Graph.DUP _ => Graph.map_node name (K (kind, feats, deps)) access_G)
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   563
   |> fold (add_edge_to name) parents,
57383
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   564
  (add_to_xtab name fact_xtab, fold maybe_add_to_xtab feats feat_xtab),
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   565
  (name, feats, deps) :: learns)
57011
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   566
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   567
fun try_graph ctxt when def f =
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   568
  f ()
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   569
  handle
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   570
    Graph.CYCLES (cycle :: _) =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   571
    (trace_msg ctxt (fn () => "Cycle involving " ^ commas cycle ^ " when " ^ when); def)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   572
  | Graph.DUP name =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   573
    (trace_msg ctxt (fn () => "Duplicate fact " ^ quote name ^ " when " ^ when); def)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   574
  | Graph.UNDEF name =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   575
    (trace_msg ctxt (fn () => "Unknown fact " ^ quote name ^ " when " ^ when); def)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   576
  | exn =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   577
    if Exn.is_interrupt exn then
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   578
      reraise exn
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   579
    else
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   580
      (trace_msg ctxt (fn () => "Internal error when " ^ when ^ ":\n" ^ Runtime.exn_message exn);
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   581
       def)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   582
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   583
fun graph_info G =
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   584
  string_of_int (length (Graph.keys G)) ^ " node(s), " ^
57006
blanchet
parents: 57005
diff changeset
   585
  string_of_int (fold (Integer.add o length o snd) (Graph.dest G) 0) ^ " edge(s), " ^
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   586
  string_of_int (length (Graph.maximals G)) ^ " maximal"
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   587
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
   588
type mash_state =
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   589
  {access_G : (proof_kind * string list * string list) Graph.T,
57374
blanchet
parents: 57373
diff changeset
   590
   xtabs : xtab * xtab,
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   591
   ffds : string vector * int list vector * int list vector,
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   592
   freqs : int vector * int Inttab.table vector * int vector,
57365
blanchet
parents: 57364
diff changeset
   593
   dirty_facts : string list option}
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   594
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   595
val empty_xtabs = (empty_xtab, empty_xtab)
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   596
val empty_ffds = (Vector.fromList [], Vector.fromList [], Vector.fromList [])
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   597
val empty_freqs = (Vector.fromList [], Vector.fromList [], Vector.fromList [])
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   598
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   599
val empty_state =
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   600
  {access_G = Graph.empty,
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   601
   xtabs = empty_xtabs,
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   602
   ffds = empty_ffds,
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   603
   freqs = empty_freqs,
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   604
   dirty_facts = SOME []} : mash_state
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   605
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   606
fun recompute_ffds_freqs_from_learns learns ((num_facts, fact_tab), (num_feats, feat_tab))
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   607
    num_facts0 (fact_names0, featss0, depss0) freqs0 =
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   608
  let
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   609
    val fact_names = Vector.concat [fact_names0, Vector.fromList (map #1 learns)]
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   610
    val featss = Vector.concat [featss0,
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   611
      Vector.fromList (map (map_filter (Symtab.lookup feat_tab) o #2) learns)]
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   612
    val depss = Vector.concat [depss0,
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   613
      Vector.fromList (map (map_filter (Symtab.lookup fact_tab) o #3) learns)]
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   614
  in
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   615
    ((fact_names, featss, depss),
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   616
     MaSh.learn_facts freqs0 num_facts0 num_facts num_feats depss featss)
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   617
  end
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   618
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   619
fun reorder_learns (num_facts, fact_tab) learns =
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   620
  let val ary = Array.array (num_facts, ("", [], [])) in
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   621
    List.app (fn learn as (fact, _, _) =>
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   622
        Array.update (ary, the (Symtab.lookup fact_tab fact), learn))
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   623
      learns;
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   624
    Array.foldr (op ::) [] ary
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   625
  end
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   626
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   627
fun recompute_ffds_freqs_from_access_G access_G (xtabs as (fact_xtab, _)) =
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   628
  let
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   629
    val learns =
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   630
      Graph.schedule (fn _ => fn (fact, (_, feats, deps)) => (fact, feats, deps)) access_G
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   631
      |> reorder_learns fact_xtab
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   632
  in
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   633
    recompute_ffds_freqs_from_learns learns xtabs 0 empty_ffds empty_freqs
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   634
  end
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   635
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   636
local
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   637
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   638
val version = "*** MaSh version 20140625 ***"
50357
187ae76a1757 more robustness in the face of MaSh format changes -- don't overwrite new versions with old versions
blanchet
parents: 50356
diff changeset
   639
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
   640
exception FILE_VERSION_TOO_NEW of unit
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   641
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   642
fun extract_node line =
55286
blanchet
parents: 55202
diff changeset
   643
  (case space_explode ":" line of
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   644
    [head, tail] =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   645
    (case (space_explode " " head, map (unprefix " ") (space_explode ";" tail)) of
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   646
      ([kind, name], [parents, feats, deps]) =>
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   647
      SOME (proof_kind_of_str kind, decode_str name, decode_strs parents, decode_strs feats,
57010
121b63d7bcdb take weights into consideration in knn
blanchet
parents: 57009
diff changeset
   648
        decode_strs deps)
55286
blanchet
parents: 55202
diff changeset
   649
    | _ => NONE)
blanchet
parents: 55202
diff changeset
   650
  | _ => NONE)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   651
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   652
fun load_state ctxt (time_state as (memory_time, _)) =
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   653
  let val path = mash_state_file () in
57386
5980ee29dbf6 always expand all paths
blanchet
parents: 57385
diff changeset
   654
    (case try OS.FileSys.modTime (Path.implode path) of
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   655
      NONE => time_state
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   656
    | SOME disk_time =>
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   657
      if Time.>= (memory_time, disk_time) then
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   658
        time_state
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   659
      else
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   660
        (disk_time,
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   661
         (case try File.read_lines path of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   662
           SOME (version' :: node_lines) =>
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   663
           let
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   664
             fun extract_line_and_add_node line =
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   665
               (case extract_node line of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   666
                 NONE => I (* should not happen *)
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   667
               | SOME (kind, name, parents, feats, deps) => add_node kind name parents feats deps)
57011
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   668
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   669
             val empty_G_etc = (Graph.empty, empty_xtabs, [])
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   670
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   671
             val (access_G, xtabs, rev_learns) =
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   672
               (case string_ord (version', version) of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   673
                 EQUAL =>
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   674
                 try_graph ctxt "loading state" empty_G_etc
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   675
                   (fn () => fold extract_line_and_add_node node_lines empty_G_etc)
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   676
               | LESS => (wipe_out_mash_state_dir (); empty_G_etc) (* cannot parse old file *)
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   677
               | GREATER => raise FILE_VERSION_TOO_NEW ())
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   678
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   679
             val (ffds, freqs) =
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   680
               recompute_ffds_freqs_from_learns (rev rev_learns) xtabs 0 empty_ffds empty_freqs
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   681
           in
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   682
             trace_msg ctxt (fn () => "Loaded fact graph (" ^ graph_info access_G ^ ")");
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   683
             {access_G = access_G, xtabs = xtabs, ffds = ffds, freqs = freqs, dirty_facts = SOME []}
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   684
           end
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   685
         | _ => empty_state)))
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   686
  end
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   687
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   688
fun str_of_entry (kind, name, parents, feats, deps) =
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   689
  str_of_proof_kind kind ^ " " ^ encode_str name ^ ": " ^ encode_strs parents ^ "; " ^
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   690
  encode_strs feats ^ "; " ^ encode_strs deps ^ "\n"
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   691
57365
blanchet
parents: 57364
diff changeset
   692
fun save_state _ (time_state as (_, {dirty_facts = SOME [], ...})) = time_state
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   693
  | save_state ctxt (memory_time, {access_G, xtabs, ffds, freqs, dirty_facts}) =
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   694
    let
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   695
      fun append_entry (name, ((kind, feats, deps), (parents, _))) =
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   696
        cons (kind, name, Graph.Keys.dest parents, feats, deps)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   697
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   698
      val path = mash_state_file ()
57365
blanchet
parents: 57364
diff changeset
   699
      val dirty_facts' =
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   700
        (case try OS.FileSys.modTime (Path.implode path) of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   701
          NONE => NONE
57365
blanchet
parents: 57364
diff changeset
   702
        | SOME disk_time => if Time.< (disk_time, memory_time) then dirty_facts else NONE)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   703
      val (banner, entries) =
57365
blanchet
parents: 57364
diff changeset
   704
        (case dirty_facts' of
55286
blanchet
parents: 55202
diff changeset
   705
          SOME names => (NONE, fold (append_entry o Graph.get_entry access_G) names [])
blanchet
parents: 55202
diff changeset
   706
        | NONE => (SOME (version ^ "\n"), Graph.fold append_entry access_G []))
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   707
    in
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   708
      (case banner of SOME s => File.write path s | NONE => ();
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   709
       entries |> chunk_list 500 |> List.app (File.append path o implode o map str_of_entry))
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   710
      handle IO.Io _ => ();
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   711
      trace_msg ctxt (fn () =>
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   712
        "Saved fact graph (" ^ graph_info access_G ^
57365
blanchet
parents: 57364
diff changeset
   713
        (case dirty_facts of
blanchet
parents: 57364
diff changeset
   714
          SOME dirty_facts => "; " ^ string_of_int (length dirty_facts) ^ " dirty fact(s)"
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   715
        | _ => "") ^  ")");
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   716
      (Time.now (),
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   717
       {access_G = access_G, xtabs = xtabs, ffds = ffds, freqs = freqs, dirty_facts = SOME []})
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   718
    end
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   719
57365
blanchet
parents: 57364
diff changeset
   720
val global_state = Synchronized.var "Sledgehammer_MaSh.global_state" (Time.zeroTime, empty_state)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   721
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   722
in
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   723
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   724
fun map_state ctxt f =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   725
  Synchronized.change global_state (load_state ctxt ##> f #> save_state ctxt)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
   726
  handle FILE_VERSION_TOO_NEW () => ()
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   727
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   728
fun peek_state ctxt =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   729
  Synchronized.change_result global_state (perhaps (try (load_state ctxt)) #> `snd)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   730
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   731
fun clear_state () =
57007
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   732
  Synchronized.change global_state (fn _ =>
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   733
    (wipe_out_mash_state_dir (); (Time.zeroTime, empty_state)))
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   734
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   735
end
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   736
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   737
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   738
(*** Isabelle helpers ***)
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   739
50722
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   740
val local_prefix = "local" ^ Long_Name.separator
48378
9e96486d53ad handle local facts smoothly in MaSh
blanchet
parents: 48377
diff changeset
   741
50722
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   742
fun elided_backquote_thm threshold th =
57006
blanchet
parents: 57005
diff changeset
   743
  elide_string threshold (backquote_thm (Proof_Context.init_global (Thm.theory_of_thm th)) th)
48378
9e96486d53ad handle local facts smoothly in MaSh
blanchet
parents: 48377
diff changeset
   744
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
   745
val thy_name_of_thm = Context.theory_name o Thm.theory_of_thm
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
   746
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
   747
fun nickname_of_thm th =
48394
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   748
  if Thm.has_name_hint th then
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   749
    let val hint = Thm.get_name_hint th in
50722
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   750
      (* There must be a better way to detect local facts. *)
55286
blanchet
parents: 55202
diff changeset
   751
      (case try (unprefix local_prefix) hint of
48394
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   752
        SOME suf =>
55286
blanchet
parents: 55202
diff changeset
   753
        thy_name_of_thm th ^ Long_Name.separator ^ suf ^ Long_Name.separator ^
blanchet
parents: 55202
diff changeset
   754
        elided_backquote_thm 50 th
blanchet
parents: 55202
diff changeset
   755
      | NONE => hint)
48394
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   756
    end
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   757
  else
50722
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   758
    elided_backquote_thm 200 th
48378
9e96486d53ad handle local facts smoothly in MaSh
blanchet
parents: 48377
diff changeset
   759
51134
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   760
fun find_suggested_facts ctxt facts =
48330
192444b53e86 speed up MaSh queries
blanchet
parents: 48329
diff changeset
   761
  let
51134
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   762
    fun add (fact as (_, th)) = Symtab.default (nickname_of_thm th, fact)
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   763
    val tab = fold add facts Symtab.empty
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   764
    fun lookup nick =
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   765
      Symtab.lookup tab nick
57006
blanchet
parents: 57005
diff changeset
   766
      |> tap (fn NONE => trace_msg ctxt (fn () => "Cannot find " ^ quote nick) | _ => ())
51134
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   767
  in map_filter lookup end
48311
3c4e10606567 implemented MaSh QUERY operation
blanchet
parents: 48309
diff changeset
   768
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   769
fun free_feature_of s = "f" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   770
fun thy_feature_of s = "y" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   771
fun type_feature_of s = "t" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   772
fun class_feature_of s = "s" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   773
val local_feature = "local"
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   774
51135
e32114b25551 tuned code
blanchet
parents: 51134
diff changeset
   775
fun crude_theory_ord p =
50722
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   776
  if Theory.subthy p then
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   777
    if Theory.eq_thy p then EQUAL else LESS
48324
3ee5b5589402 speed up tautology/metaness check
blanchet
parents: 48323
diff changeset
   778
  else if Theory.subthy (swap p) then
3ee5b5589402 speed up tautology/metaness check
blanchet
parents: 48323
diff changeset
   779
    GREATER
55286
blanchet
parents: 55202
diff changeset
   780
  else
blanchet
parents: 55202
diff changeset
   781
    (case int_ord (pairself (length o Theory.ancestors_of) p) of
blanchet
parents: 55202
diff changeset
   782
      EQUAL => string_ord (pairself Context.theory_name p)
blanchet
parents: 55202
diff changeset
   783
    | order => order)
48324
3ee5b5589402 speed up tautology/metaness check
blanchet
parents: 48323
diff changeset
   784
51135
e32114b25551 tuned code
blanchet
parents: 51134
diff changeset
   785
fun crude_thm_ord p =
55286
blanchet
parents: 55202
diff changeset
   786
  (case crude_theory_ord (pairself theory_of_thm p) of
50359
da395f0e7dea tweaked order of theorems to avoid forward dependencies (MaSh)
blanchet
parents: 50357
diff changeset
   787
    EQUAL =>
57039
1ddd1f75fb40 added comment
blanchet
parents: 57029
diff changeset
   788
    (* The hack below is necessary because of odd dependencies that are not reflected in the theory
1ddd1f75fb40 added comment
blanchet
parents: 57029
diff changeset
   789
       comparison. *)
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
   790
    let val q = pairself nickname_of_thm p in
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
   791
      (* Hack to put "xxx_def" before "xxxI" and "xxxE" *)
55286
blanchet
parents: 55202
diff changeset
   792
      (case bool_ord (pairself (String.isSuffix "_def") (swap q)) of
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
   793
        EQUAL => string_ord q
55286
blanchet
parents: 55202
diff changeset
   794
      | ord => ord)
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
   795
    end
55286
blanchet
parents: 55202
diff changeset
   796
  | ord => ord)
48324
3ee5b5589402 speed up tautology/metaness check
blanchet
parents: 48323
diff changeset
   797
51136
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
   798
val thm_less_eq = Theory.subthy o pairself theory_of_thm
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
   799
fun thm_less p = thm_less_eq p andalso not (thm_less_eq (swap p))
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
   800
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   801
val freezeT = Type.legacy_freeze_type
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   802
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   803
fun freeze (t $ u) = freeze t $ freeze u
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   804
  | freeze (Abs (s, T, t)) = Abs (s, freezeT T, freeze t)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   805
  | freeze (Var ((s, _), T)) = Free (s, freezeT T)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   806
  | freeze (Const (s, T)) = Const (s, freezeT T)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   807
  | freeze (Free (s, T)) = Free (s, freezeT T)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   808
  | freeze t = t
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   809
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   810
fun goal_of_thm thy = prop_of #> freeze #> cterm_of thy #> Goal.init
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   811
54141
f57f8e7a879f generate a comment storing the goal nickname in "learn_prover"
blanchet
parents: 54140
diff changeset
   812
fun run_prover_for_mash ctxt params prover goal_name facts goal =
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   813
  let
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   814
    val problem =
54141
f57f8e7a879f generate a comment storing the goal nickname in "learn_prover"
blanchet
parents: 54140
diff changeset
   815
      {comment = "Goal: " ^ goal_name, state = Proof.init ctxt, goal = goal, subgoal = 1,
f57f8e7a879f generate a comment storing the goal nickname in "learn_prover"
blanchet
parents: 54140
diff changeset
   816
       subgoal_count = 1, factss = [("", facts)]}
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   817
  in
54503
blanchet
parents: 54432
diff changeset
   818
    get_minimizing_prover ctxt MaSh (K ()) prover params (K (K (K ""))) problem
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   819
  end
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   820
48326
ef800e91d072 removed debugging output
blanchet
parents: 48325
diff changeset
   821
val bad_types = [@{type_name prop}, @{type_name bool}, @{type_name fun}]
ef800e91d072 removed debugging output
blanchet
parents: 48325
diff changeset
   822
53086
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   823
val pat_tvar_prefix = "_"
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   824
val pat_var_prefix = "_"
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   825
53089
a58b3b8631c6 keep long names to stay on the safe side
blanchet
parents: 53086
diff changeset
   826
(* try "Long_Name.base_name" for shorter names *)
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   827
fun massage_long_name s = s
53083
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   828
57006
blanchet
parents: 57005
diff changeset
   829
val crude_str_of_sort = space_implode ":" o map massage_long_name o subtract (op =) @{sort type}
53086
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   830
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   831
fun crude_str_of_typ (Type (s, [])) = massage_long_name s
57006
blanchet
parents: 57005
diff changeset
   832
  | crude_str_of_typ (Type (s, Ts)) = massage_long_name s ^ implode (map crude_str_of_typ Ts)
53083
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   833
  | crude_str_of_typ (TFree (_, S)) = crude_str_of_sort S
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   834
  | crude_str_of_typ (TVar (_, S)) = crude_str_of_sort S
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   835
53128
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   836
fun maybe_singleton_str _ "" = []
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   837
  | maybe_singleton_str pref s = [pref ^ s]
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   838
53148
c898409d8630 fixed subtle bug with "take" + thread overlord through
blanchet
parents: 53142
diff changeset
   839
val max_pat_breadth = 10 (* FUDGE *)
50585
306c7b807e13 contain exponential explosion of term patterns
blanchet
parents: 50584
diff changeset
   840
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   841
fun term_features_of ctxt thy_name term_max_depth type_max_depth ts =
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   842
  let
50392
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   843
    val thy = Proof_Context.theory_of ctxt
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   844
50393
d8aa26c78327 record free variables as a MaSh feature
blanchet
parents: 50392
diff changeset
   845
    val fixes = map snd (Variable.dest_fixes ctxt)
50392
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   846
    val classes = Sign.classes_of thy
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   847
48304
50e64af9c829 more work on MaSh
blanchet
parents: 48303
diff changeset
   848
    fun add_classes @{sort type} = I
50392
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   849
      | add_classes S =
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   850
        fold (`(Sorts.super_classes classes)
57006
blanchet
parents: 57005
diff changeset
   851
          #> swap #> op ::
blanchet
parents: 57005
diff changeset
   852
          #> subtract (op =) @{sort type} #> map massage_long_name
blanchet
parents: 57005
diff changeset
   853
          #> map class_feature_of
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   854
          #> union (op =)) S
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   855
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   856
    fun pattify_type 0 _ = []
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   857
      | pattify_type _ (Type (s, [])) =
53086
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   858
        if member (op =) bad_types s then [] else [massage_long_name s]
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   859
      | pattify_type depth (Type (s, U :: Ts)) =
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   860
        let
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   861
          val T = Type (s, Ts)
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   862
          val ps = take max_pat_breadth (pattify_type depth T)
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   863
          val qs = take max_pat_breadth ("" :: pattify_type (depth - 1) U)
57006
blanchet
parents: 57005
diff changeset
   864
        in
blanchet
parents: 57005
diff changeset
   865
          map_product (fn p => fn "" => p | q => p ^ "(" ^ q ^ ")") ps qs
blanchet
parents: 57005
diff changeset
   866
        end
53128
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   867
      | pattify_type _ (TFree (_, S)) =
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   868
        maybe_singleton_str pat_tvar_prefix (crude_str_of_sort S)
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   869
      | pattify_type _ (TVar (_, S)) =
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   870
        maybe_singleton_str pat_tvar_prefix (crude_str_of_sort S)
57006
blanchet
parents: 57005
diff changeset
   871
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   872
    fun add_type_pat depth T =
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   873
      union (op =) (map type_feature_of (pattify_type depth T))
57006
blanchet
parents: 57005
diff changeset
   874
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   875
    fun add_type_pats 0 _ = I
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   876
      | add_type_pats depth t = add_type_pat depth t #> add_type_pats (depth - 1) t
57006
blanchet
parents: 57005
diff changeset
   877
53083
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   878
    fun add_type T =
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   879
      add_type_pats type_max_depth T
53156
f79f4693868b minor MaSh fix
blanchet
parents: 53155
diff changeset
   880
      #> fold_atyps_sorts (add_classes o snd) T
57006
blanchet
parents: 57005
diff changeset
   881
53084
877f5c28016f add subtypes as well as features in MaSh
blanchet
parents: 53083
diff changeset
   882
    fun add_subtypes (T as Type (_, Ts)) = add_type T #> fold add_subtypes Ts
877f5c28016f add subtypes as well as features in MaSh
blanchet
parents: 53083
diff changeset
   883
      | add_subtypes T = add_type T
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   884
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   885
    fun pattify_term _ 0 _ = []
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   886
      | pattify_term _ _ (Const (s, _)) =
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   887
        if is_widely_irrelevant_const s then [] else [massage_long_name s]
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   888
      | pattify_term _ _ (Free (s, T)) =
53128
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   889
        maybe_singleton_str pat_var_prefix (crude_str_of_typ T)
53090
1426c97311f2 treat frees as both consts and vars, for more hits
blanchet
parents: 53089
diff changeset
   890
        |> (if member (op =) fixes s then
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   891
              cons (free_feature_of (massage_long_name (thy_name ^ Long_Name.separator ^ s)))
53090
1426c97311f2 treat frees as both consts and vars, for more hits
blanchet
parents: 53089
diff changeset
   892
            else
1426c97311f2 treat frees as both consts and vars, for more hits
blanchet
parents: 53089
diff changeset
   893
              I)
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   894
      | pattify_term _ _ (Var (_, T)) = maybe_singleton_str pat_var_prefix (crude_str_of_typ T)
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   895
      | pattify_term Ts _ (Bound j) =
54695
a9efdf970720 generate problems with type classes
blanchet
parents: 54693
diff changeset
   896
        maybe_singleton_str pat_var_prefix (crude_str_of_typ (nth Ts j))
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   897
      | pattify_term Ts depth (t $ u) =
50339
d8dae91f3107 MaSh improvements: deeper patterns + more respect for chained facts
blanchet
parents: 50338
diff changeset
   898
        let
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   899
          val ps = take max_pat_breadth (pattify_term Ts depth t)
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   900
          val qs = take max_pat_breadth ("" :: pattify_term Ts (depth - 1) u)
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   901
        in
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   902
          map_product (fn p => fn "" => p | q => p ^ "(" ^ q ^ ")") ps qs
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   903
        end
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   904
      | pattify_term _ _ _ = []
57006
blanchet
parents: 57005
diff changeset
   905
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   906
    fun add_term_pat Ts = union (op =) oo pattify_term Ts
57006
blanchet
parents: 57005
diff changeset
   907
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   908
    fun add_term_pats _ 0 _ = I
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   909
      | add_term_pats Ts depth t = add_term_pat Ts depth t #> add_term_pats Ts (depth - 1) t
57006
blanchet
parents: 57005
diff changeset
   910
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   911
    fun add_term Ts = add_term_pats Ts term_max_depth
57006
blanchet
parents: 57005
diff changeset
   912
53085
15483854c83e handle Bounds as well in MaSh features
blanchet
parents: 53084
diff changeset
   913
    fun add_subterms Ts t =
55286
blanchet
parents: 55202
diff changeset
   914
      (case strip_comb t of
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   915
        (Const (s, T), args) =>
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   916
        (not (is_widely_irrelevant_const s) ? add_term Ts t)
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   917
        #> add_subtypes T #> fold (add_subterms Ts) args
50857
80768e28c9ee better handlig of built-ins -- at the top-level, not in subterms
blanchet
parents: 50841
diff changeset
   918
      | (head, args) =>
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   919
        (case head of
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   920
           Free (_, T) => add_term Ts t #> add_subtypes T
53084
877f5c28016f add subtypes as well as features in MaSh
blanchet
parents: 53083
diff changeset
   921
         | Var (_, T) => add_subtypes T
53085
15483854c83e handle Bounds as well in MaSh features
blanchet
parents: 53084
diff changeset
   922
         | Abs (_, T, body) => add_subtypes T #> add_subterms (T :: Ts) body
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   923
         | _ => I)
55286
blanchet
parents: 55202
diff changeset
   924
        #> fold (add_subterms Ts) args)
57006
blanchet
parents: 57005
diff changeset
   925
  in
blanchet
parents: 57005
diff changeset
   926
    fold (add_subterms []) ts []
blanchet
parents: 57005
diff changeset
   927
  end
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   928
53085
15483854c83e handle Bounds as well in MaSh features
blanchet
parents: 53084
diff changeset
   929
val term_max_depth = 2
53155
2c585fdbe197 eliminated some needless MaSh features
blanchet
parents: 53154
diff changeset
   930
val type_max_depth = 1
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   931
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   932
(* TODO: Generate type classes for types? *)
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   933
fun features_of ctxt thy (scope, _) ts =
50393
d8aa26c78327 record free variables as a MaSh feature
blanchet
parents: 50392
diff changeset
   934
  let val thy_name = Context.theory_name thy in
d8aa26c78327 record free variables as a MaSh feature
blanchet
parents: 50392
diff changeset
   935
    thy_feature_of thy_name ::
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   936
    term_features_of ctxt thy_name term_max_depth type_max_depth ts
50393
d8aa26c78327 record free variables as a MaSh feature
blanchet
parents: 50392
diff changeset
   937
    |> scope <> Global ? cons local_feature
d8aa26c78327 record free variables as a MaSh feature
blanchet
parents: 50392
diff changeset
   938
  end
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   939
57006
blanchet
parents: 57005
diff changeset
   940
(* Too many dependencies is a sign that a decision procedure is at work. There is not much to learn
blanchet
parents: 57005
diff changeset
   941
   from such proofs. *)
50434
960a3429615c more MaSh tweaking -- in particular, export the same facts in "MaSh_Export" as are later tried in "MaSh_Eval"
blanchet
parents: 50412
diff changeset
   942
val max_dependencies = 20
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   943
54125
420b876ff1e2 if slicing is disabled, don't enforce last slice's "max_facts", but rather the maximum "max_facts"
blanchet
parents: 54123
diff changeset
   944
val prover_default_max_facts = 25
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   945
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   946
(* "type_definition_xxx" facts are characterized by their use of "CollectI". *)
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   947
val typedef_dep = nickname_of_thm @{thm CollectI}
57006
blanchet
parents: 57005
diff changeset
   948
(* Mysterious parts of the class machinery create lots of proofs that refer exclusively to
blanchet
parents: 57005
diff changeset
   949
   "someI_ex" (and to some internal constructions). *)
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   950
val class_some_dep = nickname_of_thm @{thm someI_ex}
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   951
50828
91e6836bb68b don't learn from the proof of "psimps" etc.
blanchet
parents: 50827
diff changeset
   952
val fundef_ths =
57006
blanchet
parents: 57005
diff changeset
   953
  @{thms fundef_ex1_existence fundef_ex1_uniqueness fundef_ex1_iff fundef_default_value}
50828
91e6836bb68b don't learn from the proof of "psimps" etc.
blanchet
parents: 50827
diff changeset
   954
  |> map nickname_of_thm
91e6836bb68b don't learn from the proof of "psimps" etc.
blanchet
parents: 50827
diff changeset
   955
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   956
(* "Rep_xxx_inject", "Abs_xxx_inverse", etc., are derived using these facts. *)
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   957
val typedef_ths =
57006
blanchet
parents: 57005
diff changeset
   958
  @{thms type_definition.Abs_inverse type_definition.Rep_inverse type_definition.Rep
blanchet
parents: 57005
diff changeset
   959
      type_definition.Rep_inject type_definition.Abs_inject type_definition.Rep_cases
blanchet
parents: 57005
diff changeset
   960
      type_definition.Abs_cases type_definition.Rep_induct type_definition.Abs_induct
blanchet
parents: 57005
diff changeset
   961
      type_definition.Rep_range type_definition.Abs_image}
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
   962
  |> map nickname_of_thm
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   963
48441
2d2f009ca8eb remove MaSh junk associated with size functions
blanchet
parents: 48440
diff changeset
   964
fun is_size_def [dep] th =
55642
63beb38e9258 adapted to renaming of datatype 'cases' and 'recs' to 'case' and 'rec'
blanchet
parents: 55286
diff changeset
   965
    (case first_field ".rec" dep of
57006
blanchet
parents: 57005
diff changeset
   966
      SOME (pref, _) =>
blanchet
parents: 57005
diff changeset
   967
      (case first_field ".size" (nickname_of_thm th) of
blanchet
parents: 57005
diff changeset
   968
        SOME (pref', _) => pref = pref'
blanchet
parents: 57005
diff changeset
   969
      | NONE => false)
blanchet
parents: 57005
diff changeset
   970
    | NONE => false)
48441
2d2f009ca8eb remove MaSh junk associated with size functions
blanchet
parents: 48440
diff changeset
   971
  | is_size_def _ _ = false
2d2f009ca8eb remove MaSh junk associated with size functions
blanchet
parents: 48440
diff changeset
   972
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
   973
fun trim_dependencies deps =
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   974
  if length deps > max_dependencies then NONE else SOME deps
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   975
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   976
fun isar_dependencies_of name_tabs th =
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   977
  thms_in_proof max_dependencies (SOME name_tabs) th
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   978
  |> Option.map (fn deps =>
57006
blanchet
parents: 57005
diff changeset
   979
    if deps = [typedef_dep] orelse deps = [class_some_dep] orelse
blanchet
parents: 57005
diff changeset
   980
       exists (member (op =) fundef_ths) deps orelse exists (member (op =) typedef_ths) deps orelse
50828
91e6836bb68b don't learn from the proof of "psimps" etc.
blanchet
parents: 50827
diff changeset
   981
       is_size_def deps th then
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   982
      []
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   983
    else
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   984
      deps)
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
   985
57006
blanchet
parents: 57005
diff changeset
   986
fun prover_dependencies_of ctxt (params as {verbose, max_facts, ...}) prover auto_level facts
blanchet
parents: 57005
diff changeset
   987
    name_tabs th =
55286
blanchet
parents: 55202
diff changeset
   988
  (case isar_dependencies_of name_tabs th of
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   989
    SOME [] => (false, [])
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   990
  | isar_deps0 =>
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   991
    let
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   992
      val isar_deps = these isar_deps0
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   993
      val thy = Proof_Context.theory_of ctxt
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   994
      val goal = goal_of_thm thy th
54141
f57f8e7a879f generate a comment storing the goal nickname in "learn_prover"
blanchet
parents: 54140
diff changeset
   995
      val name = nickname_of_thm th
52196
2281f33e8da6 redid rac7830871177 to avoid duplicate fixed variable (e.g. lemma "P (a::nat)" proof - have "!!a::int. Q a" sledgehammer [e])
blanchet
parents: 52125
diff changeset
   996
      val (_, hyp_ts, concl_t) = ATP_Util.strip_subgoal goal 1 ctxt
51136
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
   997
      val facts = facts |> filter (fn (_, th') => thm_less (th', th))
57006
blanchet
parents: 57005
diff changeset
   998
51004
5f2788c38127 distinguish raw and non-raw facts, using raw for 10 000s of facts and non-raw after selection of some hundreds
blanchet
parents: 51003
diff changeset
   999
      fun nickify ((_, stature), th) = ((nickname_of_thm th, stature), th)
57006
blanchet
parents: 57005
diff changeset
  1000
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
  1001
      fun is_dep dep (_, th) = nickname_of_thm th = dep
57006
blanchet
parents: 57005
diff changeset
  1002
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1003
      fun add_isar_dep facts dep accum =
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1004
        if exists (is_dep dep) accum then
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1005
          accum
55286
blanchet
parents: 55202
diff changeset
  1006
        else
blanchet
parents: 55202
diff changeset
  1007
          (case find_first (is_dep dep) facts of
blanchet
parents: 55202
diff changeset
  1008
            SOME ((_, status), th) => accum @ [(("", status), th)]
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1009
          | NONE => accum (* should not happen *))
57006
blanchet
parents: 57005
diff changeset
  1010
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1011
      val mepo_facts =
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
  1012
        facts
54095
d80743f28fec simplify fudge factor code
blanchet
parents: 54091
diff changeset
  1013
        |> mepo_suggested_facts ctxt params (max_facts |> the_default prover_default_max_facts) NONE
d80743f28fec simplify fudge factor code
blanchet
parents: 54091
diff changeset
  1014
             hyp_ts concl_t
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1015
      val facts =
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1016
        mepo_facts
50754
74a6adcb96ac also generate queries for goals with too many Isar dependencies
blanchet
parents: 50751
diff changeset
  1017
        |> fold (add_isar_dep facts) isar_deps
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
  1018
        |> map nickify
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1019
      val num_isar_deps = length isar_deps
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1020
    in
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1021
      if verbose andalso auto_level = 0 then
57017
blanchet
parents: 57014
diff changeset
  1022
        Output.urgent_message ("MaSh: " ^ quote prover ^ " on " ^ quote name ^ " with " ^
blanchet
parents: 57014
diff changeset
  1023
          string_of_int num_isar_deps ^ " + " ^ string_of_int (length facts - num_isar_deps) ^
blanchet
parents: 57014
diff changeset
  1024
          " facts.")
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1025
      else
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1026
        ();
55286
blanchet
parents: 55202
diff changeset
  1027
      (case run_prover_for_mash ctxt params prover name facts goal of
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1028
        {outcome = NONE, used_facts, ...} =>
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1029
        (if verbose andalso auto_level = 0 then
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1030
           let val num_facts = length used_facts in
57017
blanchet
parents: 57014
diff changeset
  1031
             Output.urgent_message ("Found proof with " ^ string_of_int num_facts ^ " fact" ^
blanchet
parents: 57014
diff changeset
  1032
               plural_s num_facts ^ ".")
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1033
           end
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1034
         else
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1035
           ();
50754
74a6adcb96ac also generate queries for goals with too many Isar dependencies
blanchet
parents: 50751
diff changeset
  1036
         (true, map fst used_facts))
55286
blanchet
parents: 55202
diff changeset
  1037
      | _ => (false, isar_deps))
blanchet
parents: 55202
diff changeset
  1038
    end)
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1039
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1040
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1041
(*** High-level communication with MaSh ***)
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1042
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1043
(* In the following functions, chunks are risers w.r.t. "thm_less_eq". *)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1044
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1045
fun chunks_and_parents_for chunks th =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1046
  let
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1047
    fun insert_parent new parents =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1048
      let val parents = parents |> filter_out (fn p => thm_less_eq (p, new)) in
57096
e4074b91b2a6 always remove duplicates in meshing + use weights for Naive Bayes
blanchet
parents: 57095
diff changeset
  1049
        parents |> forall (fn p => not (thm_less_eq (new, p))) parents ? cons new
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1050
      end
57006
blanchet
parents: 57005
diff changeset
  1051
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1052
    fun rechunk seen (rest as th' :: ths) =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1053
      if thm_less_eq (th', th) then (rev seen, rest)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1054
      else rechunk (th' :: seen) ths
57006
blanchet
parents: 57005
diff changeset
  1055
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1056
    fun do_chunk [] accum = accum
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1057
      | do_chunk (chunk as hd_chunk :: _) (chunks, parents) =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1058
        if thm_less_eq (hd_chunk, th) then
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1059
          (chunk :: chunks, insert_parent hd_chunk parents)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1060
        else if thm_less_eq (List.last chunk, th) then
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1061
          let val (front, back as hd_back :: _) = rechunk [] chunk in
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1062
            (front :: back :: chunks, insert_parent hd_back parents)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1063
          end
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1064
        else
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1065
          (chunk :: chunks, parents)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1066
  in
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1067
    fold_rev do_chunk chunks ([], [])
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1068
    |>> cons []
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1069
    ||> map nickname_of_thm
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1070
  end
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1071
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1072
fun attach_parents_to_facts _ [] = []
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1073
  | attach_parents_to_facts old_facts (facts as (_, th) :: _) =
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1074
    let
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1075
      fun do_facts _ [] = []
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1076
        | do_facts (_, parents) [fact] = [(parents, fact)]
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1077
        | do_facts (chunks, parents)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1078
                   ((fact as (_, th)) :: (facts as (_, th') :: _)) =
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1079
          let
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1080
            val chunks = app_hd (cons th) chunks
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1081
            val chunks_and_parents' =
57370
9d420da6c7e2 disable 'extra' feature tainting for now
blanchet
parents: 57369
diff changeset
  1082
              if thm_less_eq (th, th') andalso thy_name_of_thm th = thy_name_of_thm th' then
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1083
                (chunks, [nickname_of_thm th])
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1084
              else
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1085
                chunks_and_parents_for chunks th'
57006
blanchet
parents: 57005
diff changeset
  1086
          in
blanchet
parents: 57005
diff changeset
  1087
            (parents, fact) :: do_facts chunks_and_parents' facts
blanchet
parents: 57005
diff changeset
  1088
          end
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1089
    in
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1090
      old_facts @ facts
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1091
      |> do_facts (chunks_and_parents_for [[]] th)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1092
      |> drop (length old_facts)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1093
    end
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1094
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1095
fun maximal_wrt_graph G keys =
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1096
  let
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1097
    val tab = Symtab.empty |> fold (fn name => Symtab.default (name, ())) keys
57006
blanchet
parents: 57005
diff changeset
  1098
57012
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1099
    fun insert_new seen name = not (Symtab.defined seen name) ? insert (op =) name
57006
blanchet
parents: 57005
diff changeset
  1100
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1101
    fun num_keys keys = Graph.Keys.fold (K (Integer.add 1)) keys 0
57006
blanchet
parents: 57005
diff changeset
  1102
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1103
    fun find_maxes _ (maxs, []) = map snd maxs
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1104
      | find_maxes seen (maxs, new :: news) =
57012
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1105
        find_maxes (seen |> num_keys (Graph.imm_succs G new) > 1 ? Symtab.default (new, ()))
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1106
          (if Symtab.defined tab new then
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1107
             let
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1108
               val newp = Graph.all_preds G [new]
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1109
               fun is_ancestor x yp = member (op =) yp x
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1110
               val maxs = maxs |> filter (fn (_, max) => not (is_ancestor max newp))
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1111
             in
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1112
               if exists (is_ancestor new o fst) maxs then (maxs, news)
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1113
               else ((newp, new) :: filter_out (fn (_, max) => is_ancestor max newp) maxs, news)
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1114
             end
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1115
           else
43fd82a537a3 cleaner handling of learned proofs
blanchet
parents: 57011
diff changeset
  1116
             (maxs, Graph.Keys.fold (insert_new seen) (Graph.imm_preds G new) news))
57006
blanchet
parents: 57005
diff changeset
  1117
  in
blanchet
parents: 57005
diff changeset
  1118
    find_maxes Symtab.empty ([], Graph.maximals G)
blanchet
parents: 57005
diff changeset
  1119
  end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1120
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1121
fun maximal_wrt_access_graph _ [] = []
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1122
  | maximal_wrt_access_graph access_G ((fact as (_, th)) :: facts) =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1123
    let val thy = theory_of_thm th in
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1124
      fact :: filter_out (fn (_, th') => Theory.subthy (theory_of_thm th', thy)) facts
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1125
      |> map (nickname_of_thm o snd)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1126
      |> maximal_wrt_graph access_G
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1127
    end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1128
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1129
fun is_fact_in_graph access_G = can (Graph.get_node access_G) o nickname_of_thm
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1130
53197
6c5e7143e1f6 tuned fudge factor in light of evaluation
blanchet
parents: 53159
diff changeset
  1131
val chained_feature_factor = 0.5 (* FUDGE *)
57405
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1132
val extra_feature_factor = 0.1 (* FUDGE *)
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1133
val num_extra_feature_facts = 10 (* FUDGE *)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1134
57386
5980ee29dbf6 always expand all paths
blanchet
parents: 57385
diff changeset
  1135
val max_proximity_facts = 100
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1136
54060
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1137
fun find_mash_suggestions ctxt max_facts suggs facts chained raw_unknown =
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1138
  let
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1139
    val inter_fact = inter (eq_snd Thm.eq_thm_prop)
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1140
    val raw_mash = find_suggested_facts ctxt facts suggs
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1141
    val proximate = take max_proximity_facts facts
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1142
    val unknown_chained = inter_fact raw_unknown chained
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1143
    val unknown_proximate = inter_fact raw_unknown proximate
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1144
    val mess =
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1145
      [(0.9 (* FUDGE *), (map (rpair 1.0) unknown_chained, [])),
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1146
       (0.4 (* FUDGE *), (weight_facts_smoothly unknown_proximate, [])),
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1147
       (0.1 (* FUDGE *), (weight_facts_steeply raw_mash, raw_unknown))]
57007
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
  1148
    val unknown = raw_unknown
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
  1149
      |> fold (subtract (eq_snd Thm.eq_thm_prop)) [unknown_chained, unknown_proximate]
57006
blanchet
parents: 57005
diff changeset
  1150
  in
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
  1151
    (mesh_facts (eq_snd (gen_eq_thm ctxt)) max_facts mess, unknown)
57006
blanchet
parents: 57005
diff changeset