src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML
author blanchet
Sun, 14 Aug 2016 12:26:09 +0200
changeset 63696 af310e622d64
parent 63695 9ad6a63cc8bd
child 63697 0afe49623cf9
permissions -rw-r--r--
tuned MaSh's metacharacters to avoid needless decoding
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
48380
d4b7c7be3116 renamed ML files
blanchet
parents: 48379
diff changeset
     1
(*  Title:      HOL/Tools/Sledgehammer/sledgehammer_mash.ML
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     2
    Author:     Jasmin Blanchette, TU Muenchen
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
     3
    Author:     Cezary Kaliszyk, University of Innsbruck
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     4
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     5
Sledgehammer's machine-learning-based relevance filter (MaSh).
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     6
*)
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     7
48381
1b7d798460bb renamed ML structures
blanchet
parents: 48380
diff changeset
     8
signature SLEDGEHAMMER_MASH =
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
     9
sig
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    10
  type stature = ATP_Problem_Generate.stature
51004
5f2788c38127 distinguish raw and non-raw facts, using raw for 10 000s of facts and non-raw after selection of some hundreds
blanchet
parents: 51003
diff changeset
    11
  type raw_fact = Sledgehammer_Fact.raw_fact
48296
e7f01b7e244e gracefully handle the case of empty theories when going up the accessibility chain
blanchet
parents: 48293
diff changeset
    12
  type fact = Sledgehammer_Fact.fact
e7f01b7e244e gracefully handle the case of empty theories when going up the accessibility chain
blanchet
parents: 48293
diff changeset
    13
  type fact_override = Sledgehammer_Fact.fact_override
55201
1ee776da8da7 renamed ML file
blanchet
parents: 54816
diff changeset
    14
  type params = Sledgehammer_Prover.params
1ee776da8da7 renamed ML file
blanchet
parents: 54816
diff changeset
    15
  type prover_result = Sledgehammer_Prover.prover_result
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    16
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
    17
  val trace : bool Config.T
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
    18
  val duplicates : bool Config.T
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
    19
  val MePoN : string
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
    20
  val MaShN : string
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
    21
  val MeShN : string
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
    22
  val mepoN : string
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
    23
  val mashN : string
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
    24
  val meshN : string
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
    25
  val unlearnN : string
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
    26
  val learn_isarN : string
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
    27
  val learn_proverN : string
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
    28
  val relearn_isarN : string
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
    29
  val relearn_proverN : string
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
    30
  val fact_filters : string list
50826
18ace05656cf start using MaSh hints
blanchet
parents: 50825
diff changeset
    31
  val encode_str : string -> string
18ace05656cf start using MaSh hints
blanchet
parents: 50825
diff changeset
    32
  val encode_strs : string list -> string
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
    33
  val decode_str : string -> string
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
    34
  val decode_strs : string -> string list
50632
12c097ff3241 slightly more elegant naming convention (to keep low-level and high-level APIs separated)
blanchet
parents: 50631
diff changeset
    35
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
    36
  datatype mash_algorithm =
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
    37
    MaSh_NB
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
    38
  | MaSh_kNN
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
    39
  | MaSh_NB_kNN
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
    40
  | MaSh_NB_Ext
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    41
  | MaSh_kNN_Ext
57106
52e1e424eec1 export more ML functions, for experimentation
blanchet
parents: 57104
diff changeset
    42
57120
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    43
  val is_mash_enabled : unit -> bool
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
    44
  val the_mash_algorithm : unit -> mash_algorithm
57557
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
    45
  val str_of_mash_algorithm : mash_algorithm -> string
57120
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    46
61322
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
    47
  val mesh_facts : ('a list -> 'a list) -> ('a * 'a -> bool) -> int ->
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
    48
    (real * (('a * real) list * 'a list)) list -> 'a list
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
    49
  val nickname_of_thm : thm -> string
57006
blanchet
parents: 57005
diff changeset
    50
  val find_suggested_facts : Proof.context -> ('b * thm) list -> string list -> ('b * thm) list
60641
f6e8293747fd clarified context;
wenzelm
parents: 60640
diff changeset
    51
  val crude_thm_ord : Proof.context -> thm * thm -> order
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
    52
  val thm_less : thm * thm -> bool
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    53
  val goal_of_thm : theory -> thm -> thm
57006
blanchet
parents: 57005
diff changeset
    54
  val run_prover_for_mash : Proof.context -> params -> string -> string -> fact list -> thm ->
blanchet
parents: 57005
diff changeset
    55
    prover_result
60640
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
    56
  val features_of : Proof.context -> string -> stature -> term list -> string list
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
    57
  val trim_dependencies : string list -> string list option
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
    58
  val isar_dependencies_of : string Symtab.table * string Symtab.table -> thm -> string list option
57006
blanchet
parents: 57005
diff changeset
    59
  val prover_dependencies_of : Proof.context -> params -> string -> int -> raw_fact list ->
blanchet
parents: 57005
diff changeset
    60
    string Symtab.table * string Symtab.table -> thm -> bool * string list
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
    61
  val attach_parents_to_facts : ('a * thm) list -> ('a * thm) list ->
57006
blanchet
parents: 57005
diff changeset
    62
    (string list * ('a * thm)) list
53140
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
    63
  val num_extra_feature_facts : int
53141
d27e99a6a679 take chained and proximate facts into consideration when computing MaSh features
blanchet
parents: 53140
diff changeset
    64
  val extra_feature_factor : real
53140
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
    65
  val weight_facts_smoothly : 'a list -> ('a * real) list
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
    66
  val weight_facts_steeply : 'a list -> ('a * real) list
57432
blanchet
parents: 57431
diff changeset
    67
  val find_mash_suggestions : Proof.context -> int -> string list -> ('a * thm) list ->
blanchet
parents: 57431
diff changeset
    68
    ('a * thm) list -> ('a * thm) list -> ('a * thm) list * ('a * thm) list
60649
e007aa6a8aa2 more explicit use of context and elimination of Thm.theory_of_thm, although unclear (and untested?) situations remain;
wenzelm
parents: 60641
diff changeset
    69
  val mash_suggested_facts : Proof.context -> string -> params -> int -> term list -> term ->
57407
140e16bc2a40 use right theory name for theorems in evaluation driver
blanchet
parents: 57406
diff changeset
    70
    raw_fact list -> fact list * fact list
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    71
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    72
  val mash_unlearn : unit -> unit
54503
blanchet
parents: 54432
diff changeset
    73
  val mash_learn_proof : Proof.context -> params -> term -> ('a * thm) list -> thm list -> unit
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
    74
  val mash_learn_facts : Proof.context -> params -> string -> int -> bool -> Time.time ->
57120
f8112c4b9cb8 extend exporter with new versions of MaSh
blanchet
parents: 57108
diff changeset
    75
    raw_fact list -> string
57006
blanchet
parents: 57005
diff changeset
    76
  val mash_learn : Proof.context -> params -> fact_override -> thm list -> bool -> unit
57432
blanchet
parents: 57431
diff changeset
    77
  val mash_can_suggest_facts : Proof.context -> bool
53819
e55d641d0a70 honor MaSh's zero-overhead policy -- no learning if the tool is disabled
blanchet
parents: 53790
diff changeset
    78
57108
dc0b4f50e288 more generous max number of suggestions, for more safety
blanchet
parents: 57107
diff changeset
    79
  val generous_max_suggestions : int -> int
50814
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
    80
  val mepo_weight : real
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
    81
  val mash_weight : real
57006
blanchet
parents: 57005
diff changeset
    82
  val relevant_facts : Proof.context -> params -> string -> int -> fact_override -> term list ->
blanchet
parents: 57005
diff changeset
    83
    term -> raw_fact list -> (string * fact list) list
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
    84
end;
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
    85
48381
1b7d798460bb renamed ML structures
blanchet
parents: 48380
diff changeset
    86
structure Sledgehammer_MaSh : SLEDGEHAMMER_MASH =
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
    87
struct
48249
2bd242c56c90 dummy implementation
blanchet
parents: 48248
diff changeset
    88
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    89
open ATP_Util
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    90
open ATP_Problem_Generate
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    91
open Sledgehammer_Util
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    92
open Sledgehammer_Fact
55201
1ee776da8da7 renamed ML file
blanchet
parents: 54816
diff changeset
    93
open Sledgehammer_Prover
55202
824c48a539c9 renamed many Sledgehammer ML files to clarify structure
blanchet
parents: 55201
diff changeset
    94
open Sledgehammer_Prover_Minimize
48381
1b7d798460bb renamed ML structures
blanchet
parents: 48380
diff changeset
    95
open Sledgehammer_MePo
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
    96
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
    97
val anonymous_proof_prefix = "."
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
    98
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
    99
val trace = Attrib.setup_config_bool @{binding sledgehammer_mash_trace} (K false)
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
   100
val duplicates = Attrib.setup_config_bool @{binding sledgehammer_fact_duplicates} (K false)
51032
69da236d7838 added option to use SNoW as machine learning algo
blanchet
parents: 51029
diff changeset
   101
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
   102
fun trace_msg ctxt msg = if Config.get ctxt trace then tracing (msg ()) else ()
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
   103
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
   104
fun gen_eq_thm ctxt = if Config.get ctxt duplicates then Thm.eq_thm_strict else Thm.eq_thm_prop
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
   105
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
   106
val MePoN = "MePo"
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
   107
val MaShN = "MaSh"
51008
e096c0dc538b more precise output of selected facts
blanchet
parents: 51007
diff changeset
   108
val MeShN = "MeSh"
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
   109
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
   110
val mepoN = "mepo"
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
   111
val mashN = "mash"
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
   112
val meshN = "mesh"
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
   113
48379
2b5ad61e2ccc renamed "iter" fact filter to "MePo" (Meng--Paulson)
blanchet
parents: 48378
diff changeset
   114
val fact_filters = [meshN, mepoN, mashN]
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
   115
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   116
val unlearnN = "unlearn"
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   117
val learn_isarN = "learn_isar"
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   118
val learn_proverN = "learn_prover"
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   119
val relearn_isarN = "relearn_isar"
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   120
val relearn_proverN = "relearn_prover"
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   121
57368
blanchet
parents: 57367
diff changeset
   122
fun map_array_at ary f i = Array.update (ary, i, f (Array.sub (ary, i)))
blanchet
parents: 57367
diff changeset
   123
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   124
type xtab = int * int Symtab.table
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   125
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   126
val empty_xtab = (0, Symtab.empty)
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   127
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   128
fun add_to_xtab key (next, tab) = (next + 1, Symtab.update_new (key, next) tab)
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   129
fun maybe_add_to_xtab key = perhaps (try (add_to_xtab key))
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   130
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   131
fun state_file () = Path.expand (Path.explode "$ISABELLE_HOME_USER/mash_state")
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   132
val remove_state_file = try File.rm o state_file
48330
192444b53e86 speed up MaSh queries
blanchet
parents: 48329
diff changeset
   133
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   134
datatype mash_algorithm =
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   135
  MaSh_NB
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   136
| MaSh_kNN
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   137
| MaSh_NB_kNN
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   138
| MaSh_NB_Ext
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   139
| MaSh_kNN_Ext
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   140
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   141
fun mash_algorithm () =
61043
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   142
  (case Options.default_string @{system_option MaSh} of
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   143
    "yes" => SOME MaSh_NB_kNN
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   144
  | "sml" => SOME MaSh_NB_kNN
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   145
  | "nb" => SOME MaSh_NB
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   146
  | "knn" => SOME MaSh_kNN
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   147
  | "nb_knn" => SOME MaSh_NB_kNN
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   148
  | "nb_ext" => SOME MaSh_NB_Ext
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   149
  | "knn_ext" => SOME MaSh_kNN_Ext
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   150
  | "none" => NONE
0810068379d8 eliminated obsolete environment variable
blanchet
parents: 60971
diff changeset
   151
  | "" => NONE
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
   152
  | algorithm => (warning ("Unknown MaSh algorithm: " ^ quote algorithm); NONE))
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
   153
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   154
val is_mash_enabled = is_some o mash_algorithm
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   155
val the_mash_algorithm = the_default MaSh_NB_kNN o mash_algorithm
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
   156
57557
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
   157
fun str_of_mash_algorithm MaSh_NB = "nb"
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
   158
  | str_of_mash_algorithm MaSh_kNN = "knn"
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
   159
  | str_of_mash_algorithm MaSh_NB_kNN = "nb_knn"
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
   160
  | str_of_mash_algorithm MaSh_NB_Ext = "nb_ext"
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
   161
  | str_of_mash_algorithm MaSh_kNN_Ext = "knn_ext"
242ce8d3d16b record MaSh algorithm in spying data
blanchet
parents: 57555
diff changeset
   162
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   163
fun scaled_avg [] = 0
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   164
  | scaled_avg xs = Real.ceil (100000000.0 * fold (curry (op +)) xs 0.0) div length xs
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   165
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   166
fun avg [] = 0.0
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   167
  | avg xs = fold (curry (op +)) xs 0.0 / Real.fromInt (length xs)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   168
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   169
fun normalize_scores _ [] = []
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   170
  | normalize_scores max_facts xs =
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
   171
    map (apsnd (curry (op *) (1.0 / avg (map snd (take max_facts xs))))) xs
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   172
61322
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
   173
fun mesh_facts maybe_distinct _ max_facts [(_, (sels, unks))] =
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
   174
    map fst (take max_facts sels) @ take (max_facts - length sels) unks
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
   175
    |> maybe_distinct
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
   176
  | mesh_facts _ fact_eq max_facts mess =
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   177
    let
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   178
      val mess = mess |> map (apsnd (apfst (normalize_scores max_facts)))
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   179
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   180
      fun score_in fact (global_weight, (sels, unks)) =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   181
        let val score_at = try (nth sels) #> Option.map (fn (_, score) => global_weight * score) in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   182
          (case find_index (curry fact_eq fact o fst) sels of
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   183
            ~1 => if member fact_eq unks fact then NONE else SOME 0.0
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   184
          | rank => score_at rank)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   185
        end
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   186
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   187
      fun weight_of fact = mess |> map_filter (score_in fact) |> scaled_avg
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   188
    in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   189
      fold (union fact_eq o map fst o take max_facts o fst o snd) mess []
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
   190
      |> map (`weight_of) |> sort (int_ord o apply2 fst o swap)
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   191
      |> map snd |> take max_facts
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   192
    end
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   193
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   194
fun smooth_weight_of_fact rank = Math.pow (1.3, 15.5 - 0.2 * Real.fromInt rank) + 15.0 (* FUDGE *)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   195
fun steep_weight_of_fact rank = Math.pow (0.62, log2 (Real.fromInt (rank + 1))) (* FUDGE *)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   196
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   197
fun weight_facts_smoothly facts = facts ~~ map smooth_weight_of_fact (0 upto length facts - 1)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   198
fun weight_facts_steeply facts = facts ~~ map steep_weight_of_fact (0 upto length facts - 1)
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   199
57660
86b853448f08 fixed sorting (broken since 9cc802a8ab06)
blanchet
parents: 57658
diff changeset
   200
fun sort_array_suffix cmp needed a =
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   201
  let
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   202
    exception BOTTOM of int
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   203
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   204
    val al = Array.length a
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   205
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   206
    fun maxson l i =
57017
blanchet
parents: 57014
diff changeset
   207
      let val i31 = i + i + i + 1 in
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   208
        if i31 + 2 < l then
57017
blanchet
parents: 57014
diff changeset
   209
          let val x = Unsynchronized.ref i31 in
blanchet
parents: 57014
diff changeset
   210
            if cmp (Array.sub (a, i31), Array.sub (a, i31 + 1)) = LESS then x := i31 + 1 else ();
blanchet
parents: 57014
diff changeset
   211
            if cmp (Array.sub (a, !x), Array.sub (a, i31 + 2)) = LESS then x := i31 + 2 else ();
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   212
            !x
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   213
          end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   214
        else
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   215
          if i31 + 1 < l andalso cmp (Array.sub (a, i31), Array.sub (a, i31 + 1)) = LESS
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   216
          then i31 + 1 else if i31 < l then i31 else raise BOTTOM i
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   217
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   218
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   219
    fun trickledown l i e =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   220
      let val j = maxson l i in
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   221
        if cmp (Array.sub (a, j), e) = GREATER then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   222
          (Array.update (a, i, Array.sub (a, j)); trickledown l j e)
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   223
        else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   224
          Array.update (a, i, e)
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   225
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   226
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   227
    fun trickle l i e = trickledown l i e handle BOTTOM i => Array.update (a, i, e)
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   228
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   229
    fun bubbledown l i =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   230
      let val j = maxson l i in
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   231
        Array.update (a, i, Array.sub (a, j));
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   232
        bubbledown l j
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   233
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   234
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   235
    fun bubble l i = bubbledown l i handle BOTTOM i => i
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   236
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   237
    fun trickleup i e =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   238
      let val father = (i - 1) div 3 in
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   239
        if cmp (Array.sub (a, father), e) = LESS then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   240
          (Array.update (a, i, Array.sub (a, father));
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   241
           if father > 0 then trickleup father e else Array.update (a, 0, e))
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   242
        else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   243
          Array.update (a, i, e)
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   244
      end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   245
57355
blanchet
parents: 57354
diff changeset
   246
    fun for i = if i < 0 then () else (trickle al i (Array.sub (a, i)); for (i - 1))
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   247
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   248
    fun for2 i =
57660
86b853448f08 fixed sorting (broken since 9cc802a8ab06)
blanchet
parents: 57658
diff changeset
   249
      if i < Integer.max 2 (al - needed) then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   250
        ()
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   251
      else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   252
        let val e = Array.sub (a, i) in
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   253
          Array.update (a, i, Array.sub (a, 0));
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   254
          trickleup (bubble i 0) e;
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   255
          for2 (i - 1)
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   256
        end
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   257
  in
57355
blanchet
parents: 57354
diff changeset
   258
    for (((al + 1) div 3) - 1);
blanchet
parents: 57354
diff changeset
   259
    for2 (al - 1);
blanchet
parents: 57354
diff changeset
   260
    if al > 1 then
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   261
      let val e = Array.sub (a, 1) in
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   262
        Array.update (a, 1, Array.sub (a, 0));
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   263
        Array.update (a, 0, e)
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   264
      end
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   265
    else
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   266
      ()
57009
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   267
  end
8cb6a5f1ae84 added SML implementation of MaSh
blanchet
parents: 57007
diff changeset
   268
57660
86b853448f08 fixed sorting (broken since 9cc802a8ab06)
blanchet
parents: 57658
diff changeset
   269
fun rev_sort_list_prefix cmp needed xs =
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   270
  let val ary = Array.fromList xs in
57660
86b853448f08 fixed sorting (broken since 9cc802a8ab06)
blanchet
parents: 57658
diff changeset
   271
    sort_array_suffix cmp needed ary;
86b853448f08 fixed sorting (broken since 9cc802a8ab06)
blanchet
parents: 57658
diff changeset
   272
    Array.foldl (op ::) [] ary
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   273
  end
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   274
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   275
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   276
(*** Isabelle-agnostic machine learning ***)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   277
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   278
structure MaSh =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   279
struct
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
   280
57574
e4ddd52e1b96 made SML/NJ happier
blanchet
parents: 57561
diff changeset
   281
fun select_visible_facts (big_number : real) recommends =
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   282
  List.app (fn at =>
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   283
    let val (j, ov) = Array.sub (recommends, at) in
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   284
      Array.update (recommends, at, (j, big_number + ov))
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   285
    end)
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   286
57374
blanchet
parents: 57373
diff changeset
   287
fun wider_array_of_vector init vec =
blanchet
parents: 57373
diff changeset
   288
  let val ary = Array.array init in
blanchet
parents: 57373
diff changeset
   289
    Array.copyVec {src = vec, dst = ary, di = 0};
blanchet
parents: 57373
diff changeset
   290
    ary
blanchet
parents: 57373
diff changeset
   291
  end
blanchet
parents: 57373
diff changeset
   292
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   293
val nb_def_prior_weight = 1000 (* FUDGE *)
57095
001ec97c3e59 updated naive Bayes
blanchet
parents: 57089
diff changeset
   294
57374
blanchet
parents: 57373
diff changeset
   295
fun learn_facts (tfreq0, sfreq0, dffreq0) num_facts0 num_facts num_feats depss featss =
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   296
  let
57374
blanchet
parents: 57373
diff changeset
   297
    val tfreq = wider_array_of_vector (num_facts, 0) tfreq0
blanchet
parents: 57373
diff changeset
   298
    val sfreq = wider_array_of_vector (num_facts, Inttab.empty) sfreq0
blanchet
parents: 57373
diff changeset
   299
    val dffreq = wider_array_of_vector (num_feats, 0) dffreq0
blanchet
parents: 57373
diff changeset
   300
blanchet
parents: 57373
diff changeset
   301
    fun learn_one th feats deps =
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   302
      let
57355
blanchet
parents: 57354
diff changeset
   303
        fun add_th weight t =
blanchet
parents: 57354
diff changeset
   304
          let
blanchet
parents: 57354
diff changeset
   305
            val im = Array.sub (sfreq, t)
57374
blanchet
parents: 57373
diff changeset
   306
            fun fold_fn s = Inttab.map_default (s, 0) (Integer.add weight)
57355
blanchet
parents: 57354
diff changeset
   307
          in
57368
blanchet
parents: 57367
diff changeset
   308
            map_array_at tfreq (Integer.add weight) t;
57355
blanchet
parents: 57354
diff changeset
   309
            Array.update (sfreq, t, fold fold_fn feats im)
blanchet
parents: 57354
diff changeset
   310
          end
blanchet
parents: 57354
diff changeset
   311
57368
blanchet
parents: 57367
diff changeset
   312
        val add_sym = map_array_at dffreq (Integer.add 1)
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   313
      in
57355
blanchet
parents: 57354
diff changeset
   314
        add_th nb_def_prior_weight th;
blanchet
parents: 57354
diff changeset
   315
        List.app (add_th 1) deps;
blanchet
parents: 57354
diff changeset
   316
        List.app add_sym feats
57354
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   317
      end
ded92100ffd7 refactoring
blanchet
parents: 57353
diff changeset
   318
57102
blanchet
parents: 57101
diff changeset
   319
    fun for i =
57383
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   320
      if i = num_facts then ()
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   321
      else (learn_one i (Vector.sub (featss, i)) (Vector.sub (depss, i)); for (i + 1))
57102
blanchet
parents: 57101
diff changeset
   322
  in
57383
ba0fe0639bc8 right array indexing
blanchet
parents: 57382
diff changeset
   323
    for num_facts0;
57374
blanchet
parents: 57373
diff changeset
   324
    (Array.vector tfreq, Array.vector sfreq, Array.vector dffreq)
57102
blanchet
parents: 57101
diff changeset
   325
  end
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   326
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   327
fun naive_bayes (tfreq, sfreq, dffreq) num_facts max_suggs visible_facts goal_feats =
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   328
  let
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   329
    val tau = 0.2 (* FUDGE *)
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   330
    val pos_weight = 5.0 (* FUDGE *)
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   331
    val def_val = ~18.0 (* FUDGE *)
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   332
    val init_val = 30.0 (* FUDGE *)
57124
e4c2c792226f make SML naive Bayes closer to Python version
blanchet
parents: 57122
diff changeset
   333
57366
blanchet
parents: 57365
diff changeset
   334
    val ln_afreq = Math.ln (Real.fromInt num_facts)
57374
blanchet
parents: 57373
diff changeset
   335
    val idf = Vector.map (fn i => ln_afreq - Math.ln (Real.fromInt i)) dffreq
57366
blanchet
parents: 57365
diff changeset
   336
57359
dd89d0ec8e41 avoid subscripting array with ~1
blanchet
parents: 57358
diff changeset
   337
    fun tfidf feat = Vector.sub (idf, feat)
57102
blanchet
parents: 57101
diff changeset
   338
blanchet
parents: 57101
diff changeset
   339
    fun log_posterior i =
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   340
      let
57374
blanchet
parents: 57373
diff changeset
   341
        val tfreq = Real.fromInt (Vector.sub (tfreq, i))
57097
blanchet
parents: 57096
diff changeset
   342
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   343
        fun add_feat (f, fw0) (res, sfh) =
57102
blanchet
parents: 57101
diff changeset
   344
          (case Inttab.lookup sfh f of
blanchet
parents: 57101
diff changeset
   345
            SOME sf =>
57401
02f56126b4e4 reintroduced 'extra features' but with lower weight than before (to account for tfidf)
blanchet
parents: 57387
diff changeset
   346
            (res + fw0 * tfidf f * Math.ln (pos_weight * Real.fromInt sf / tfreq),
57102
blanchet
parents: 57101
diff changeset
   347
             Inttab.delete f sfh)
57409
c9e8cd8ec9e2 correctly take weights into consideration
blanchet
parents: 57407
diff changeset
   348
          | NONE => (res + fw0 * tfidf f * def_val, sfh))
57097
blanchet
parents: 57096
diff changeset
   349
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   350
        val (res, sfh) = fold add_feat goal_feats (init_val * Math.ln tfreq, Vector.sub (sfreq, i))
57097
blanchet
parents: 57096
diff changeset
   351
57409
c9e8cd8ec9e2 correctly take weights into consideration
blanchet
parents: 57407
diff changeset
   352
        fun fold_sfh (f, sf) sow =
59476
blanchet
parents: 59471
diff changeset
   353
          sow + tfidf f * Math.ln (1.0 - Real.fromInt (sf - 1) / tfreq)
57097
blanchet
parents: 57096
diff changeset
   354
57102
blanchet
parents: 57101
diff changeset
   355
        val sum_of_weights = Inttab.fold fold_sfh sfh 0.0
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   356
      in
57278
8f7d6f01a775 more MaSh engine variations, for evaluations
blanchet
parents: 57277
diff changeset
   357
        res + tau * sum_of_weights
57097
blanchet
parents: 57096
diff changeset
   358
      end
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   359
57356
9816f692b0ca refactoring
blanchet
parents: 57355
diff changeset
   360
    val posterior = Array.tabulate (num_facts, (fn j => (j, log_posterior j)))
57102
blanchet
parents: 57101
diff changeset
   361
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   362
    fun ret at acc =
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   363
      if at = num_facts then acc else ret (at + 1) (Array.sub (posterior, at) :: acc)
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   364
  in
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   365
    select_visible_facts 100000.0 posterior visible_facts;
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
   366
    sort_array_suffix (Real.compare o apply2 snd) max_suggs posterior;
57364
c1060d10089f honor visible in SML naive Bayes
blanchet
parents: 57363
diff changeset
   367
    ret (Integer.max 0 (num_facts - max_suggs)) []
57029
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   368
  end
75cc30d2b83f added naive Bayes ML implementation, due to Cezary Kaliszyk (like k-NN)
blanchet
parents: 57028
diff changeset
   369
59477
1b3385de296d less confusing constant
blanchet
parents: 59476
diff changeset
   370
val initial_k = 0
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   371
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   372
fun k_nearest_neighbors dffreq num_facts num_feats depss featss max_suggs visible_facts goal_feats =
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   373
  let
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   374
    exception EXIT of unit
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   375
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   376
    val ln_afreq = Math.ln (Real.fromInt num_facts)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   377
    fun tfidf feat = ln_afreq - Math.ln (Real.fromInt (Vector.sub (dffreq, feat)))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   378
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   379
    val overlaps_sqr = Array.tabulate (num_facts, rpair 0.0)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   380
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   381
    val feat_facts = Array.array (num_feats, [])
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   382
    val _ = Vector.foldl (fn (feats, fact) =>
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   383
      (List.app (map_array_at feat_facts (cons fact)) feats; fact + 1)) 0 featss
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   384
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   385
    fun do_feat (s, sw0) =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   386
      let
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   387
        val sw = sw0 * tfidf s
59476
blanchet
parents: 59471
diff changeset
   388
        val w6 = Math.pow (sw, 6.0 (* FUDGE *))
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   389
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   390
        fun inc_overlap j =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   391
          let val (_, ov) = Array.sub (overlaps_sqr, j) in
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   392
            Array.update (overlaps_sqr, j, (j, w6 + ov))
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   393
          end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   394
      in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   395
        List.app inc_overlap (Array.sub (feat_facts, s))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   396
      end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   397
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   398
    val _ = List.app do_feat goal_feats
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
   399
    val _ = sort_array_suffix (Real.compare o apply2 snd) num_facts overlaps_sqr
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   400
    val no_recommends = Unsynchronized.ref 0
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   401
    val recommends = Array.tabulate (num_facts, rpair 0.0)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   402
    val age = Unsynchronized.ref 500000000.0
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   403
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   404
    fun inc_recommend v j =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   405
      let val (_, ov) = Array.sub (recommends, j) in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   406
        if ov <= 0.0 then
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   407
          (no_recommends := !no_recommends + 1; Array.update (recommends, j, (j, !age + ov)))
59476
blanchet
parents: 59471
diff changeset
   408
        else
blanchet
parents: 59471
diff changeset
   409
          Array.update (recommends, j, (j, v + ov))
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   410
      end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   411
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   412
    val k = Unsynchronized.ref 0
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   413
    fun do_k k =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   414
      if k >= num_facts then
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   415
        raise EXIT ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   416
      else
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   417
        let
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   418
          val deps_factor = 2.7 (* FUDGE *)
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   419
          val (j, o2) = Array.sub (overlaps_sqr, num_facts - k - 1)
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   420
          val _ = inc_recommend o2 j
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   421
          val ds = Vector.sub (depss, j)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   422
          val l = Real.fromInt (length ds)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   423
        in
57531
4d9895d39b59 improvements to the machine learning algos (due to Cezary K.)
blanchet
parents: 57462
diff changeset
   424
          List.app (inc_recommend (deps_factor * o2 / l)) ds
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   425
        end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   426
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   427
    fun while1 () =
59477
1b3385de296d less confusing constant
blanchet
parents: 59476
diff changeset
   428
      if !k = initial_k + 1 then () else (do_k (!k); k := !k + 1; while1 ())
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   429
      handle EXIT () => ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   430
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   431
    fun while2 () =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   432
      if !no_recommends >= max_suggs then ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   433
      else (do_k (!k); k := !k + 1; age := !age - 10000.0; while2 ())
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   434
      handle EXIT () => ()
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   435
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   436
    fun ret acc at =
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   437
      if at = num_facts then acc else ret (Array.sub (recommends, at) :: acc) (at + 1)
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   438
  in
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   439
    while1 ();
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   440
    while2 ();
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   441
    select_visible_facts 1000000000.0 recommends visible_facts;
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
   442
    sort_array_suffix (Real.compare o apply2 snd) max_suggs recommends;
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   443
    ret [] (Integer.max 0 (num_facts - max_suggs))
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   444
  end
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   445
57125
2f620ef839ee added another way of invoking Python code, for experiments
blanchet
parents: 57124
diff changeset
   446
(* experimental *)
57376
f40ac83d076c refactoring
blanchet
parents: 57375
diff changeset
   447
fun external_tool tool max_suggs learns goal_feats =
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   448
  let
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   449
    val ser = string_of_int (serial ()) (* poor person's attempt at thread-safety *)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   450
    val ocs = TextIO.openOut ("adv_syms" ^ ser)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   451
    val ocd = TextIO.openOut ("adv_deps" ^ ser)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   452
    val ocq = TextIO.openOut ("adv_seq" ^ ser)
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   453
    val occ = TextIO.openOut ("adv_conj" ^ ser)
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   454
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   455
    fun os oc s = TextIO.output (oc, s)
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   456
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   457
    fun ol _ _ _ [] = ()
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   458
      | ol _ f _ [e] = f e
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   459
      | ol oc f sep (h :: t) = (f h; os oc sep; ol oc f sep t)
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   460
57294
ef9d4e1ceb00 use strings to communicate with external process, to ease debugging
blanchet
parents: 57291
diff changeset
   461
    fun do_learn (name, feats, deps) =
57357
30ee18eb23ac new version of adaptive k-NN with TFIDF
blanchet
parents: 57356
diff changeset
   462
      (os ocs name; os ocs ":"; ol ocs (os ocs o quote) ", " feats; os ocs "\n";
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   463
       os ocd name; os ocd ":"; ol ocd (os ocd) " " deps; os ocd "\n"; os ocq name; os ocq "\n")
57296
8a98f08a0523 tweaked experimental setup
blanchet
parents: 57294
diff changeset
   464
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   465
    fun forkexec no =
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   466
      let
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   467
        val cmd =
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   468
          "~/misc/" ^ tool ^ " adv_syms" ^ ser ^ " adv_deps" ^ ser ^ " " ^ string_of_int no ^
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   469
          " adv_seq" ^ ser ^ " < adv_conj" ^ ser
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   470
      in
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   471
        fst (Isabelle_System.bash_output cmd)
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   472
        |> space_explode " "
57294
ef9d4e1ceb00 use strings to communicate with external process, to ease debugging
blanchet
parents: 57291
diff changeset
   473
        |> filter_out (curry (op =) "")
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   474
      end
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   475
  in
57401
02f56126b4e4 reintroduced 'extra features' but with lower weight than before (to account for tfidf)
blanchet
parents: 57387
diff changeset
   476
    (List.app do_learn learns; ol occ (os occ o quote) ", " (map fst goal_feats);
57297
3d4647ea3e57 added another experimental engine
blanchet
parents: 57296
diff changeset
   477
     TextIO.closeOut ocs; TextIO.closeOut ocd; TextIO.closeOut ocq; TextIO.closeOut occ;
57294
ef9d4e1ceb00 use strings to communicate with external process, to ease debugging
blanchet
parents: 57291
diff changeset
   478
     forkexec max_suggs)
57291
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   479
  end
1bac14e0a728 added experimental MaSh engine
blanchet
parents: 57281
diff changeset
   480
57561
8200e1eb367f made SML/NJ happier
blanchet
parents: 57557
diff changeset
   481
fun k_nearest_neighbors_ext max_suggs =
59477
1b3385de296d less confusing constant
blanchet
parents: 59476
diff changeset
   482
  external_tool ("newknn/knn" ^ " " ^ string_of_int initial_k) max_suggs
57561
8200e1eb367f made SML/NJ happier
blanchet
parents: 57557
diff changeset
   483
fun naive_bayes_ext max_suggs = external_tool "predict/nbayes" max_suggs
57374
blanchet
parents: 57373
diff changeset
   484
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   485
fun query_external ctxt algorithm max_suggs learns goal_feats =
57432
blanchet
parents: 57431
diff changeset
   486
  (trace_msg ctxt (fn () => "MaSh query external " ^ commas (map fst goal_feats));
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   487
   (case algorithm of
57458
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   488
     MaSh_NB_Ext => naive_bayes_ext max_suggs learns goal_feats
419180c354c0 tuned (reordered) code
blanchet
parents: 57432
diff changeset
   489
   | MaSh_kNN_Ext => k_nearest_neighbors_ext max_suggs learns goal_feats))
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   490
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   491
fun query_internal ctxt algorithm num_facts num_feats (fact_names, featss, depss)
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   492
    (freqs as (_, _, dffreq)) visible_facts max_suggs goal_feats int_goal_feats =
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   493
  let
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   494
    fun nb () =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   495
      naive_bayes freqs num_facts max_suggs visible_facts int_goal_feats
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   496
      |> map fst
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   497
    fun knn () =
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   498
      k_nearest_neighbors dffreq num_facts num_feats depss featss max_suggs visible_facts
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   499
        int_goal_feats
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   500
      |> map fst
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   501
  in
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   502
    (trace_msg ctxt (fn () => "MaSh query internal " ^ commas (map fst goal_feats) ^ " from {" ^
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   503
       elide_string 1000 (space_implode " " (Vector.foldr (op ::) [] fact_names)) ^ "}");
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
   504
     (case algorithm of
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   505
       MaSh_NB => nb ()
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   506
     | MaSh_kNN => knn ()
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   507
     | MaSh_NB_kNN =>
61322
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
   508
       mesh_facts I (op =) max_suggs
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   509
         [(0.5 (* FUDGE *), (weight_facts_steeply (nb ()), [])),
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   510
          (0.5 (* FUDGE *), (weight_facts_steeply (knn ()), []))])
57459
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   511
     |> map (curry Vector.sub fact_names))
22023ab4df3c mix NB and kNN
blanchet
parents: 57458
diff changeset
   512
   end
57007
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   513
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   514
end;
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   515
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
   516
57432
blanchet
parents: 57431
diff changeset
   517
(*** Persistent, stringly-typed state ***)
blanchet
parents: 57431
diff changeset
   518
blanchet
parents: 57431
diff changeset
   519
fun meta_char c =
blanchet
parents: 57431
diff changeset
   520
  if Char.isAlphaNum c orelse c = #"_" orelse c = #"." orelse c = #"(" orelse c = #")" orelse
63696
af310e622d64 tuned MaSh's metacharacters to avoid needless decoding
blanchet
parents: 63695
diff changeset
   521
     c = #"," orelse c = #"'" then
57432
blanchet
parents: 57431
diff changeset
   522
    String.str c
blanchet
parents: 57431
diff changeset
   523
  else
blanchet
parents: 57431
diff changeset
   524
    (* fixed width, in case more digits follow *)
blanchet
parents: 57431
diff changeset
   525
    "%" ^ stringN_of_int 3 (Char.ord c)
blanchet
parents: 57431
diff changeset
   526
blanchet
parents: 57431
diff changeset
   527
fun unmeta_chars accum [] = String.implode (rev accum)
blanchet
parents: 57431
diff changeset
   528
  | unmeta_chars accum (#"%" :: d1 :: d2 :: d3 :: cs) =
blanchet
parents: 57431
diff changeset
   529
    (case Int.fromString (String.implode [d1, d2, d3]) of
blanchet
parents: 57431
diff changeset
   530
      SOME n => unmeta_chars (Char.chr n :: accum) cs
blanchet
parents: 57431
diff changeset
   531
    | NONE => "" (* error *))
blanchet
parents: 57431
diff changeset
   532
  | unmeta_chars _ (#"%" :: _) = "" (* error *)
blanchet
parents: 57431
diff changeset
   533
  | unmeta_chars accum (c :: cs) = unmeta_chars (c :: accum) cs
blanchet
parents: 57431
diff changeset
   534
blanchet
parents: 57431
diff changeset
   535
val encode_str = String.translate meta_char
63695
9ad6a63cc8bd optimization in MaSh parsing
blanchet
parents: 63692
diff changeset
   536
val encode_strs = map encode_str #> space_implode " "
57432
blanchet
parents: 57431
diff changeset
   537
63695
9ad6a63cc8bd optimization in MaSh parsing
blanchet
parents: 63692
diff changeset
   538
fun decode_str s =
9ad6a63cc8bd optimization in MaSh parsing
blanchet
parents: 63692
diff changeset
   539
  if String.isSubstring "%" s then unmeta_chars [] (String.explode s) else s;
9ad6a63cc8bd optimization in MaSh parsing
blanchet
parents: 63692
diff changeset
   540
9ad6a63cc8bd optimization in MaSh parsing
blanchet
parents: 63692
diff changeset
   541
fun decode_strs s =
9ad6a63cc8bd optimization in MaSh parsing
blanchet
parents: 63692
diff changeset
   542
  space_explode " " s |> String.isSubstring "%" s ? map decode_str;
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   543
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   544
datatype proof_kind = Isar_Proof | Automatic_Proof | Isar_Proof_wegen_Prover_Flop
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   545
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   546
fun str_of_proof_kind Isar_Proof = "i"
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   547
  | str_of_proof_kind Automatic_Proof = "a"
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   548
  | str_of_proof_kind Isar_Proof_wegen_Prover_Flop = "x"
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   549
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   550
fun proof_kind_of_str "a" = Automatic_Proof
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   551
  | proof_kind_of_str "x" = Isar_Proof_wegen_Prover_Flop
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   552
  | proof_kind_of_str _ (* "i" *) = Isar_Proof
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   553
57011
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   554
fun add_edge_to name parent =
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   555
  Graph.default_node (parent, (Isar_Proof, [], []))
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   556
  #> Graph.add_edge (parent, name)
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   557
57661
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   558
fun add_node kind name parents feats deps (accum as (access_G, (fact_xtab, feat_xtab), learns)) =
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   559
  let val fact_xtab' = add_to_xtab name fact_xtab in
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   560
    ((Graph.new_node (name, (kind, feats, deps)) access_G
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   561
      handle Graph.DUP _ => Graph.map_node name (K (kind, feats, deps)) access_G)
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   562
     |> fold (add_edge_to name) parents,
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   563
     (fact_xtab', fold maybe_add_to_xtab feats feat_xtab),
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   564
     (name, feats, deps) :: learns)
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   565
  end
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
   566
  handle Symtab.DUP _ => accum (* robustness (in case the state file violates the invariant) *)
57011
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   567
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   568
fun try_graph ctxt when def f =
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   569
  f ()
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   570
  handle
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   571
    Graph.CYCLES (cycle :: _) =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   572
    (trace_msg ctxt (fn () => "Cycle involving " ^ commas cycle ^ " when " ^ when); def)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   573
  | Graph.DUP name =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   574
    (trace_msg ctxt (fn () => "Duplicate fact " ^ quote name ^ " when " ^ when); def)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   575
  | Graph.UNDEF name =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   576
    (trace_msg ctxt (fn () => "Unknown fact " ^ quote name ^ " when " ^ when); def)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   577
  | exn =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   578
    if Exn.is_interrupt exn then
62505
9e2a65912111 clarified modules;
wenzelm
parents: 61755
diff changeset
   579
      Exn.reraise exn
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   580
    else
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   581
      (trace_msg ctxt (fn () => "Internal error when " ^ when ^ ":\n" ^ Runtime.exn_message exn);
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   582
       def)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   583
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   584
fun graph_info G =
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   585
  string_of_int (length (Graph.keys G)) ^ " node(s), " ^
57006
blanchet
parents: 57005
diff changeset
   586
  string_of_int (fold (Integer.add o length o snd) (Graph.dest G) 0) ^ " edge(s), " ^
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   587
  string_of_int (length (Graph.maximals G)) ^ " maximal"
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   588
57545
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   589
type ffds = string vector * int list vector * int list vector
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   590
type freqs = int vector * int Inttab.table vector * int vector
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   591
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
   592
type mash_state =
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   593
  {access_G : (proof_kind * string list * string list) Graph.T,
57374
blanchet
parents: 57373
diff changeset
   594
   xtabs : xtab * xtab,
57545
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   595
   ffds : ffds,
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   596
   freqs : freqs,
57365
blanchet
parents: 57364
diff changeset
   597
   dirty_facts : string list option}
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   598
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   599
val empty_xtabs = (empty_xtab, empty_xtab)
57545
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   600
val empty_ffds = (Vector.fromList [], Vector.fromList [], Vector.fromList []) : ffds
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   601
val empty_freqs = (Vector.fromList [], Vector.fromList [], Vector.fromList []) : freqs
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   602
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   603
val empty_state =
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   604
  {access_G = Graph.empty,
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   605
   xtabs = empty_xtabs,
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   606
   ffds = empty_ffds,
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   607
   freqs = empty_freqs,
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   608
   dirty_facts = SOME []} : mash_state
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
   609
57545
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   610
fun recompute_ffds_freqs_from_learns (learns : (string * string list * string list) list)
2126b355f0ca made SML/NJ happier
blanchet
parents: 57532
diff changeset
   611
    ((num_facts, fact_tab), (num_feats, feat_tab)) num_facts0 (fact_names0, featss0, depss0) freqs0 =
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   612
  let
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   613
    val fact_names = Vector.concat [fact_names0, Vector.fromList (map #1 learns)]
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   614
    val featss = Vector.concat [featss0,
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   615
      Vector.fromList (map (map_filter (Symtab.lookup feat_tab) o #2) learns)]
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   616
    val depss = Vector.concat [depss0,
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   617
      Vector.fromList (map (map_filter (Symtab.lookup fact_tab) o #3) learns)]
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   618
  in
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   619
    ((fact_names, featss, depss),
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   620
     MaSh.learn_facts freqs0 num_facts0 num_facts num_feats depss featss)
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   621
  end
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   622
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   623
fun reorder_learns (num_facts, fact_tab) learns =
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   624
  let val ary = Array.array (num_facts, ("", [], [])) in
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   625
    List.app (fn learn as (fact, _, _) =>
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   626
        Array.update (ary, the (Symtab.lookup fact_tab fact), learn))
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   627
      learns;
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   628
    Array.foldr (op ::) [] ary
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   629
  end
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   630
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   631
fun recompute_ffds_freqs_from_access_G access_G (xtabs as (fact_xtab, _)) =
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   632
  let
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   633
    val learns =
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   634
      Graph.schedule (fn _ => fn (fact, (_, feats, deps)) => (fact, feats, deps)) access_G
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   635
      |> reorder_learns fact_xtab
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   636
  in
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   637
    recompute_ffds_freqs_from_learns learns xtabs 0 empty_ffds empty_freqs
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   638
  end
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   639
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   640
local
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   641
63696
af310e622d64 tuned MaSh's metacharacters to avoid needless decoding
blanchet
parents: 63695
diff changeset
   642
val version = "*** MaSh version 20160805 ***"
50357
187ae76a1757 more robustness in the face of MaSh format changes -- don't overwrite new versions with old versions
blanchet
parents: 50356
diff changeset
   643
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
   644
exception FILE_VERSION_TOO_NEW of unit
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   645
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   646
fun extract_node line =
55286
blanchet
parents: 55202
diff changeset
   647
  (case space_explode ":" line of
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   648
    [head, tail] =>
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   649
    (case (space_explode " " head, map (unprefix " ") (space_explode ";" tail)) of
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   650
      ([kind, name], [parents, feats, deps]) =>
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   651
      SOME (proof_kind_of_str kind, decode_str name, decode_strs parents, decode_strs feats,
57010
121b63d7bcdb take weights into consideration in knn
blanchet
parents: 57009
diff changeset
   652
        decode_strs deps)
55286
blanchet
parents: 55202
diff changeset
   653
    | _ => NONE)
blanchet
parents: 55202
diff changeset
   654
  | _ => NONE)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   655
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   656
fun load_state ctxt (time_state as (memory_time, _)) =
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   657
  let val path = state_file () in
60971
6dfe08f5834e proper platform_path;
wenzelm
parents: 60948
diff changeset
   658
    (case try OS.FileSys.modTime (File.platform_path path) of
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   659
      NONE => time_state
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   660
    | SOME disk_time =>
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
   661
      if memory_time >= disk_time then
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   662
        time_state
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   663
      else
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   664
        (disk_time,
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   665
         (case try File.read_lines path of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   666
           SOME (version' :: node_lines) =>
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   667
           let
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   668
             fun extract_line_and_add_node line =
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   669
               (case extract_node line of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   670
                 NONE => I (* should not happen *)
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   671
               | SOME (kind, name, parents, feats, deps) => add_node kind name parents feats deps)
57011
a4428f517f46 implemented learning of single proofs in SML MaSh
blanchet
parents: 57010
diff changeset
   672
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   673
             val empty_G_etc = (Graph.empty, empty_xtabs, [])
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   674
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   675
             val (access_G, xtabs, rev_learns) =
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   676
               (case string_ord (version', version) of
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   677
                 EQUAL =>
57379
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   678
                 try_graph ctxt "loading state" empty_G_etc
dcaf04545de2 avoid needless (trivial) reordering on load
blanchet
parents: 57378
diff changeset
   679
                   (fn () => fold extract_line_and_add_node node_lines empty_G_etc)
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   680
               | LESS => (remove_state_file (); empty_G_etc) (* cannot parse old file *)
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   681
               | GREATER => raise FILE_VERSION_TOO_NEW ())
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   682
57380
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   683
             val (ffds, freqs) =
a212bee30bba more incremental learning of single fact
blanchet
parents: 57379
diff changeset
   684
               recompute_ffds_freqs_from_learns (rev rev_learns) xtabs 0 empty_ffds empty_freqs
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   685
           in
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   686
             trace_msg ctxt (fn () => "Loaded fact graph (" ^ graph_info access_G ^ ")");
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   687
             {access_G = access_G, xtabs = xtabs, ffds = ffds, freqs = freqs, dirty_facts = SOME []}
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   688
           end
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   689
         | _ => empty_state)))
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   690
  end
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   691
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   692
fun str_of_entry (kind, name, parents, feats, deps) =
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   693
  str_of_proof_kind kind ^ " " ^ encode_str name ^ ": " ^ encode_strs parents ^ "; " ^
57358
545d02691b32 killed dead data
blanchet
parents: 57357
diff changeset
   694
  encode_strs feats ^ "; " ^ encode_strs deps ^ "\n"
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   695
57365
blanchet
parents: 57364
diff changeset
   696
fun save_state _ (time_state as (_, {dirty_facts = SOME [], ...})) = time_state
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   697
  | save_state ctxt (memory_time, {access_G, xtabs, ffds, freqs, dirty_facts}) =
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   698
    let
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   699
      fun append_entry (name, ((kind, feats, deps), (parents, _))) =
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   700
        cons (kind, name, Graph.Keys.dest parents, feats, deps)
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   701
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   702
      val path = state_file ()
57365
blanchet
parents: 57364
diff changeset
   703
      val dirty_facts' =
60971
6dfe08f5834e proper platform_path;
wenzelm
parents: 60948
diff changeset
   704
        (case try OS.FileSys.modTime (File.platform_path path) of
57076
3d4b172d2209 automatically reload state file when it changes on disk
blanchet
parents: 57062
diff changeset
   705
          NONE => NONE
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
   706
        | SOME disk_time => if disk_time <= memory_time then dirty_facts else NONE)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   707
      val (banner, entries) =
57365
blanchet
parents: 57364
diff changeset
   708
        (case dirty_facts' of
55286
blanchet
parents: 55202
diff changeset
   709
          SOME names => (NONE, fold (append_entry o Graph.get_entry access_G) names [])
blanchet
parents: 55202
diff changeset
   710
        | NONE => (SOME (version ^ "\n"), Graph.fold append_entry access_G []))
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   711
    in
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   712
      (case banner of SOME s => File.write path s | NONE => ();
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   713
       entries |> chunk_list 500 |> List.app (File.append path o implode o map str_of_entry))
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   714
      handle IO.Io _ => ();
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   715
      trace_msg ctxt (fn () =>
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   716
        "Saved fact graph (" ^ graph_info access_G ^
57365
blanchet
parents: 57364
diff changeset
   717
        (case dirty_facts of
blanchet
parents: 57364
diff changeset
   718
          SOME dirty_facts => "; " ^ string_of_int (length dirty_facts) ^ " dirty fact(s)"
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
   719
        | _ => "") ^  ")");
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   720
      (Time.now (),
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
   721
       {access_G = access_G, xtabs = xtabs, ffds = ffds, freqs = freqs, dirty_facts = SOME []})
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   722
    end
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   723
57365
blanchet
parents: 57364
diff changeset
   724
val global_state = Synchronized.var "Sledgehammer_MaSh.global_state" (Time.zeroTime, empty_state)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   725
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   726
in
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   727
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   728
fun map_state ctxt f =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   729
  Synchronized.change global_state (load_state ctxt ##> f #> save_state ctxt)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
   730
  handle FILE_VERSION_TOO_NEW () => ()
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   731
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   732
fun peek_state ctxt =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   733
  Synchronized.change_result global_state (perhaps (try (load_state ctxt)) #> `snd)
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   734
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
   735
fun clear_state () =
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   736
  Synchronized.change global_state (fn _ => (remove_state_file (); (Time.zeroTime, empty_state)))
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   737
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   738
end
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   739
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   740
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   741
(*** Isabelle helpers ***)
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
   742
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   743
fun crude_printed_term depth t =
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   744
  let
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   745
    fun term _ (res, 0) = (res, 0)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   746
      | term (t $ u) (res, depth) =
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   747
        let
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   748
          val (res, depth) = term t (res ^ "(", depth)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   749
          val (res, depth) = term u (res ^ " ", depth)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   750
        in
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   751
          (res ^ ")", depth)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   752
        end
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   753
      | term (Abs (s, _, t)) (res, depth) = term t (res ^ "%" ^ s ^ ".", depth - 1)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   754
      | term (Bound n) (res, depth) = (res ^ "#" ^ string_of_int n, depth - 1)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   755
      | term (Const (s, _)) (res, depth) = (res ^ Long_Name.base_name s, depth - 1)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   756
      | term (Free (s, _)) (res, depth) = (res ^ s, depth - 1)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   757
      | term (Var ((s, _), _)) (res, depth) = (res ^ s, depth - 1)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   758
  in
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   759
    fst (term t ("", depth))
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   760
  end
48378
9e96486d53ad handle local facts smoothly in MaSh
blanchet
parents: 48377
diff changeset
   761
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   762
fun nickname_of_thm th =
48394
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   763
  if Thm.has_name_hint th then
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   764
    let val hint = Thm.get_name_hint th in
50722
b422a48adc2d speed up generation of local theorem nicknames
blanchet
parents: 50718
diff changeset
   765
      (* There must be a better way to detect local facts. *)
59888
27e4d0ab0948 tuned signature;
wenzelm
parents: 59877
diff changeset
   766
      (case Long_Name.dest_local hint of
60641
f6e8293747fd clarified context;
wenzelm
parents: 60640
diff changeset
   767
        SOME suf =>
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   768
        Long_Name.implode [Thm.theory_name_of_thm th, suf, crude_printed_term 50 (Thm.prop_of th)]
55286
blanchet
parents: 55202
diff changeset
   769
      | NONE => hint)
48394
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   770
    end
82fc8c956cdc fixed various issues with MaSh's file handling + tune output + generate local facts again + handle nameless facts gracefully
blanchet
parents: 48392
diff changeset
   771
  else
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   772
    crude_printed_term 100 (Thm.prop_of th)
48378
9e96486d53ad handle local facts smoothly in MaSh
blanchet
parents: 48377
diff changeset
   773
51134
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   774
fun find_suggested_facts ctxt facts =
48330
192444b53e86 speed up MaSh queries
blanchet
parents: 48329
diff changeset
   775
  let
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   776
    fun add (fact as (_, th)) = Symtab.default (nickname_of_thm th, fact)
51134
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   777
    val tab = fold add facts Symtab.empty
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   778
    fun lookup nick =
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   779
      Symtab.lookup tab nick
57006
blanchet
parents: 57005
diff changeset
   780
      |> tap (fn NONE => trace_msg ctxt (fn () => "Cannot find " ^ quote nick) | _ => ())
51134
d03ded5dcf65 more MaSh tracing
blanchet
parents: 51133
diff changeset
   781
  in map_filter lookup end
48311
3c4e10606567 implemented MaSh QUERY operation
blanchet
parents: 48309
diff changeset
   782
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   783
fun free_feature_of s = "f" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   784
fun thy_feature_of s = "y" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   785
fun type_feature_of s = "t" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   786
fun class_feature_of s = "s" ^ s
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   787
val local_feature = "local"
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   788
60948
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   789
fun crude_thm_ord ctxt =
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   790
  let
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   791
    val ancestor_lengths =
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   792
      fold (fn thy => Symtab.update (Context.theory_name thy, length (Context.ancestors_of thy)))
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   793
        (Theory.nodes_of (Proof_Context.theory_of ctxt)) Symtab.empty
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   794
    val ancestor_length = Symtab.lookup ancestor_lengths o Context.theory_id_name
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   795
60948
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   796
    fun crude_theory_ord p =
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   797
      if Context.eq_thy_id p then EQUAL
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   798
      else if Context.proper_subthy_id p then LESS
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   799
      else if Context.proper_subthy_id (swap p) then GREATER
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   800
      else
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   801
        (case apply2 ancestor_length p of
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   802
          (SOME m, SOME n) =>
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   803
            (case int_ord (m, n) of
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   804
              EQUAL => string_ord (apply2 Context.theory_id_name p)
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   805
            | ord => ord)
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   806
        | _ => string_ord (apply2 Context.theory_id_name p))
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   807
  in
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   808
    fn p =>
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   809
      (case crude_theory_ord (apply2 Thm.theory_id_of_thm p) of
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   810
        EQUAL =>
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   811
        (* The hack below is necessary because of odd dependencies that are not reflected in the theory
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   812
           comparison. *)
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   813
        let val q = apply2 nickname_of_thm p in
60948
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   814
          (* Hack to put "xxx_def" before "xxxI" and "xxxE" *)
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   815
          (case bool_ord (apply2 (String.isSuffix "_def") (swap q)) of
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   816
            EQUAL => string_ord q
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   817
          | ord => ord)
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   818
        end
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   819
      | ord => ord)
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   820
  end;
48324
3ee5b5589402 speed up tautology/metaness check
blanchet
parents: 48323
diff changeset
   821
60948
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
   822
val thm_less_eq = Context.subthy_id o apply2 Thm.theory_id_of_thm
51136
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
   823
fun thm_less p = thm_less_eq p andalso not (thm_less_eq (swap p))
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
   824
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   825
val freezeT = Type.legacy_freeze_type
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   826
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   827
fun freeze (t $ u) = freeze t $ freeze u
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   828
  | freeze (Abs (s, T, t)) = Abs (s, freezeT T, freeze t)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   829
  | freeze (Var ((s, _), T)) = Free (s, freezeT T)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   830
  | freeze (Const (s, T)) = Const (s, freezeT T)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   831
  | freeze (Free (s, T)) = Free (s, freezeT T)
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   832
  | freeze t = t
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   833
59621
291934bac95e Thm.cterm_of and Thm.ctyp_of operate on local context;
wenzelm
parents: 59582
diff changeset
   834
fun goal_of_thm thy = Thm.prop_of #> freeze #> Thm.global_cterm_of thy #> Goal.init
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   835
54141
f57f8e7a879f generate a comment storing the goal nickname in "learn_prover"
blanchet
parents: 54140
diff changeset
   836
fun run_prover_for_mash ctxt params prover goal_name facts goal =
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   837
  let
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   838
    val problem =
54141
f57f8e7a879f generate a comment storing the goal nickname in "learn_prover"
blanchet
parents: 54140
diff changeset
   839
      {comment = "Goal: " ^ goal_name, state = Proof.init ctxt, goal = goal, subgoal = 1,
62735
23de054397e5 early warning when Sledgehammer finds a proof
blanchet
parents: 62505
diff changeset
   840
       subgoal_count = 1, factss = [("", facts)], found_proof = I}
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   841
  in
57735
056a55b44ec7 eliminated Sledgehammer's "min" subcommand (and lots of complications in the code)
blanchet
parents: 57662
diff changeset
   842
    get_minimizing_prover ctxt MaSh (K ()) prover params problem
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   843
  end
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
   844
48326
ef800e91d072 removed debugging output
blanchet
parents: 48325
diff changeset
   845
val bad_types = [@{type_name prop}, @{type_name bool}, @{type_name fun}]
ef800e91d072 removed debugging output
blanchet
parents: 48325
diff changeset
   846
53086
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   847
val pat_tvar_prefix = "_"
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   848
val pat_var_prefix = "_"
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   849
53089
a58b3b8631c6 keep long names to stay on the safe side
blanchet
parents: 53086
diff changeset
   850
(* try "Long_Name.base_name" for shorter names *)
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   851
fun massage_long_name s = s
53083
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   852
63696
af310e622d64 tuned MaSh's metacharacters to avoid needless decoding
blanchet
parents: 63695
diff changeset
   853
val crude_str_of_sort = space_implode "," o map massage_long_name o subtract (op =) @{sort type}
53086
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   854
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   855
fun crude_str_of_typ (Type (s, [])) = massage_long_name s
57006
blanchet
parents: 57005
diff changeset
   856
  | crude_str_of_typ (Type (s, Ts)) = massage_long_name s ^ implode (map crude_str_of_typ Ts)
53083
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   857
  | crude_str_of_typ (TFree (_, S)) = crude_str_of_sort S
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   858
  | crude_str_of_typ (TVar (_, S)) = crude_str_of_sort S
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   859
53128
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   860
fun maybe_singleton_str _ "" = []
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   861
  | maybe_singleton_str pref s = [pref ^ s]
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   862
53148
c898409d8630 fixed subtle bug with "take" + thread overlord through
blanchet
parents: 53142
diff changeset
   863
val max_pat_breadth = 10 (* FUDGE *)
50585
306c7b807e13 contain exponential explosion of term patterns
blanchet
parents: 50584
diff changeset
   864
57406
e844dcf57deb killed dead code
blanchet
parents: 57405
diff changeset
   865
fun term_features_of ctxt thy_name term_max_depth type_max_depth ts =
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   866
  let
50392
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   867
    val thy = Proof_Context.theory_of ctxt
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   868
50393
d8aa26c78327 record free variables as a MaSh feature
blanchet
parents: 50392
diff changeset
   869
    val fixes = map snd (Variable.dest_fixes ctxt)
50392
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   870
    val classes = Sign.classes_of thy
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   871
48304
50e64af9c829 more work on MaSh
blanchet
parents: 48303
diff changeset
   872
    fun add_classes @{sort type} = I
50392
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   873
      | add_classes S =
190053ee24ed expand type classes into their ancestors for MaSh
blanchet
parents: 50391
diff changeset
   874
        fold (`(Sorts.super_classes classes)
57006
blanchet
parents: 57005
diff changeset
   875
          #> swap #> op ::
blanchet
parents: 57005
diff changeset
   876
          #> subtract (op =) @{sort type} #> map massage_long_name
blanchet
parents: 57005
diff changeset
   877
          #> map class_feature_of
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   878
          #> union (op =)) S
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   879
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   880
    fun pattify_type 0 _ = []
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   881
      | pattify_type _ (Type (s, [])) =
53086
15fe0ca088b3 MaSh tweaking: shorter names + killed (broken) SNoW
blanchet
parents: 53085
diff changeset
   882
        if member (op =) bad_types s then [] else [massage_long_name s]
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   883
      | pattify_type depth (Type (s, U :: Ts)) =
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   884
        let
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   885
          val T = Type (s, Ts)
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   886
          val ps = take max_pat_breadth (pattify_type depth T)
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   887
          val qs = take max_pat_breadth ("" :: pattify_type (depth - 1) U)
57006
blanchet
parents: 57005
diff changeset
   888
        in
blanchet
parents: 57005
diff changeset
   889
          map_product (fn p => fn "" => p | q => p ^ "(" ^ q ^ ")") ps qs
blanchet
parents: 57005
diff changeset
   890
        end
57552
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   891
      | pattify_type _ (TFree (_, S)) = maybe_singleton_str pat_tvar_prefix (crude_str_of_sort S)
1072599c43f6 no need for 'mash' subdirectory after removal of Python program
blanchet
parents: 57546
diff changeset
   892
      | pattify_type _ (TVar (_, S)) = maybe_singleton_str pat_tvar_prefix (crude_str_of_sort S)
57006
blanchet
parents: 57005
diff changeset
   893
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   894
    fun add_type_pat depth T =
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   895
      union (op =) (map type_feature_of (pattify_type depth T))
57006
blanchet
parents: 57005
diff changeset
   896
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   897
    fun add_type_pats 0 _ = I
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   898
      | add_type_pats depth t = add_type_pat depth t #> add_type_pats (depth - 1) t
57006
blanchet
parents: 57005
diff changeset
   899
53083
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   900
    fun add_type T =
019ecbb18e3f generate patterns for variables as well in MaSh (cf. HOL(y)Hammer)
blanchet
parents: 53082
diff changeset
   901
      add_type_pats type_max_depth T
53156
f79f4693868b minor MaSh fix
blanchet
parents: 53155
diff changeset
   902
      #> fold_atyps_sorts (add_classes o snd) T
57006
blanchet
parents: 57005
diff changeset
   903
53084
877f5c28016f add subtypes as well as features in MaSh
blanchet
parents: 53083
diff changeset
   904
    fun add_subtypes (T as Type (_, Ts)) = add_type T #> fold add_subtypes Ts
877f5c28016f add subtypes as well as features in MaSh
blanchet
parents: 53083
diff changeset
   905
      | add_subtypes T = add_type T
53082
369e39511555 generate deep type patterns in MaSh
blanchet
parents: 52697
diff changeset
   906
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   907
    fun pattify_term _ 0 _ = []
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   908
      | pattify_term _ _ (Const (s, _)) =
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   909
        if is_widely_irrelevant_const s then [] else [massage_long_name s]
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   910
      | pattify_term _ _ (Free (s, T)) =
53128
ea1b62ed5a54 get rid of some silly MaSh features
blanchet
parents: 53127
diff changeset
   911
        maybe_singleton_str pat_var_prefix (crude_str_of_typ T)
53090
1426c97311f2 treat frees as both consts and vars, for more hits
blanchet
parents: 53089
diff changeset
   912
        |> (if member (op =) fixes s then
59877
a04ea4709c8d more standard Long_Name operations;
wenzelm
parents: 59875
diff changeset
   913
              cons (free_feature_of (massage_long_name (Long_Name.append thy_name s)))
53090
1426c97311f2 treat frees as both consts and vars, for more hits
blanchet
parents: 53089
diff changeset
   914
            else
1426c97311f2 treat frees as both consts and vars, for more hits
blanchet
parents: 53089
diff changeset
   915
              I)
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   916
      | pattify_term _ _ (Var (_, T)) = maybe_singleton_str pat_var_prefix (crude_str_of_typ T)
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   917
      | pattify_term Ts _ (Bound j) =
54695
a9efdf970720 generate problems with type classes
blanchet
parents: 54693
diff changeset
   918
        maybe_singleton_str pat_var_prefix (crude_str_of_typ (nth Ts j))
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   919
      | pattify_term Ts depth (t $ u) =
50339
d8dae91f3107 MaSh improvements: deeper patterns + more respect for chained facts
blanchet
parents: 50338
diff changeset
   920
        let
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   921
          val ps = take max_pat_breadth (pattify_term Ts depth t)
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   922
          val qs = take max_pat_breadth ("" :: pattify_term Ts (depth - 1) u)
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   923
        in
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   924
          map_product (fn p => fn "" => p | q => p ^ "(" ^ q ^ ")") ps qs
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   925
        end
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   926
      | pattify_term _ _ _ = []
57006
blanchet
parents: 57005
diff changeset
   927
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   928
    fun add_term_pat Ts = union (op =) oo pattify_term Ts
57006
blanchet
parents: 57005
diff changeset
   929
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   930
    fun add_term_pats _ 0 _ = I
57055
df3a26987a8d reverted '|' features in MaSh -- these sounded like a good idea but never really worked
blanchet
parents: 57052
diff changeset
   931
      | add_term_pats Ts depth t = add_term_pat Ts depth t #> add_term_pats Ts (depth - 1) t
57006
blanchet
parents: 57005
diff changeset
   932
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   933
    fun add_term Ts = add_term_pats Ts term_max_depth
57006
blanchet
parents: 57005
diff changeset
   934
53085
15483854c83e handle Bounds as well in MaSh features
blanchet
parents: 53084
diff changeset
   935
    fun add_subterms Ts t =
55286
blanchet
parents: 55202
diff changeset
   936
      (case strip_comb t of
54089
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   937
        (Const (s, T), args) =>
b13f6731f873 use same relevance filter for ATP and SMT solvers -- attempting to filter out certain ground instances of polymorphic symbols like + and 0 has unexpected side-effects that lead to incompletenesses (relevant facts not being selected)
blanchet
parents: 54085
diff changeset
   938
        (not (is_widely_irrelevant_const s) ? add_term Ts t)
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
   939
        #> add_subtypes T #> fold (add_subterms Ts) args
50857
80768e28c9ee better handlig of built-ins -- at the top-level, not in subterms
blanchet
parents: 50841
diff changeset
   940
      | (head, args) =>
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   941
        (case head of
53130
6741ba8d5c6d improve weight computation for complex terms
blanchet
parents: 53129
diff changeset
   942
           Free (_, T) => add_term Ts t #> add_subtypes T
53084
877f5c28016f add subtypes as well as features in MaSh
blanchet
parents: 53083
diff changeset
   943
         | Var (_, T) => add_subtypes T
53085
15483854c83e handle Bounds as well in MaSh features
blanchet
parents: 53084
diff changeset
   944
         | Abs (_, T, body) => add_subtypes T #> add_subterms (T :: Ts) body
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   945
         | _ => I)
55286
blanchet
parents: 55202
diff changeset
   946
        #> fold (add_subterms Ts) args)
57006
blanchet
parents: 57005
diff changeset
   947
  in
blanchet
parents: 57005
diff changeset
   948
    fold (add_subterms []) ts []
blanchet
parents: 57005
diff changeset
   949
  end
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   950
53085
15483854c83e handle Bounds as well in MaSh features
blanchet
parents: 53084
diff changeset
   951
val term_max_depth = 2
53155
2c585fdbe197 eliminated some needless MaSh features
blanchet
parents: 53154
diff changeset
   952
val type_max_depth = 1
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   953
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   954
(* TODO: Generate type classes for types? *)
60640
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
   955
fun features_of ctxt thy_name (scope, _) ts =
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
   956
  thy_feature_of thy_name ::
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
   957
  term_features_of ctxt thy_name term_max_depth type_max_depth ts
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
   958
  |> scope <> Global ? cons local_feature
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   959
57006
blanchet
parents: 57005
diff changeset
   960
(* Too many dependencies is a sign that a decision procedure is at work. There is not much to learn
blanchet
parents: 57005
diff changeset
   961
   from such proofs. *)
50434
960a3429615c more MaSh tweaking -- in particular, export the same facts in "MaSh_Export" as are later tried in "MaSh_Eval"
blanchet
parents: 50412
diff changeset
   962
val max_dependencies = 20
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
   963
54125
420b876ff1e2 if slicing is disabled, don't enforce last slice's "max_facts", but rather the maximum "max_facts"
blanchet
parents: 54123
diff changeset
   964
val prover_default_max_facts = 25
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   965
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   966
(* "type_definition_xxx" facts are characterized by their use of "CollectI". *)
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   967
val typedef_dep = nickname_of_thm @{thm CollectI}
57006
blanchet
parents: 57005
diff changeset
   968
(* Mysterious parts of the class machinery create lots of proofs that refer exclusively to
blanchet
parents: 57005
diff changeset
   969
   "someI_ex" (and to some internal constructions). *)
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   970
val class_some_dep = nickname_of_thm @{thm someI_ex}
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   971
50828
91e6836bb68b don't learn from the proof of "psimps" etc.
blanchet
parents: 50827
diff changeset
   972
val fundef_ths =
57006
blanchet
parents: 57005
diff changeset
   973
  @{thms fundef_ex1_existence fundef_ex1_uniqueness fundef_ex1_iff fundef_default_value}
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   974
  |> map nickname_of_thm
50828
91e6836bb68b don't learn from the proof of "psimps" etc.
blanchet
parents: 50827
diff changeset
   975
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   976
(* "Rep_xxx_inject", "Abs_xxx_inverse", etc., are derived using these facts. *)
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   977
val typedef_ths =
57006
blanchet
parents: 57005
diff changeset
   978
  @{thms type_definition.Abs_inverse type_definition.Rep_inverse type_definition.Rep
blanchet
parents: 57005
diff changeset
   979
      type_definition.Rep_inject type_definition.Abs_inject type_definition.Rep_cases
blanchet
parents: 57005
diff changeset
   980
      type_definition.Abs_cases type_definition.Rep_induct type_definition.Abs_induct
blanchet
parents: 57005
diff changeset
   981
      type_definition.Rep_range type_definition.Abs_image}
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   982
  |> map nickname_of_thm
48438
3e45c98fe127 distinguish between recursive and nonrecursive definitions + clean up typedef dependencies in MaSh
blanchet
parents: 48436
diff changeset
   983
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   984
fun is_size_def [dep] th =
55642
63beb38e9258 adapted to renaming of datatype 'cases' and 'recs' to 'case' and 'rec'
blanchet
parents: 55286
diff changeset
   985
    (case first_field ".rec" dep of
57006
blanchet
parents: 57005
diff changeset
   986
      SOME (pref, _) =>
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   987
      (case first_field ".size" (nickname_of_thm th) of
57006
blanchet
parents: 57005
diff changeset
   988
        SOME (pref', _) => pref = pref'
blanchet
parents: 57005
diff changeset
   989
      | NONE => false)
blanchet
parents: 57005
diff changeset
   990
    | NONE => false)
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   991
  | is_size_def _ _ = false
48441
2d2f009ca8eb remove MaSh junk associated with size functions
blanchet
parents: 48440
diff changeset
   992
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
   993
fun trim_dependencies deps =
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
   994
  if length deps > max_dependencies then NONE else SOME deps
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
   995
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
   996
fun isar_dependencies_of name_tabs th =
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   997
  thms_in_proof max_dependencies (SOME name_tabs) th
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
   998
  |> Option.map (fn deps =>
57006
blanchet
parents: 57005
diff changeset
   999
    if deps = [typedef_dep] orelse deps = [class_some_dep] orelse
57757
blanchet
parents: 57735
diff changeset
  1000
        exists (member (op =) fundef_ths) deps orelse exists (member (op =) typedef_ths) deps orelse
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1001
        is_size_def deps th then
50755
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
  1002
      []
4c781d65c0d6 get rid of spurious "Isar" proofs
blanchet
parents: 50754
diff changeset
  1003
    else
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
  1004
      deps)
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1005
57006
blanchet
parents: 57005
diff changeset
  1006
fun prover_dependencies_of ctxt (params as {verbose, max_facts, ...}) prover auto_level facts
blanchet
parents: 57005
diff changeset
  1007
    name_tabs th =
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1008
  (case isar_dependencies_of name_tabs th of
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
  1009
    SOME [] => (false, [])
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
  1010
  | isar_deps0 =>
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1011
    let
57306
ff10067b2248 optimized traversal of proof terms by skipping bad apples (e.g. full_exhaustive_int'.pinduct)
blanchet
parents: 57305
diff changeset
  1012
      val isar_deps = these isar_deps0
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1013
      val thy = Proof_Context.theory_of ctxt
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1014
      val goal = goal_of_thm thy th
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1015
      val name = nickname_of_thm th
52196
2281f33e8da6 redid rac7830871177 to avoid duplicate fixed variable (e.g. lemma "P (a::nat)" proof - have "!!a::int. Q a" sledgehammer [e])
blanchet
parents: 52125
diff changeset
  1016
      val (_, hyp_ts, concl_t) = ATP_Util.strip_subgoal goal 1 ctxt
51136
fdcc06013f2d avoid crude/wrong theorem comparision
blanchet
parents: 51135
diff changeset
  1017
      val facts = facts |> filter (fn (_, th') => thm_less (th', th))
57006
blanchet
parents: 57005
diff changeset
  1018
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1019
      fun nickify ((_, stature), th) = ((nickname_of_thm th, stature), th)
57006
blanchet
parents: 57005
diff changeset
  1020
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1021
      fun is_dep dep (_, th) = (nickname_of_thm th = dep)
57006
blanchet
parents: 57005
diff changeset
  1022
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1023
      fun add_isar_dep facts dep accum =
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1024
        if exists (is_dep dep) accum then
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1025
          accum
55286
blanchet
parents: 55202
diff changeset
  1026
        else
blanchet
parents: 55202
diff changeset
  1027
          (case find_first (is_dep dep) facts of
blanchet
parents: 55202
diff changeset
  1028
            SOME ((_, status), th) => accum @ [(("", status), th)]
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1029
          | NONE => accum (* should not happen *))
57006
blanchet
parents: 57005
diff changeset
  1030
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1031
      val mepo_facts =
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
  1032
        facts
54095
d80743f28fec simplify fudge factor code
blanchet
parents: 54091
diff changeset
  1033
        |> mepo_suggested_facts ctxt params (max_facts |> the_default prover_default_max_facts) NONE
d80743f28fec simplify fudge factor code
blanchet
parents: 54091
diff changeset
  1034
             hyp_ts concl_t
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1035
      val facts =
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1036
        mepo_facts
50754
74a6adcb96ac also generate queries for goals with too many Isar dependencies
blanchet
parents: 50751
diff changeset
  1037
        |> fold (add_isar_dep facts) isar_deps
50624
4d0997abce79 improved thm order hack, in case the default names are overridden
blanchet
parents: 50623
diff changeset
  1038
        |> map nickify
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1039
      val num_isar_deps = length isar_deps
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1040
    in
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1041
      if verbose andalso auto_level = 0 then
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1042
        writeln ("MaSh: " ^ quote prover ^ " on " ^ quote name ^ " with " ^
57017
blanchet
parents: 57014
diff changeset
  1043
          string_of_int num_isar_deps ^ " + " ^ string_of_int (length facts - num_isar_deps) ^
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1044
          " facts")
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1045
      else
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1046
        ();
55286
blanchet
parents: 55202
diff changeset
  1047
      (case run_prover_for_mash ctxt params prover name facts goal of
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1048
        {outcome = NONE, used_facts, ...} =>
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1049
        (if verbose andalso auto_level = 0 then
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1050
           let val num_facts = length used_facts in
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1051
             writeln ("Found proof with " ^ string_of_int num_facts ^ " fact" ^
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1052
               plural_s num_facts)
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1053
           end
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1054
         else
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1055
           ();
50754
74a6adcb96ac also generate queries for goals with too many Isar dependencies
blanchet
parents: 50751
diff changeset
  1056
         (true, map fst used_facts))
55286
blanchet
parents: 55202
diff changeset
  1057
      | _ => (false, isar_deps))
blanchet
parents: 55202
diff changeset
  1058
    end)
48251
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1059
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1060
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1061
(*** High-level communication with MaSh ***)
6cdcfbddc077 moved most of MaSh exporter code to Sledgehammer
blanchet
parents: 48249
diff changeset
  1062
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1063
(* In the following functions, chunks are risers w.r.t. "thm_less_eq". *)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1064
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1065
fun chunks_and_parents_for chunks th =
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1066
  let
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1067
    fun insert_parent new parents =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1068
      let val parents = parents |> filter_out (fn p => thm_less_eq (p, new)) in
57096
e4074b91b2a6 always remove duplicates in meshing + use weights for Naive Bayes
blanchet
parents: 57095
diff changeset
  1069
        parents |> forall (fn p => not (thm_less_eq (new, p))) parents ? cons new
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1070
      end
57006
blanchet
parents: 57005
diff changeset
  1071
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1072
    fun rechunk seen (rest as th' :: ths) =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1073
      if thm_less_eq (th', th) then (rev seen, rest)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1074
      else rechunk (th' :: seen) ths
57006
blanchet
parents: 57005
diff changeset
  1075
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1076
    fun do_chunk [] accum = accum
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1077
      | do_chunk (chunk as hd_chunk :: _) (chunks, parents) =
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1078
        if thm_less_eq (hd_chunk, th) then
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1079
          (chunk :: chunks, insert_parent hd_chunk parents)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1080
        else if thm_less_eq (List.last chunk, th) then
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1081
          let val (front, back as hd_back :: _) = rechunk [] chunk in
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1082
            (front :: back :: chunks, insert_parent hd_back parents)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1083
          end
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1084
        else
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1085
          (chunk :: chunks, parents)
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1086
  in
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1087
    fold_rev do_chunk chunks ([], [])
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1088
    |>> cons []
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1089
    ||> map nickname_of_thm
51181
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1090
  end
d0fa18638478 implement (more) accurate computation of parents
blanchet
parents: 51180
diff changeset
  1091
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1092
fun attach_parents_to_facts _ [] = []
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1093
  | attach_parents_to_facts old_facts (facts as (_, th) :: _) =
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1094
    let
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1095
      fun do_facts _ [] = []
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1096
        | do_facts (_, parents) [fact] = [(parents, fact)]
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1097
        | do_facts (chunks, parents)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1098
                   ((fact as (_, th)) :: (facts as (_, th') :: _)) =
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1099
          let
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1100
            val chunks = app_hd (cons th) chunks
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1101
            val chunks_and_parents' =
60638
16d80e5ef2dc tuned signature;
wenzelm
parents: 59888
diff changeset
  1102
              if thm_less_eq (th, th') andalso
16d80e5ef2dc tuned signature;
wenzelm
parents: 59888
diff changeset
  1103
                Thm.theory_name_of_thm th = Thm.theory_name_of_thm th'
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1104
              then (chunks, [nickname_of_thm th])
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1105
              else chunks_and_parents_for chunks th'
57006
blanchet
parents: 57005
diff changeset
  1106
          in
blanchet
parents: 57005
diff changeset
  1107
            (parents, fact) :: do_facts chunks_and_parents' facts
blanchet
parents: 57005
diff changeset
  1108
          end
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1109
    in
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1110
      old_facts @ facts
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1111
      |> do_facts (chunks_and_parents_for [[]] th)
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1112
      |> drop (length old_facts)
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1113
    end
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1114
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1115
fun maximal_wrt_graph _ [] = []
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1116
  | maximal_wrt_graph G keys =
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1117
    if can (Graph.get_node G o the_single) keys then
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1118
      keys
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1119
    else
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1120
      let
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1121
        val tab = Symtab.empty |> fold (fn name => Symtab.default (name, ())) keys
57006
blanchet
parents: 57005
diff changeset
  1122
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1123
        fun insert_new seen name = not (Symtab.defined seen name) ? insert (op =) name
57006
blanchet
parents: 57005
diff changeset
  1124
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1125
        fun num_keys keys = Graph.Keys.fold (K (Integer.add 1)) keys 0
57006
blanchet
parents: 57005
diff changeset
  1126
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1127
        fun find_maxes _ (maxs, []) = map snd maxs
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1128
          | find_maxes seen (maxs, new :: news) =
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1129
            find_maxes (seen |> num_keys (Graph.imm_succs G new) > 1 ? Symtab.default (new, ()))
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1130
              (if Symtab.defined tab new then
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1131
                 let
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1132
                   val newp = Graph.all_preds G [new]
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1133
                   fun is_ancestor x yp = member (op =) yp x
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1134
                   val maxs = maxs |> filter (fn (_, max) => not (is_ancestor max newp))
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1135
                 in
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1136
                   if exists (is_ancestor new o fst) maxs then (maxs, news)
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1137
                   else ((newp, new) :: filter_out (fn (_, max) => is_ancestor max newp) maxs, news)
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1138
                 end
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1139
               else
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1140
                 (maxs, Graph.Keys.fold (insert_new seen) (Graph.imm_preds G new) news))
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1141
      in
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1142
        find_maxes Symtab.empty ([],
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1143
          G |> Graph.restrict (not o String.isPrefix anonymous_proof_prefix) |> Graph.maximals)
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1144
      end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1145
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1146
fun maximal_wrt_access_graph _ [] = []
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1147
  | maximal_wrt_access_graph access_G ((fact as (_, th)) :: facts) =
60948
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
  1148
    let val thy_id = Thm.theory_id_of_thm th in
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
  1149
      fact :: filter_out (fn (_, th') =>
b710a5087116 prefer theory_id operations;
wenzelm
parents: 60649
diff changeset
  1150
        Context.proper_subthy_id (Thm.theory_id_of_thm th', thy_id)) facts
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1151
      |> map (nickname_of_thm o snd)
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1152
      |> maximal_wrt_graph access_G
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1153
    end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1154
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1155
fun is_fact_in_graph access_G = can (Graph.get_node access_G) o nickname_of_thm
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1156
53197
6c5e7143e1f6 tuned fudge factor in light of evaluation
blanchet
parents: 53159
diff changeset
  1157
val chained_feature_factor = 0.5 (* FUDGE *)
57405
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1158
val extra_feature_factor = 0.1 (* FUDGE *)
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1159
val num_extra_feature_facts = 10 (* FUDGE *)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1160
57660
86b853448f08 fixed sorting (broken since 9cc802a8ab06)
blanchet
parents: 57658
diff changeset
  1161
val max_proximity_facts = 100 (* FUDGE *)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1162
54060
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1163
fun find_mash_suggestions ctxt max_facts suggs facts chained raw_unknown =
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1164
  let
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1165
    val inter_fact = inter (eq_snd Thm.eq_thm_prop)
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1166
    val raw_mash = find_suggested_facts ctxt facts suggs
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1167
    val proximate = take max_proximity_facts facts
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1168
    val unknown_chained = inter_fact raw_unknown chained
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1169
    val unknown_proximate = inter_fact raw_unknown proximate
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1170
    val mess =
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1171
      [(0.9 (* FUDGE *), (map (rpair 1.0) unknown_chained, [])),
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1172
       (0.4 (* FUDGE *), (weight_facts_smoothly unknown_proximate, [])),
5f96d29fb7c2 removed pointless special case
blanchet
parents: 54012
diff changeset
  1173
       (0.1 (* FUDGE *), (weight_facts_steeply raw_mash, raw_unknown))]
57007
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
  1174
    val unknown = raw_unknown
d3eed0518882 started work on MaSh/SML
blanchet
parents: 57006
diff changeset
  1175
      |> fold (subtract (eq_snd Thm.eq_thm_prop)) [unknown_chained, unknown_proximate]
57006
blanchet
parents: 57005
diff changeset
  1176
  in
61322
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
  1177
    (mesh_facts (fact_distinct (op aconv)) (eq_snd (gen_eq_thm ctxt)) max_facts mess, unknown)
57006
blanchet
parents: 57005
diff changeset
  1178
  end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1179
60649
e007aa6a8aa2 more explicit use of context and elimination of Thm.theory_of_thm, although unclear (and untested?) situations remain;
wenzelm
parents: 60641
diff changeset
  1180
fun mash_suggested_facts ctxt thy_name ({debug, ...} : params) max_suggs hyp_ts concl_t facts =
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1181
  let
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
  1182
    val algorithm = the_mash_algorithm ()
57052
ea5912e3b008 until naive Bayes supports weights, don't incorporate 'extra' low-weight features
blanchet
parents: 57039
diff changeset
  1183
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1184
    val facts = facts
60641
f6e8293747fd clarified context;
wenzelm
parents: 60640
diff changeset
  1185
      |> rev_sort_list_prefix (crude_thm_ord ctxt o apply2 snd)
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1186
        (Int.max (num_extra_feature_facts, max_proximity_facts))
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1187
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1188
    val chained = filter (fn ((_, (scope, _)), _) => scope = Chained) facts
54085
b6b41e1d5689 optimized built-in const check
blanchet
parents: 54064
diff changeset
  1189
60638
16d80e5ef2dc tuned signature;
wenzelm
parents: 59888
diff changeset
  1190
    fun fact_has_right_theory (_, th) = thy_name = Thm.theory_name_of_thm th
56995
61855ade6c7e hide more consts to beautify documentation
blanchet
parents: 56303
diff changeset
  1191
53141
d27e99a6a679 take chained and proximate facts into consideration when computing MaSh features
blanchet
parents: 53140
diff changeset
  1192
    fun chained_or_extra_features_of factor (((_, stature), th), weight) =
59582
0fbed69ff081 tuned signature -- prefer qualified names;
wenzelm
parents: 59486
diff changeset
  1193
      [Thm.prop_of th]
60640
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
  1194
      |> features_of ctxt (Thm.theory_name_of_thm th) stature
57404
a68ae60c1504 got rid of hard-coded weights, now that we have TFIDF
blanchet
parents: 57403
diff changeset
  1195
      |> map (rpair (weight * factor))
54085
b6b41e1d5689 optimized built-in const check
blanchet
parents: 54064
diff changeset
  1196
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1197
    val {access_G, xtabs = ((num_facts, fact_tab), (num_feats, feat_tab)), ffds, freqs, ...} =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1198
      peek_state ctxt
57017
blanchet
parents: 57014
diff changeset
  1199
60640
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
  1200
    val goal_feats0 =
60649
e007aa6a8aa2 more explicit use of context and elimination of Thm.theory_of_thm, although unclear (and untested?) situations remain;
wenzelm
parents: 60641
diff changeset
  1201
      features_of ctxt thy_name (Local, General) (concl_t :: hyp_ts)
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1202
    val chained_feats = chained
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1203
      |> map (rpair 1.0)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1204
      |> map (chained_or_extra_features_of chained_feature_factor)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1205
      |> rpair [] |-> fold (union (eq_fst (op =)))
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1206
    val extra_feats = facts
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1207
      |> take (Int.max (0, num_extra_feature_facts - length chained))
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1208
      |> filter fact_has_right_theory
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1209
      |> weight_facts_steeply
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1210
      |> map (chained_or_extra_features_of extra_feature_factor)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1211
      |> rpair [] |-> fold (union (eq_fst (op =)))
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1212
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1213
    val goal_feats =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1214
      fold (union (eq_fst (op =))) [chained_feats, extra_feats] (map (rpair 1.0) goal_feats0)
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
  1215
      |> debug ? sort (Real.compare o swap o apply2 snd)
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1216
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1217
    val parents = maximal_wrt_access_graph access_G facts
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1218
    val visible_facts = map_filter (Symtab.lookup fact_tab) (Graph.all_preds access_G parents)
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1219
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1220
    val suggs =
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
  1221
      if algorithm = MaSh_NB_Ext orelse algorithm = MaSh_kNN_Ext then
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1222
        let
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1223
          val learns =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1224
            Graph.schedule (fn _ => fn (fact, (_, feats, deps)) => (fact, feats, deps)) access_G
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1225
        in
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
  1226
          MaSh.query_external ctxt algorithm max_suggs learns goal_feats
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1227
        end
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1228
      else
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1229
        let
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1230
          val int_goal_feats =
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1231
            map_filter (fn (s, w) => Option.map (rpair w) (Symtab.lookup feat_tab s)) goal_feats
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1232
        in
57532
c7dc1f0a2b8a tuned terminology
blanchet
parents: 57531
diff changeset
  1233
          MaSh.query_internal ctxt algorithm num_facts num_feats ffds freqs visible_facts max_suggs
57460
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1234
            goal_feats int_goal_feats
9cc802a8ab06 speed up MaSh a bit
blanchet
parents: 57459
diff changeset
  1235
        end
57017
blanchet
parents: 57014
diff changeset
  1236
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1237
    val unknown = filter_out (is_fact_in_graph access_G o snd) facts
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1238
  in
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1239
    find_mash_suggestions ctxt max_suggs suggs facts chained unknown
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
  1240
    |> apply2 (map fact_of_raw_fact)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1241
  end
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1242
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1243
fun mash_unlearn () = (clear_state (); writeln "Reset MaSh")
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1244
57661
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
  1245
fun learn_wrt_access_graph ctxt (name, parents, feats, deps)
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
  1246
    (accum as (access_G, (fact_xtab, feat_xtab))) =
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1247
  let
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1248
    fun maybe_learn_from from (accum as (parents, access_G)) =
57013
ed95456499e6 better way to take invisible facts into account than 'island' business
blanchet
parents: 57012
diff changeset
  1249
      try_graph ctxt "updating graph" accum (fn () =>
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1250
        (from :: parents, Graph.add_edge_acyclic (from, name) access_G))
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1251
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1252
    val access_G = access_G |> Graph.default_node (name, (Isar_Proof, feats, deps))
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1253
    val (parents, access_G) = ([], access_G) |> fold maybe_learn_from parents
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1254
    val (deps, _) = ([], access_G) |> fold maybe_learn_from deps
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1255
57661
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
  1256
    val fact_xtab = add_to_xtab name fact_xtab
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1257
    val feat_xtab = fold maybe_add_to_xtab feats feat_xtab
57006
blanchet
parents: 57005
diff changeset
  1258
  in
57661
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
  1259
    (SOME (name, parents, feats, deps), (access_G, (fact_xtab, feat_xtab)))
57006
blanchet
parents: 57005
diff changeset
  1260
  end
57661
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
  1261
  handle Symtab.DUP _ => (NONE, accum) (* facts sometimes have the same name, confusingly *)
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1262
57381
blanchet
parents: 57380
diff changeset
  1263
fun relearn_wrt_access_graph ctxt (name, deps) access_G =
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1264
  let
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1265
    fun maybe_relearn_from from (accum as (parents, access_G)) =
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1266
      try_graph ctxt "updating graph" accum (fn () =>
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1267
        (from :: parents, Graph.add_edge_acyclic (from, name) access_G))
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1268
    val access_G =
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1269
      access_G |> Graph.map_node name (fn (_, feats, _) => (Automatic_Proof, feats, deps))
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1270
    val (deps, _) = ([], access_G) |> fold maybe_relearn_from deps
57006
blanchet
parents: 57005
diff changeset
  1271
  in
57381
blanchet
parents: 57380
diff changeset
  1272
    ((name, deps), access_G)
57006
blanchet
parents: 57005
diff changeset
  1273
  end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1274
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1275
fun flop_wrt_access_graph name =
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1276
  Graph.map_node name (fn (_, feats, deps) => (Isar_Proof_wegen_Prover_Flop, feats, deps))
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1277
57273
01b68f625550 automatically learn MaSh facts also in 'blocking' mode
blanchet
parents: 57150
diff changeset
  1278
val learn_timeout_slack = 20.0
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1279
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1280
fun launch_thread timeout task =
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1281
  let
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1282
    val hard_timeout = time_mult learn_timeout_slack timeout
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1283
    val birth_time = Time.now ()
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
  1284
    val death_time = birth_time + hard_timeout
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1285
    val desc = ("Machine learner for Sledgehammer", "")
57006
blanchet
parents: 57005
diff changeset
  1286
  in
59471
ca459352d8c5 more explicit indication of Async_Manager_Legacy as Proof General legacy;
wenzelm
parents: 59172
diff changeset
  1287
    Async_Manager_Legacy.thread MaShN birth_time death_time desc task
57006
blanchet
parents: 57005
diff changeset
  1288
  end
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1289
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1290
fun anonymous_proof_name () =
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1291
  Date.fmt (anonymous_proof_prefix ^ "%Y%m%d.%H%M%S.") (Date.fromTimeLocal (Time.now ())) ^
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1292
  serial_string ()
57013
ed95456499e6 better way to take invisible facts into account than 'island' business
blanchet
parents: 57012
diff changeset
  1293
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1294
fun mash_learn_proof ctxt ({timeout, ...} : params) t facts used_ths =
57387
2b6fe2a48352 reintroduced MaSh hints, this time as persistent creatures
blanchet
parents: 57386
diff changeset
  1295
  if not (null used_ths) andalso is_mash_enabled () then
54816
10d48c2a3e32 made timeouts in Sledgehammer not be 'option's -- simplified lots of code
blanchet
parents: 54799
diff changeset
  1296
    launch_thread timeout (fn () =>
57006
blanchet
parents: 57005
diff changeset
  1297
      let
blanchet
parents: 57005
diff changeset
  1298
        val thy = Proof_Context.theory_of ctxt
60640
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
  1299
        val feats = features_of ctxt (Context.theory_name thy) (Local, General) [t]
60641
f6e8293747fd clarified context;
wenzelm
parents: 60640
diff changeset
  1300
        val facts = rev_sort_list_prefix (crude_thm_ord ctxt o apply2 snd) 1 facts
57006
blanchet
parents: 57005
diff changeset
  1301
      in
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1302
        map_state ctxt
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1303
          (fn {access_G, xtabs as ((num_facts0, _), _), ffds, freqs, dirty_facts} =>
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1304
             let
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1305
               val parents = maximal_wrt_access_graph access_G facts
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1306
               val deps = used_ths
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1307
                 |> filter (is_fact_in_graph access_G)
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1308
                 |> map nickname_of_thm
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1309
61318
6a5a188ab3e7 speed up MaSh
blanchet
parents: 61312
diff changeset
  1310
               val name = anonymous_proof_name ()
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1311
               val (access_G', xtabs', rev_learns) =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1312
                 add_node Automatic_Proof name parents feats deps (access_G, xtabs, [])
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1313
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1314
               val (ffds', freqs') =
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1315
                 recompute_ffds_freqs_from_learns (rev rev_learns) xtabs' num_facts0 ffds freqs
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1316
             in
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1317
               {access_G = access_G', xtabs = xtabs', ffds = ffds', freqs = freqs',
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1318
                dirty_facts = Option.map (cons name) dirty_facts}
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1319
             end);
57006
blanchet
parents: 57005
diff changeset
  1320
        (true, "")
blanchet
parents: 57005
diff changeset
  1321
      end)
53819
e55d641d0a70 honor MaSh's zero-overhead policy -- no learning if the tool is disabled
blanchet
parents: 53790
diff changeset
  1322
  else
e55d641d0a70 honor MaSh's zero-overhead policy -- no learning if the tool is disabled
blanchet
parents: 53790
diff changeset
  1323
    ()
53095
667717a5ad80 learn MaSh facts on the fly
blanchet
parents: 53094
diff changeset
  1324
63518
ae8fd6fe63a1 tuned signature;
wenzelm
parents: 62826
diff changeset
  1325
fun sendback sub = Active.sendback_markup_command (sledgehammerN ^ " " ^ sub)
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1326
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1327
val commit_timeout = seconds 30.0
48332
271a4a6af734 optimize parent computation in MaSh + remove temporary files
blanchet
parents: 48330
diff changeset
  1328
50485
3c6ac2da2f45 merge aliased theorems in MaSh dependencies, modulo symmetry of equality
blanchet
parents: 50484
diff changeset
  1329
(* The timeout is understood in a very relaxed fashion. *)
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1330
fun mash_learn_facts ctxt (params as {debug, verbose, ...}) prover auto_level run_prover
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1331
    learn_timeout facts =
48304
50e64af9c829 more work on MaSh
blanchet
parents: 48303
diff changeset
  1332
  let
48318
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1333
    val timer = Timer.startRealTimer ()
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
  1334
    fun next_commit_time () = Timer.checkRealTimer timer + commit_timeout
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1335
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1336
    val {access_G, ...} = peek_state ctxt
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1337
    val is_in_access_G = is_fact_in_graph access_G o snd
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1338
    val no_new_facts = forall is_in_access_G facts
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
  1339
  in
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1340
    if no_new_facts andalso not run_prover then
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1341
      if auto_level < 2 then
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1342
        "No new " ^ (if run_prover then "automatic" else "Isar") ^ " proofs to learn" ^
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
  1343
        (if auto_level = 0 andalso not run_prover then
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1344
           "\n\nHint: Try " ^ sendback learn_proverN ^ " to learn from an automatic prover"
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1345
         else
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1346
           "")
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1347
      else
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1348
        ""
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
  1349
    else
48304
50e64af9c829 more work on MaSh
blanchet
parents: 48303
diff changeset
  1350
      let
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1351
        val name_tabs = build_name_tables nickname_of_thm facts
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1352
48439
67a6bcbd3587 removed MaSh junk arising from primrec definitions
blanchet
parents: 48438
diff changeset
  1353
        fun deps_of status th =
57017
blanchet
parents: 57014
diff changeset
  1354
          if status = Non_Rec_Def orelse status = Rec_Def then
48439
67a6bcbd3587 removed MaSh junk arising from primrec definitions
blanchet
parents: 48438
diff changeset
  1355
            SOME []
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
  1356
          else if run_prover then
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1357
            prover_dependencies_of ctxt params prover auto_level facts name_tabs th
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1358
            |> (fn (false, _) => NONE | (true, deps) => trim_dependencies deps)
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1359
          else
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1360
            isar_dependencies_of name_tabs th
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1361
48669
cdcdb0547f29 remember ATP flops to avoid repeating them too quickly
blanchet
parents: 48668
diff changeset
  1362
        fun do_commit [] [] [] state = state
57382
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1363
          | do_commit learns relearns flops
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1364
              {access_G, xtabs as ((num_facts0, _), _), ffds, freqs, dirty_facts} =
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1365
            let
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
  1366
              val was_empty = Graph.is_empty access_G
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
  1367
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1368
              val (learns, (access_G', xtabs')) =
57381
blanchet
parents: 57380
diff changeset
  1369
                fold_map (learn_wrt_access_graph ctxt) learns (access_G, xtabs)
57661
1586d0479f2c eliminated source of 'DUP's in MaSh
blanchet
parents: 57660
diff changeset
  1370
                |>> map_filter I
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1371
              val (relearns, access_G'') =
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1372
                fold_map (relearn_wrt_access_graph ctxt) relearns access_G'
57371
0b2bce982afd store string-to-index tables in memory
blanchet
parents: 57370
diff changeset
  1373
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1374
              val access_G''' = access_G'' |> fold flop_wrt_access_graph flops
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1375
              val dirty_facts' =
57365
blanchet
parents: 57364
diff changeset
  1376
                (case (was_empty, dirty_facts) of
57062
eb6777796782 avoid slow inspection of proof terms now that dependencies are stored in 'state'
blanchet
parents: 57061
diff changeset
  1377
                  (false, SOME names) => SOME (map #1 learns @ map #1 relearns @ names)
55286
blanchet
parents: 55202
diff changeset
  1378
                | _ => NONE)
57378
fe96689f393b recompute learning data at learning time, not query time
blanchet
parents: 57377
diff changeset
  1379
57382
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1380
              val (ffds', freqs') =
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1381
                if null relearns then
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1382
                  recompute_ffds_freqs_from_learns
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1383
                    (map (fn (name, _, feats, deps) => (name, feats, deps)) learns) xtabs' num_facts0
57382
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1384
                    ffds freqs
6c6a0ac70eac incremental learning when learing several facts
blanchet
parents: 57381
diff changeset
  1385
                else
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1386
                  recompute_ffds_freqs_from_access_G access_G''' xtabs'
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1387
            in
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1388
              {access_G = access_G''', xtabs = xtabs', ffds = ffds', freqs = freqs',
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1389
               dirty_facts = dirty_facts'}
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1390
            end
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1391
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1392
        fun commit last learns relearns flops =
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1393
          (if debug andalso auto_level = 0 then writeln "Committing..." else ();
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1394
           map_state ctxt (do_commit (rev learns) relearns flops);
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1395
           if not last andalso auto_level = 0 then
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1396
             let val num_proofs = length learns + length relearns in
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1397
               writeln ("Learned " ^ string_of_int num_proofs ^ " " ^
57017
blanchet
parents: 57014
diff changeset
  1398
                 (if run_prover then "automatic" else "Isar") ^ " proof" ^
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1399
                 plural_s num_proofs ^ " in the last " ^ string_of_time commit_timeout)
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1400
             end
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1401
           else
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1402
             ())
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1403
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1404
        fun learn_new_fact _ (accum as (_, (_, _, true))) = accum
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1405
          | learn_new_fact (parents, ((_, stature as (_, status)), th))
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1406
              (learns, (num_nontrivial, next_commit, _)) =
48318
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1407
            let
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1408
              val name = nickname_of_thm th
60640
58326c4a3ea2 tuned signature;
wenzelm
parents: 60638
diff changeset
  1409
              val feats = features_of ctxt (Thm.theory_name_of_thm th) stature [Thm.prop_of th]
57757
blanchet
parents: 57735
diff changeset
  1410
              val deps = these (deps_of status th)
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1411
              val num_nontrivial = num_nontrivial |> not (null deps) ? Integer.add 1
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1412
              val learns = (name, parents, feats, deps) :: learns
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1413
              val (learns, next_commit) =
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
  1414
                if Timer.checkRealTimer timer > next_commit then
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1415
                  (commit false learns [] []; ([], next_commit_time ()))
48392
ca998fa08cd9 added "learn_from_atp" command to MaSh, for patient users
blanchet
parents: 48390
diff changeset
  1416
                else
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1417
                  (learns, next_commit)
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
  1418
              val timed_out = Timer.checkRealTimer timer > learn_timeout
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1419
            in
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1420
              (learns, (num_nontrivial, next_commit, timed_out))
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1421
            end
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1422
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1423
        val (num_new_facts, num_nontrivial) =
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1424
          if no_new_facts then
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1425
            (0, 0)
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1426
          else
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1427
            let
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1428
              val new_facts = facts
60641
f6e8293747fd clarified context;
wenzelm
parents: 60640
diff changeset
  1429
                |> sort (crude_thm_ord ctxt o apply2 snd)
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1430
                |> attach_parents_to_facts []
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1431
                |> filter_out (is_in_access_G o snd)
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1432
              val (learns, (num_nontrivial, _, _)) =
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1433
                ([], (0, next_commit_time (), false))
51182
962190eab40d provide two modes for MaSh driver: linearized or real visibility
blanchet
parents: 51181
diff changeset
  1434
                |> fold learn_new_fact new_facts
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1435
            in
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1436
              commit true learns [] []; (length new_facts, num_nontrivial)
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1437
            end
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1438
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1439
        fun relearn_old_fact _ (accum as (_, (_, _, true))) = accum
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1440
          | relearn_old_fact ((_, (_, status)), th)
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1441
              ((relearns, flops), (num_nontrivial, next_commit, _)) =
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1442
            let
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1443
              val name = nickname_of_thm th
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1444
              val (num_nontrivial, relearns, flops) =
55286
blanchet
parents: 55202
diff changeset
  1445
                (case deps_of status th of
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1446
                  SOME deps => (num_nontrivial + 1, (name, deps) :: relearns, flops)
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1447
                | NONE => (num_nontrivial, relearns, name :: flops))
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1448
              val (relearns, flops, next_commit) =
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
  1449
                if Timer.checkRealTimer timer > next_commit then
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1450
                  (commit false [] relearns flops; ([], [], next_commit_time ()))
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1451
                else
50631
b69079c14665 tuned ML function names
blanchet
parents: 50624
diff changeset
  1452
                  (relearns, flops, next_commit)
62826
eb94e570c1a4 prefer infix operations;
wenzelm
parents: 62735
diff changeset
  1453
              val timed_out = Timer.checkRealTimer timer > learn_timeout
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1454
            in
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1455
              ((relearns, flops), (num_nontrivial, next_commit, timed_out))
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1456
            end
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1457
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1458
        val num_nontrivial =
51177
e8c9755fd14e tuned code: factored out parent computation
blanchet
parents: 51176
diff changeset
  1459
          if not run_prover then
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1460
            num_nontrivial
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1461
          else
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1462
            let
48668
5d63c23b4042 remember which MaSh proofs were found using ATPs
blanchet
parents: 48667
diff changeset
  1463
              val max_isar = 1000 * max_dependencies
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1464
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1465
              fun priority_of th =
59172
d1c500e0a722 separate module Random;
wenzelm
parents: 59058
diff changeset
  1466
                Random.random_range 0 max_isar +
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1467
                (case try (Graph.get_node access_G) (nickname_of_thm th) of
57062
eb6777796782 avoid slow inspection of proof terms now that dependencies are stored in 'state'
blanchet
parents: 57061
diff changeset
  1468
                  SOME (Isar_Proof, _, deps) => ~100 * length deps
eb6777796782 avoid slow inspection of proof terms now that dependencies are stored in 'state'
blanchet
parents: 57061
diff changeset
  1469
                | SOME (Automatic_Proof, _, _) => 2 * max_isar
eb6777796782 avoid slow inspection of proof terms now that dependencies are stored in 'state'
blanchet
parents: 57061
diff changeset
  1470
                | SOME (Isar_Proof_wegen_Prover_Flop, _, _) => max_isar
eb6777796782 avoid slow inspection of proof terms now that dependencies are stored in 'state'
blanchet
parents: 57061
diff changeset
  1471
                | NONE => 0)
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1472
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1473
              val old_facts = facts
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1474
                |> filter is_in_access_G
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1475
                |> map (`(priority_of o snd))
59058
a78612c67ec0 renamed "pairself" to "apply2", in accordance to @{apply 2};
wenzelm
parents: 58919
diff changeset
  1476
                |> sort (int_ord o apply2 fst)
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1477
                |> map snd
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1478
              val ((relearns, flops), (num_nontrivial, _, _)) =
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1479
                (([], []), (num_nontrivial, next_commit_time (), false))
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1480
                |> fold relearn_old_fact old_facts
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1481
            in
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1482
              commit true [] relearns flops; num_nontrivial
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1483
            end
48318
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1484
      in
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1485
        if verbose orelse auto_level < 2 then
57385
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1486
          "Learned " ^ string_of_int num_new_facts ^ " fact" ^ plural_s num_new_facts ^ " and " ^
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1487
          string_of_int num_nontrivial ^ " nontrivial " ^
0eb0c7b93676 tuned output
blanchet
parents: 57384
diff changeset
  1488
          (if run_prover then "automatic and " else "") ^ "Isar proof" ^ plural_s num_nontrivial ^
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1489
          (if verbose then " in " ^ string_of_time (Timer.checkRealTimer timer) else "")
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
  1490
        else
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
  1491
          ""
48318
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1492
      end
48308
89674e5a4d35 make tracing an option
blanchet
parents: 48306
diff changeset
  1493
  end
48304
50e64af9c829 more work on MaSh
blanchet
parents: 48303
diff changeset
  1494
54123
271a8377656f remove overloading of "max_facts" -- it already controls the number of facts passed to ATPs for 'learn_prover'
blanchet
parents: 54115
diff changeset
  1495
fun mash_learn ctxt (params as {provers, timeout, ...}) fact_override chained run_prover =
48316
252f45c04042 drastic overhaul of MaSh data structures + fixed a few performance issues
blanchet
parents: 48315
diff changeset
  1496
  let
48396
dd82d190c2af name tuning
blanchet
parents: 48395
diff changeset
  1497
    val css = Sledgehammer_Fact.clasimpset_rule_table_of ctxt
48395
85a7fb65507a learning should honor the fact override and the chained facts
blanchet
parents: 48394
diff changeset
  1498
    val ctxt = ctxt |> Config.put instantiate_inducts false
58919
82a71046dce8 prefer explicit Keyword.keywords;
wenzelm
parents: 58843
diff changeset
  1499
    val facts =
82a71046dce8 prefer explicit Keyword.keywords;
wenzelm
parents: 58843
diff changeset
  1500
      nearly_all_facts ctxt false fact_override Keyword.empty_keywords css chained [] @{prop True}
60641
f6e8293747fd clarified context;
wenzelm
parents: 60640
diff changeset
  1501
      |> sort (crude_thm_ord ctxt o apply2 snd o swap)
48404
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1502
    val num_facts = length facts
0a261b4aa093 relearn ATP proofs
blanchet
parents: 48403
diff changeset
  1503
    val prover = hd provers
57006
blanchet
parents: 57005
diff changeset
  1504
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
  1505
    fun learn auto_level run_prover =
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1506
      mash_learn_facts ctxt params prover auto_level run_prover one_year facts
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1507
      |> writeln
48316
252f45c04042 drastic overhaul of MaSh data structures + fixed a few performance issues
blanchet
parents: 48315
diff changeset
  1508
  in
50484
8ec31bdb9d36 adopt the neutral "prover" terminology for MaSh rather than the ambiguous/wrong ATP terminology (which sometimes excludes SMT solvers)
blanchet
parents: 50450
diff changeset
  1509
    if run_prover then
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1510
      (writeln ("MaShing through " ^ string_of_int num_facts ^ " fact" ^
57017
blanchet
parents: 57014
diff changeset
  1511
         plural_s num_facts ^ " for automatic proofs (" ^ quote prover ^ " timeout: " ^
blanchet
parents: 57014
diff changeset
  1512
         string_of_time timeout ^ ").\n\nCollecting Isar proofs first...");
50340
72519bf5f135 simplify MaSh term patterns + add missing global facts if there aren't too many
blanchet
parents: 50339
diff changeset
  1513
       learn 1 false;
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1514
       writeln "Now collecting automatic proofs\n\
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1515
         \This may take several hours; you can safely stop the learning process at any point";
50340
72519bf5f135 simplify MaSh term patterns + add missing global facts if there aren't too many
blanchet
parents: 50339
diff changeset
  1516
       learn 0 true)
72519bf5f135 simplify MaSh term patterns + add missing global facts if there aren't too many
blanchet
parents: 50339
diff changeset
  1517
    else
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1518
      (writeln ("MaShing through " ^ string_of_int num_facts ^ " fact" ^
57006
blanchet
parents: 57005
diff changeset
  1519
         plural_s num_facts ^ " for Isar proofs...");
50340
72519bf5f135 simplify MaSh term patterns + add missing global facts if there aren't too many
blanchet
parents: 50339
diff changeset
  1520
       learn 0 false)
48316
252f45c04042 drastic overhaul of MaSh data structures + fixed a few performance issues
blanchet
parents: 48315
diff changeset
  1521
  end
48249
2bd242c56c90 dummy implementation
blanchet
parents: 48248
diff changeset
  1522
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1523
fun mash_can_suggest_facts ctxt = not (Graph.is_empty (#access_G (peek_state ctxt)))
50311
c9d7ccd090e1 tuned order of functions
blanchet
parents: 50310
diff changeset
  1524
57274
0acfdb151e52 more generous formula -- there are lots of duplicates out there
blanchet
parents: 57273
diff changeset
  1525
(* Generate more suggestions than requested, because some might be thrown out later for various
0acfdb151e52 more generous formula -- there are lots of duplicates out there
blanchet
parents: 57273
diff changeset
  1526
   reasons (e.g., duplicates). *)
0acfdb151e52 more generous formula -- there are lots of duplicates out there
blanchet
parents: 57273
diff changeset
  1527
fun generous_max_suggestions max_facts = 3 * max_facts div 2 + 25
50383
4274b25ff4e7 take proximity into account for MaSh + fix a debilitating bug in feature generation
blanchet
parents: 50382
diff changeset
  1528
50814
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
  1529
val mepo_weight = 0.5
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
  1530
val mash_weight = 0.5
4247cbd78aaf export MeSh data as well
blanchet
parents: 50755
diff changeset
  1531
53152
cbd3c7c48d2c learn new facts on query if there aren't too many of them in MaSh
blanchet
parents: 53150
diff changeset
  1532
val max_facts_to_learn_before_query = 100
cbd3c7c48d2c learn new facts on query if there aren't too many of them in MaSh
blanchet
parents: 53150
diff changeset
  1533
57005
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1534
(* The threshold should be large enough so that MaSh does not get activated for Auto Sledgehammer
33f3d2ea803d store all MaSh data on the Isabelle side, in preparation for replacing 'mash.py' with ML solution
blanchet
parents: 56995
diff changeset
  1535
   and Try. *)
48318
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1536
val min_secs_for_learning = 15
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1537
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1538
fun relevant_facts ctxt (params as {verbose, learn, fact_filter, timeout, ...}) prover
57405
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1539
    max_facts ({add, only, ...} : fact_override) hyp_ts concl_t facts =
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
  1540
  if not (subset (op =) (the_list fact_filter, fact_filters)) then
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1541
    error ("Unknown fact filter: " ^ quote (the fact_filter))
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
  1542
  else if only then
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
  1543
    [("", map fact_of_raw_fact facts)]
48321
c552d7f1720b learn from minimized ATP proofs
blanchet
parents: 48320
diff changeset
  1544
  else if max_facts <= 0 orelse null facts then
51010
afd0213a3dab tuned data structure
blanchet
parents: 51008
diff changeset
  1545
    [("", [])]
48288
255c6e1fd505 rationalize relevance filter, slowing moving code from Iter to MaSh
blanchet
parents: 48251
diff changeset
  1546
  else
255c6e1fd505 rationalize relevance filter, slowing moving code from Iter to MaSh
blanchet
parents: 48251
diff changeset
  1547
    let
60649
e007aa6a8aa2 more explicit use of context and elimination of Thm.theory_of_thm, although unclear (and untested?) situations remain;
wenzelm
parents: 60641
diff changeset
  1548
      val thy_name = Context.theory_name (Proof_Context.theory_of ctxt)
57407
140e16bc2a40 use right theory name for theorems in evaluation driver
blanchet
parents: 57406
diff changeset
  1549
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1550
      fun maybe_launch_thread exact min_num_facts_to_learn =
59471
ca459352d8c5 more explicit indication of Async_Manager_Legacy as Proof General legacy;
wenzelm
parents: 59172
diff changeset
  1551
        if not (Async_Manager_Legacy.has_running_threads MaShN) andalso
54816
10d48c2a3e32 made timeouts in Sledgehammer not be 'option's -- simplified lots of code
blanchet
parents: 54799
diff changeset
  1552
           Time.toSeconds timeout >= min_secs_for_learning then
10d48c2a3e32 made timeouts in Sledgehammer not be 'option's -- simplified lots of code
blanchet
parents: 54799
diff changeset
  1553
          let val timeout = time_mult learn_timeout_slack timeout in
57405
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1554
            (if verbose then
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1555
               writeln ("Started MaShing through " ^
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1556
                 (if exact then "" else "at least ") ^ string_of_int min_num_facts_to_learn ^
63692
1bc4bc2c9fd1 killed final stops in Sledgehammer and friends
blanchet
parents: 63518
diff changeset
  1557
                 " fact" ^ plural_s min_num_facts_to_learn ^ " in the background")
57405
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1558
             else
1f49da059947 reintroduced 'extra features' + only print message in verbose mode
blanchet
parents: 57404
diff changeset
  1559
               ());
54816
10d48c2a3e32 made timeouts in Sledgehammer not be 'option's -- simplified lots of code
blanchet
parents: 54799
diff changeset
  1560
            launch_thread timeout
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1561
              (fn () => (true, mash_learn_facts ctxt params prover 2 false timeout facts))
48319
340187063d84 use async manager to manage MaSh learners to make sure they get killed cleanly
blanchet
parents: 48318
diff changeset
  1562
          end
48318
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1563
        else
325c8fd0d762 more consolidation of MaSh code
blanchet
parents: 48316
diff changeset
  1564
          ()
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
  1565
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1566
      fun please_learn () =
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1567
        let
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1568
          val {access_G, xtabs = ((num_facts0, _), _), ...} = peek_state ctxt
61321
c982a4cc8dc4 sped up MaSh nickname generation
blanchet
parents: 61318
diff changeset
  1569
          val is_in_access_G = is_fact_in_graph access_G o snd
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1570
          val min_num_facts_to_learn = length facts - num_facts0
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1571
        in
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1572
          if min_num_facts_to_learn <= max_facts_to_learn_before_query then
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1573
            (case length (filter_out is_in_access_G facts) of
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1574
              0 => ()
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1575
            | num_facts_to_learn =>
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1576
              if num_facts_to_learn <= max_facts_to_learn_before_query then
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1577
                mash_learn_facts ctxt params prover 2 false timeout facts
58843
521cea5fa777 discontinued obsolete Output.urgent_message;
wenzelm
parents: 57757
diff changeset
  1578
                |> (fn "" => () | s => writeln (MaShN ^ ": " ^ s))
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1579
              else
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1580
                maybe_launch_thread true num_facts_to_learn)
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1581
          else
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1582
            maybe_launch_thread false min_num_facts_to_learn
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1583
        end
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1584
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1585
      val mash_enabled = is_mash_enabled ()
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1586
      val _ =
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1587
        if learn andalso mash_enabled andalso fact_filter <> SOME mepoN then please_learn () else ()
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
  1588
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1589
      val effective_fact_filter =
55286
blanchet
parents: 55202
diff changeset
  1590
        (case fact_filter of
57431
02c408aed5ee killed Python version of MaSh, now that the SML version works adequately
blanchet
parents: 57409
diff changeset
  1591
          SOME ff => ff
57555
f60d70566525 also learn when 'fact_filter =' is set explicitly
blanchet
parents: 57554
diff changeset
  1592
        | NONE => if mash_enabled andalso mash_can_suggest_facts ctxt then meshN else mepoN)
54143
18def1c73c79 make sure add: doesn't add duplicates, and works for [no_atp] facts
blanchet
parents: 54141
diff changeset
  1593
18def1c73c79 make sure add: doesn't add duplicates, and works for [no_atp] facts
blanchet
parents: 54141
diff changeset
  1594
      val unique_facts = drop_duplicate_facts facts
48288
255c6e1fd505 rationalize relevance filter, slowing moving code from Iter to MaSh
blanchet
parents: 48251
diff changeset
  1595
      val add_ths = Attrib.eval_thms ctxt add
54143
18def1c73c79 make sure add: doesn't add duplicates, and works for [no_atp] facts
blanchet
parents: 54141
diff changeset
  1596
51004
5f2788c38127 distinguish raw and non-raw facts, using raw for 10 000s of facts and non-raw after selection of some hundreds
blanchet
parents: 51003
diff changeset
  1597
      fun in_add (_, th) = member Thm.eq_thm_prop add_ths th
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
  1598
51003
198cb05fb35b report (MeSh, MePo, MaSh) triple, to be able to use different filters in different slices
blanchet
parents: 51001
diff changeset
  1599
      fun add_and_take accepts =
198cb05fb35b report (MeSh, MePo, MaSh) triple, to be able to use different filters in different slices
blanchet
parents: 51001
diff changeset
  1600
        (case add_ths of
198cb05fb35b report (MeSh, MePo, MaSh) triple, to be able to use different filters in different slices
blanchet
parents: 51001
diff changeset
  1601
           [] => accepts
57150
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
  1602
         | _ =>
f591096a9c94 add option to keep duplicates, for more precise evaluation of relevance filters
blanchet
parents: 57133
diff changeset
  1603
           (unique_facts |> filter in_add |> map fact_of_raw_fact) @ (accepts |> filter_out in_add))
48293
914ca0827804 renamed Sledgehammer options
blanchet
parents: 48292
diff changeset
  1604
        |> take max_facts
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
  1605
48406
b002cc16aa99 honor suggested MaSh weights
blanchet
parents: 48404
diff changeset
  1606
      fun mepo () =
54143
18def1c73c79 make sure add: doesn't add duplicates, and works for [no_atp] facts
blanchet
parents: 54141
diff changeset
  1607
        (mepo_suggested_facts ctxt params max_facts NONE hyp_ts concl_t unique_facts
54091
4df62d7eae34 parallelize MeSh
blanchet
parents: 54089
diff changeset
  1608
         |> weight_facts_steeply, [])
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
  1609
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
  1610
      fun mash () =
61322
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
  1611
        mash_suggested_facts ctxt thy_name params (generous_max_suggestions max_facts) hyp_ts
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
  1612
          concl_t facts
53140
a1235e90da5f pour extra features from proximate facts into goal, in exporter
blanchet
parents: 53137
diff changeset
  1613
        |>> weight_facts_steeply
57018
142950e9c7e2 more flexible environment variable
blanchet
parents: 57017
diff changeset
  1614
48314
ee33ba3c0e05 added option to control which fact filter is used
blanchet
parents: 48313
diff changeset
  1615
      val mess =
51003
198cb05fb35b report (MeSh, MePo, MaSh) triple, to be able to use different filters in different slices
blanchet
parents: 51001
diff changeset
  1616
        (* the order is important for the "case" expression below *)
54091
4df62d7eae34 parallelize MeSh
blanchet
parents: 54089
diff changeset
  1617
        [] |> effective_fact_filter <> mepoN ? cons (mash_weight, mash)
4df62d7eae34 parallelize MeSh
blanchet
parents: 54089
diff changeset
  1618
           |> effective_fact_filter <> mashN ? cons (mepo_weight, mepo)
4df62d7eae34 parallelize MeSh
blanchet
parents: 54089
diff changeset
  1619
           |> Par_List.map (apsnd (fn f => f ()))
61322
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
  1620
      val mesh =
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
  1621
        mesh_facts (fact_distinct (op aconv)) (eq_snd (gen_eq_thm ctxt)) max_facts mess
44f4ffe2b210 speed up MaSh duplicate check
blanchet
parents: 61321
diff changeset
  1622
        |> add_and_take
48288
255c6e1fd505 rationalize relevance filter, slowing moving code from Iter to MaSh
blanchet
parents: 48251
diff changeset
  1623
    in
55286
blanchet
parents: 55202
diff changeset
  1624
      (case (fact_filter, mess) of
51024
98fb341d32e3 distinguish MeSh and smart -- with smart, allow combinations of MaSh, MeSh, and MePo in different slices -- and use MaSh also with SMT solvers, based on evaluation
blanchet
parents: 51020
diff changeset
  1625
        (NONE, [(_, (mepo, _)), (_, (mash, _))]) =>
57658
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1626
        [(meshN, mesh),
f55c173a3cbc beware of duplicate fact names
blanchet
parents: 57574
diff changeset
  1627
         (mepoN, mepo |> map fst |> add_and_take),
51010
afd0213a3dab tuned data structure
blanchet
parents: 51008
diff changeset
  1628
         (mashN, mash |> map fst |> add_and_take)]
55286
blanchet
parents: 55202
diff changeset
  1629
      | _ => [(effective_fact_filter, mesh)])
48288
255c6e1fd505 rationalize relevance filter, slowing moving code from Iter to MaSh
blanchet
parents: 48251
diff changeset
  1630
    end
255c6e1fd505 rationalize relevance filter, slowing moving code from Iter to MaSh
blanchet
parents: 48251
diff changeset
  1631
48248
b6eb45a52c28 split relevance filter code into three files
blanchet
parents:
diff changeset
  1632
end;