author | wenzelm |
Tue, 24 Sep 2024 21:31:20 +0200 | |
changeset 80946 | b76f64d7d493 |
parent 70923 | 98d9b78b7f47 |
permissions | -rw-r--r-- |
59720 | 1 |
(* Title: HOL/TPTP/TPTP_Proof_Reconstruction_Test.thy |
55596 | 2 |
Author: Nik Sultana, Cambridge University Computer Laboratory |
3 |
||
4 |
Various tests for the proof reconstruction module. |
|
5 |
||
6 |
NOTE |
|
7 |
- looks for THF proofs in the path indicated by $THF_PROOFS |
|
55597
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
8 |
- these proofs are generated using LEO-II with the following |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
9 |
configuration choices: -po 1 -nux -nuc --expand_extuni |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
10 |
You can simply filter LEO-II's output using the filter_proof |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
11 |
script which is distributed with LEO-II. |
55596 | 12 |
*) |
13 |
||
14 |
theory TPTP_Proof_Reconstruction_Test |
|
15 |
imports TPTP_Test TPTP_Proof_Reconstruction |
|
16 |
begin |
|
17 |
||
55597
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
18 |
section "Main function" |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
19 |
text "This function wraps all the reconstruction-related functions. Simply |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
20 |
point it at a THF proof produced by LEO-II (using the configuration described |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
21 |
above), and it will try to reconstruct it into an Isabelle/HOL theorem. |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
22 |
It also returns the theory (into which the axioms and definitions used in the |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
23 |
proof have been imported), but note that you can get the theory value from |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
24 |
the theorem value." |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
25 |
|
63167 | 26 |
ML \<open> |
55597
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
27 |
reconstruct_leo2 (Path.explode "$THF_PROOFS/NUM667^1.p.out") @{theory} |
63167 | 28 |
\<close> |
55597
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
29 |
|
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
30 |
section "Prep for testing the component functions" |
25d7b485df81
added a function that carries out all the reconstruction steps, for improved usability;
sultana
parents:
55596
diff
changeset
|
31 |
|
55596 | 32 |
declare [[ |
33 |
tptp_trace_reconstruction = false, |
|
34 |
tptp_test_all = false, |
|
35 |
(* tptp_test_all = true, *) |
|
36 |
tptp_test_timeout = 30, |
|
37 |
(* tptp_max_term_size = 200 *) |
|
38 |
tptp_max_term_size = 0 |
|
39 |
]] |
|
40 |
||
60926 | 41 |
declare [[ML_exception_trace, ML_print_depth = 200]] |
55596 | 42 |
|
43 |
||
44 |
section "Importing proofs" |
|
45 |
||
63167 | 46 |
ML \<open> |
55596 | 47 |
val probs = |
48 |
(* "$THF_PROOFS/SYN991^1.p.out" *) (*lacks conjecture*) |
|
49 |
(* "$THF_PROOFS/SYO040^2.p.out" *) |
|
50 |
(* "$THF_PROOFS/NUM640^1.p.out" *) |
|
51 |
(* "$THF_PROOFS/SEU553^2.p.out" *) |
|
52 |
(* "$THF_PROOFS/NUM665^1.p.out" *) |
|
53 |
(* "$THF_PROOFS/SEV161^5.p.out" *) |
|
54 |
(* "$THF_PROOFS/SET014^4.p.out" *) |
|
55 |
"$THF_PROOFS/NUM667^1.p.out" |
|
56 |
|> Path.explode |
|
57 |
|> single |
|
58 |
||
59 |
val prob_names = |
|
60 |
probs |
|
69366 | 61 |
|> map (Path.file_name #> TPTP_Problem_Name.Nonstandard) |
63167 | 62 |
\<close> |
55596 | 63 |
|
63167 | 64 |
setup \<open> |
55596 | 65 |
if test_all @{context} then I |
66 |
else |
|
67 |
fold |
|
68 |
(fn path => |
|
69 |
TPTP_Reconstruct.import_thm true [Path.dir path, Path.explode "$THF_PROOFS"] path leo2_on_load) |
|
70 |
probs |
|
63167 | 71 |
\<close> |
55596 | 72 |
|
73 |
text "Display nicely." |
|
63167 | 74 |
ML \<open> |
55596 | 75 |
fun display_nicely ctxt (fms : TPTP_Reconstruct.formula_meaning list) = |
76 |
List.app (fn ((n, data) : TPTP_Reconstruct.formula_meaning) => |
|
77 |
Pretty.writeln |
|
78 |
(Pretty.block |
|
79 |
[Pretty.str (n ^ " "), |
|
80 |
Syntax.pretty_term ctxt (#fmla data), |
|
81 |
Pretty.str ( |
|
82 |
if is_none (#source_inf_opt data) then "" |
|
83 |
else ("\n\tannotation: " ^ |
|
60925 | 84 |
@{make_string} (the (#source_inf_opt data : TPTP_Proof.source_info option))))]) |
55596 | 85 |
) (rev fms); |
86 |
||
87 |
(*FIXME hack for testing*) |
|
88 |
fun test_fmla thy = |
|
89 |
TPTP_Reconstruct.get_fmlas_of_prob thy (hd prob_names); |
|
90 |
||
91 |
fun test_pannot thy = |
|
92 |
TPTP_Reconstruct.get_pannot_of_prob thy (hd prob_names); |
|
93 |
||
94 |
if test_all @{context} orelse prob_names = [] then () |
|
95 |
else |
|
96 |
display_nicely @{context} |
|
97 |
(#meta (test_pannot @{theory})) |
|
98 |
(* To look at the original proof (i.e., before the proof transformations applied |
|
99 |
when the proof is loaded) replace previous line with: |
|
100 |
(test_fmla @{theory} |
|
101 |
|> map TPTP_Reconstruct.structure_fmla_meaning) |
|
102 |
*) |
|
63167 | 103 |
\<close> |
55596 | 104 |
|
63167 | 105 |
ML \<open> |
55596 | 106 |
fun step_range_tester f_x f_exn ctxt prob_name from until = |
107 |
let |
|
108 |
val max = |
|
109 |
case until of |
|
110 |
SOME x => x |
|
111 |
| NONE => |
|
112 |
if is_some Int.maxInt then the Int.maxInt else 999999 |
|
113 |
fun test_step x = |
|
114 |
if x > max then () |
|
115 |
else |
|
116 |
(f_x x; |
|
117 |
(interpret_leo2_inference ctxt prob_name (Int.toString x); ()) |
|
118 |
handle e => f_exn e; (*FIXME naive. should let Interrupt through*) |
|
119 |
(*assumes that inferences are numbered consecutively*) |
|
120 |
test_step (x + 1)) |
|
121 |
in |
|
122 |
test_step from |
|
123 |
end |
|
124 |
||
125 |
val step_range_tester_tracing = |
|
126 |
step_range_tester |
|
127 |
(fn x => tracing ("@step " ^ Int.toString x)) |
|
60925 | 128 |
(fn e => tracing ("!!" ^ @{make_string} e)) |
63167 | 129 |
\<close> |
55596 | 130 |
|
63167 | 131 |
ML \<open> |
55596 | 132 |
(*try to reconstruct each inference step*) |
133 |
if test_all @{context} orelse prob_names = [] |
|
134 |
orelse true (*NOTE currently disabled*) |
|
135 |
then () |
|
136 |
else |
|
137 |
let |
|
138 |
(*FIXME not guaranteed to be the right nodes*) |
|
139 |
val heur_start = 3 |
|
140 |
val heur_end = |
|
141 |
hd (#meta (test_pannot @{theory})) |
|
142 |
|> #1 |
|
143 |
|> Int.fromString |
|
144 |
in |
|
145 |
step_range_tester_tracing @{context} (hd prob_names) heur_start heur_end |
|
146 |
end |
|
63167 | 147 |
\<close> |
55596 | 148 |
|
149 |
||
150 |
section "Building metadata and tactics" |
|
151 |
||
152 |
subsection "Building the skeleton" |
|
63167 | 153 |
ML \<open> |
55596 | 154 |
if test_all @{context} orelse prob_names = [] then [] |
155 |
else TPTP_Reconstruct.make_skeleton @{context} (test_pannot @{theory}); |
|
156 |
||
157 |
length it |
|
63167 | 158 |
\<close> |
55596 | 159 |
|
160 |
||
161 |
subsection "The 'one shot' tactic approach" |
|
63167 | 162 |
ML \<open> |
55596 | 163 |
val the_tactic = |
164 |
if test_all @{context} then [] |
|
165 |
else |
|
166 |
map (fn prob_name => |
|
167 |
(TPTP_Reconstruct.naive_reconstruct_tac @{context} interpret_leo2_inference (* auto_based_reconstruction_tac *) (* oracle_based_reconstruction_tac *) prob_name)) |
|
168 |
prob_names; |
|
63167 | 169 |
\<close> |
55596 | 170 |
|
171 |
||
172 |
subsection "The 'piecemeal' approach" |
|
63167 | 173 |
ML \<open> |
55596 | 174 |
val the_tactics = |
175 |
if test_all @{context} then [] |
|
176 |
else |
|
177 |
map (fn prob_name => |
|
178 |
TPTP_Reconstruct.naive_reconstruct_tacs interpret_leo2_inference (* auto_based_reconstruction_tac *) (* oracle_based_reconstruction_tac *) prob_name @{context}) |
|
179 |
prob_names; |
|
63167 | 180 |
\<close> |
55596 | 181 |
|
56281 | 182 |
declare [[ML_print_depth = 2000]] |
183 |
||
63167 | 184 |
ML \<open> |
55596 | 185 |
the_tactics |
186 |
|> map (filter (fn (_, _, x) => is_none x) |
|
187 |
#> map (fn (x, SOME y, _) => (x, cterm_of @{theory} y))) |
|
63167 | 188 |
\<close> |
55596 | 189 |
|
190 |
||
191 |
section "Using metadata and tactics" |
|
192 |
text "There are various ways of testing the two ways (whole tactics or lists of tactics) of representing 'reconstructors'." |
|
193 |
||
194 |
||
195 |
subsection "The 'one shot' tactic approach" |
|
196 |
text "First we test whole tactics." |
|
63167 | 197 |
ML \<open> |
55596 | 198 |
(*produce thm*) |
199 |
if test_all @{context} then [] |
|
200 |
else |
|
201 |
map ( |
|
202 |
(* try *) (TPTP_Reconstruct.reconstruct @{context} |
|
203 |
(fn prob_name => |
|
204 |
TPTP_Reconstruct.naive_reconstruct_tac @{context} interpret_leo2_inference prob_name |
|
205 |
(* oracle_based_reconstruction_tac *)))) |
|
206 |
prob_names |
|
63167 | 207 |
\<close> |
55596 | 208 |
|
209 |
||
210 |
subsection "The 'piecemeal' approach" |
|
63167 | 211 |
ML \<open> |
55596 | 212 |
fun attac n = List.nth (List.nth (the_tactics, 0), n) |> #3 |> the |> snd |
213 |
fun attac_to n 0 = attac n |
|
214 |
| attac_to n m = attac n THEN attac_to (n + 1) (m - 1) |
|
215 |
fun shotac n = List.nth (List.nth (the_tactics, 0), n) |> #3 |> the |> fst |
|
63167 | 216 |
\<close> |
55596 | 217 |
|
63167 | 218 |
ML \<open> |
55596 | 219 |
(*Given a list of reconstructed inferences (as in "the_tactics" above, |
220 |
count the number of failures and successes, and list the failed inference |
|
221 |
reconstructions.*) |
|
222 |
fun evaluate_the_tactics [] acc = acc |
|
223 |
| evaluate_the_tactics ((node_no, (inf_name, inf_fmla, NONE)) :: xs) ((fai, suc), inf_list) = |
|
224 |
let |
|
225 |
val score = (fai + 1, suc) |
|
226 |
val index_info = get_index (fn (x, _) => if x = node_no then SOME true else NONE) inf_list |
|
227 |
val inf_list' = |
|
228 |
case index_info of |
|
229 |
NONE => (node_no, (inf_name, inf_fmla, 1)) :: inf_list |
|
230 |
| SOME (idx, _) => |
|
231 |
nth_map idx (fn (node_no, (inf_name, inf_fmla, count)) => (node_no, (inf_name, inf_fmla, count + 1))) inf_list |
|
232 |
in |
|
233 |
evaluate_the_tactics xs (score, inf_list') |
|
234 |
end |
|
235 |
| evaluate_the_tactics ((_, (_, _, SOME _)) :: xs) ((fai, suc), inf_list) = |
|
236 |
evaluate_the_tactics xs ((fai, suc + 1), inf_list) |
|
63167 | 237 |
\<close> |
55596 | 238 |
|
239 |
||
240 |
text "Now we build a tactic by combining lists of tactics" |
|
63167 | 241 |
ML \<open> |
55596 | 242 |
(*given a list of tactics to be applied in sequence (i.e., they |
243 |
follow a skeleton), we build a single tactic, interleaving |
|
244 |
some tracing info to help with debugging.*) |
|
57772
7d9134b032b2
updated application of print_tac to take context parameter;
sultana
parents:
56467
diff
changeset
|
245 |
fun step_by_step_tacs ctxt verbose (thm_tacs : (thm * tactic) list) : tactic = |
55596 | 246 |
let |
247 |
fun interleave_tacs [] [] = all_tac |
|
248 |
| interleave_tacs (tac1 :: tacs1) (tac2 :: tacs2) = |
|
249 |
EVERY [tac1, tac2] |
|
250 |
THEN interleave_tacs tacs1 tacs2 |
|
251 |
val thms_to_traceprint = |
|
252 |
map (fn thm => fn st => |
|
253 |
(*FIXME uses makestring*) |
|
60925 | 254 |
print_tac ctxt (@{make_string} thm) st) |
55596 | 255 |
|
256 |
in |
|
257 |
if verbose then |
|
258 |
ListPair.unzip thm_tacs |
|
259 |
|> apfst (fn thms => enumerate 1 thms) |
|
260 |
|> apfst thms_to_traceprint |
|
261 |
|> uncurry interleave_tacs |
|
262 |
else EVERY (map #2 thm_tacs) |
|
263 |
end |
|
63167 | 264 |
\<close> |
55596 | 265 |
|
63167 | 266 |
ML \<open> |
55596 | 267 |
(*apply step_by_step_tacs to all problems under test*) |
57772
7d9134b032b2
updated application of print_tac to take context parameter;
sultana
parents:
56467
diff
changeset
|
268 |
fun narrated_tactics ctxt = |
55596 | 269 |
map (map (#3 #> the) |
57772
7d9134b032b2
updated application of print_tac to take context parameter;
sultana
parents:
56467
diff
changeset
|
270 |
#> step_by_step_tacs ctxt false) |
55596 | 271 |
the_tactics; |
272 |
||
273 |
(*produce thm*) |
|
274 |
(*use narrated_tactics to reconstruct all problems under test*) |
|
275 |
if test_all @{context} then [] |
|
276 |
else |
|
277 |
map (fn (prob_name, tac) => |
|
278 |
TPTP_Reconstruct.reconstruct @{context} |
|
279 |
(fn _ => tac) prob_name) |
|
57772
7d9134b032b2
updated application of print_tac to take context parameter;
sultana
parents:
56467
diff
changeset
|
280 |
(ListPair.zip (prob_names, narrated_tactics @{context})) |
63167 | 281 |
\<close> |
55596 | 282 |
|
283 |
||
284 |
subsection "Manually using 'piecemeal' approach" |
|
285 |
text "Another testing possibility involves manually creating a lemma |
|
286 |
and running through the list of tactics generating to prove that lemma. The following code shows the goal of each problem under test, and then for each problem returns the list of tactics which can be invoked individually as shown below." |
|
63167 | 287 |
ML \<open> |
55596 | 288 |
fun show_goal ctxt prob_name = |
289 |
let |
|
290 |
val thy = Proof_Context.theory_of ctxt |
|
291 |
val pannot = TPTP_Reconstruct.get_pannot_of_prob thy prob_name |
|
292 |
in |
|
293 |
#meta pannot |
|
58412 | 294 |
|> filter (fn (_, info) => |
55596 | 295 |
#role info = TPTP_Syntax.Role_Conjecture) |
296 |
|> hd |
|
297 |
|> snd |> #fmla |
|
298 |
|> cterm_of thy |
|
299 |
end; |
|
300 |
||
301 |
if test_all @{context} then [] |
|
302 |
else |
|
303 |
map (show_goal @{context}) prob_names; |
|
63167 | 304 |
\<close> |
55596 | 305 |
|
63167 | 306 |
ML \<open> |
55596 | 307 |
(*project out the list of tactics from "the_tactics"*) |
308 |
val just_the_tacs = |
|
309 |
map (map (#3 #> the #> #2)) |
|
310 |
the_tactics; |
|
311 |
||
312 |
map length just_the_tacs |
|
63167 | 313 |
\<close> |
55596 | 314 |
|
63167 | 315 |
ML \<open> |
55596 | 316 |
(*like just_the_tacs, but extract the thms, to inspect their thys*) |
317 |
val just_the_thms = |
|
318 |
map (map (#3 #> the #> #1)) |
|
319 |
the_tactics; |
|
320 |
||
321 |
map length just_the_thms; |
|
63167 | 322 |
\<close> |
55596 | 323 |
|
63167 | 324 |
ML \<open> |
55596 | 325 |
(*Show the skeleton-level inference which is done by each element of just_the_tacs. This is useful when debugging using the technique shown next*) |
326 |
if test_all @{context} orelse prob_names = [] then () |
|
327 |
else |
|
328 |
the_tactics |
|
329 |
|> hd |
|
330 |
|> map #1 |
|
331 |
|> TPTP_Reconstruct_Library.enumerate 0 |
|
60925 | 332 |
|> List.app (@{make_string} #> writeln) |
63167 | 333 |
\<close> |
55596 | 334 |
|
63167 | 335 |
ML \<open> |
60649
e007aa6a8aa2
more explicit use of context and elimination of Thm.theory_of_thm, although unclear (and untested?) situations remain;
wenzelm
parents:
59720
diff
changeset
|
336 |
fun leo2_tac_wrap ctxt prob_name step i st = |
e007aa6a8aa2
more explicit use of context and elimination of Thm.theory_of_thm, although unclear (and untested?) situations remain;
wenzelm
parents:
59720
diff
changeset
|
337 |
rtac (interpret_leo2_inference ctxt prob_name step) i st |
63167 | 338 |
\<close> |
55596 | 339 |
|
340 |
(*FIXME move these examples elsewhere*) |
|
341 |
(* |
|
61076 | 342 |
lemma "\<forall>(Xj::TPTP_Interpret.ind) Xk::TPTP_Interpret.ind. |
55596 | 343 |
bnd_cCKB6_BLACK Xj Xk \<longrightarrow> |
344 |
bnd_cCKB6_BLACK (bnd_s (bnd_s (bnd_s Xj))) (bnd_s Xk)" |
|
345 |
apply (tactic {*nth (nth just_the_tacs 0) 0*}) |
|
346 |
apply (tactic {*nth (nth just_the_tacs 0) 1*}) |
|
347 |
apply (tactic {*nth (nth just_the_tacs 0) 2*}) |
|
348 |
apply (tactic {*nth (nth just_the_tacs 0) 3*}) |
|
349 |
apply (tactic {*nth (nth just_the_tacs 0) 4*}) |
|
350 |
apply (tactic {*nth (nth just_the_tacs 0) 5*}) |
|
351 |
ML_prf "nth (hd the_tactics) 6" |
|
352 |
apply (tactic {*nth (nth just_the_tacs 0) 6*}) |
|
353 |
apply (tactic {*nth (nth just_the_tacs 0) 7*}) |
|
354 |
apply (tactic {*nth (nth just_the_tacs 0) 8*}) |
|
355 |
apply (tactic {*nth (nth just_the_tacs 0) 9*}) |
|
356 |
apply (tactic {*nth (nth just_the_tacs 0) 10*}) |
|
357 |
apply (tactic {*nth (nth just_the_tacs 0) 11*}) |
|
358 |
apply (tactic {*nth (nth just_the_tacs 0) 12*}) |
|
359 |
apply (tactic {*nth (nth just_the_tacs 0) 13*}) |
|
360 |
apply (tactic {*nth (nth just_the_tacs 0) 14*}) |
|
361 |
apply (tactic {*nth (nth just_the_tacs 0) 15*}) |
|
362 |
||
363 |
apply (tactic {*nth (nth just_the_tacs 0) 16*}) |
|
364 |
||
365 |
(* |
|
366 |
apply (tactic {* |
|
367 |
rtac (interpret_leo2_inference @{context} (hd prob_names) "8") 1 |
|
368 |
*}) |
|
369 |
apply (tactic {* |
|
370 |
rtac (interpret_leo2_inference @{context} (hd prob_names) "7") 1 |
|
371 |
*}) |
|
372 |
apply (tactic {* |
|
373 |
rtac (interpret_leo2_inference @{context} (hd prob_names) "6") 1 |
|
374 |
*}) |
|
375 |
(* |
|
376 |
apply (tactic {* |
|
377 |
rtac (interpret_leo2_inference @{context} (hd prob_names) "4") 1 |
|
378 |
*}) |
|
379 |
*) |
|
380 |
*) |
|
381 |
||
382 |
apply (tactic {*nth (nth just_the_tacs 0) 17*}) |
|
383 |
apply (tactic {*nth (nth just_the_tacs 0) 18*}) |
|
384 |
apply (tactic {*nth (nth just_the_tacs 0) 19*}) |
|
385 |
apply (tactic {*nth (nth just_the_tacs 0) 20*}) |
|
386 |
apply (tactic {*nth (nth just_the_tacs 0) 21*}) |
|
387 |
||
388 |
ML_prf "nth (hd the_tactics) 21" |
|
389 |
ML_prf "nth (hd the_tactics) 22" |
|
390 |
||
391 |
apply (tactic {*nth (nth just_the_tacs 0) 22*}) |
|
392 |
apply (tactic {*nth (nth just_the_tacs 0) 23*}) |
|
393 |
apply (tactic {*nth (nth just_the_tacs 0) 24*}) |
|
394 |
apply (tactic {*nth (nth just_the_tacs 0) 25*}) |
|
395 |
||
396 |
||
397 |
ML_prf "nth (hd the_tactics) 19" |
|
398 |
||
399 |
apply (tactic {* |
|
400 |
interpret_leo2_inference_wrap (hd prob_names) "8" 1 |
|
401 |
*}) |
|
402 |
apply (tactic {* |
|
403 |
interpret_leo2_inference_wrap (hd prob_names) "7" 1 |
|
404 |
*}) |
|
405 |
apply (tactic {* |
|
406 |
interpret_leo2_inference_wrap (hd prob_names) "6" 1 |
|
407 |
*}) |
|
408 |
apply (tactic {* |
|
409 |
interpret_leo2_inference_wrap (hd prob_names) "4" 1 |
|
410 |
*}) |
|
411 |
||
412 |
ML_prf "nth (hd the_tactics) 20" |
|
413 |
ML_prf "nth (hd the_tactics) 21" |
|
414 |
ML_prf "nth (hd the_tactics) 22" |
|
415 |
*) |
|
416 |
||
417 |
(* |
|
418 |
lemma "bnd_powersetE1 \<longrightarrow> |
|
419 |
bnd_sepInPowerset \<longrightarrow> |
|
420 |
(\<forall>A Xphi. bnd_subset (bnd_dsetconstr A Xphi) A)" |
|
421 |
apply (tactic {*nth (nth just_the_tacs 0) 0*}) |
|
422 |
apply (tactic {*nth (nth just_the_tacs 0) 1*}) |
|
423 |
apply (tactic {*nth (nth just_the_tacs 0) 2*}) |
|
424 |
apply (tactic {*nth (nth just_the_tacs 0) 3*}) |
|
425 |
apply (tactic {*nth (nth just_the_tacs 0) 4*}) |
|
426 |
apply (tactic {*nth (nth just_the_tacs 0) 5*}) |
|
427 |
ML_prf "nth (hd the_tactics) 6" |
|
428 |
apply (tactic {*nth (nth just_the_tacs 0) 6*}) |
|
429 |
apply (tactic {*nth (nth just_the_tacs 0) 7*}) |
|
430 |
apply (tactic {*nth (nth just_the_tacs 0) 8*}) |
|
431 |
apply (tactic {*nth (nth just_the_tacs 0) 9*}) |
|
432 |
apply (tactic {*nth (nth just_the_tacs 0) 10*}) |
|
433 |
apply (tactic {*nth (nth just_the_tacs 0) 11*}) |
|
434 |
apply (tactic {*nth (nth just_the_tacs 0) 12*}) |
|
435 |
apply (tactic {*nth (nth just_the_tacs 0) 13*}) |
|
436 |
apply (tactic {*nth (nth just_the_tacs 0) 14*}) |
|
437 |
apply (tactic {*nth (nth just_the_tacs 0) 15*}) |
|
438 |
apply (tactic {*nth (nth just_the_tacs 0) 16*}) |
|
439 |
apply (tactic {*nth (nth just_the_tacs 0) 17*}) |
|
440 |
apply (tactic {*nth (nth just_the_tacs 0) 18*}) |
|
441 |
apply (tactic {*nth (nth just_the_tacs 0) 19*}) |
|
442 |
apply (tactic {*nth (nth just_the_tacs 0) 20*}) |
|
443 |
apply (tactic {*nth (nth just_the_tacs 0) 21*}) |
|
444 |
apply (tactic {*nth (nth just_the_tacs 0) 22*}) |
|
445 |
apply (tactic {*nth (nth just_the_tacs 0) 23*}) |
|
446 |
apply (tactic {*nth (nth just_the_tacs 0) 24*}) |
|
447 |
apply (tactic {*nth (nth just_the_tacs 0) 25*}) |
|
448 |
(* apply (tactic {*nth (nth just_the_tacs 0) 26*}) *) |
|
449 |
ML_prf "nth (hd the_tactics) 26" |
|
450 |
apply (subgoal_tac "(\<not> (\<forall>A Xphi. bnd_subset (bnd_dsetconstr A Xphi) A)) = |
|
451 |
True \<Longrightarrow> |
|
452 |
(\<not> bnd_subset (bnd_dsetconstr bnd_sK1 bnd_sK2) bnd_sK1) = |
|
453 |
True") |
|
454 |
prefer 2 |
|
455 |
apply (thin_tac "(bnd_powersetE1 \<longrightarrow> |
|
456 |
bnd_sepInPowerset \<longrightarrow> |
|
457 |
(\<forall>A Xphi. bnd_subset (bnd_dsetconstr A Xphi) A)) = |
|
458 |
False") |
|
459 |
apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*}) |
|
460 |
apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*}) |
|
461 |
apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*}) |
|
462 |
apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*}) |
|
463 |
||
464 |
apply simp |
|
465 |
||
466 |
(* apply (tactic {*nth (nth just_the_tacs 0) 26*}) *) |
|
467 |
apply (tactic {*nth (nth just_the_tacs 0) 27*}) |
|
468 |
apply (tactic {*nth (nth just_the_tacs 0) 28*}) |
|
469 |
apply (tactic {*nth (nth just_the_tacs 0) 29*}) |
|
470 |
apply (tactic {*nth (nth just_the_tacs 0) 30*}) |
|
471 |
apply (tactic {*nth (nth just_the_tacs 0) 31*}) |
|
472 |
apply (tactic {*nth (nth just_the_tacs 0) 32*}) |
|
473 |
apply (tactic {*nth (nth just_the_tacs 0) 33*}) |
|
474 |
apply (tactic {*nth (nth just_the_tacs 0) 34*}) |
|
475 |
apply (tactic {*nth (nth just_the_tacs 0) 35*}) |
|
476 |
apply (tactic {*nth (nth just_the_tacs 0) 36*}) |
|
477 |
apply (tactic {*nth (nth just_the_tacs 0) 37*}) |
|
478 |
apply (tactic {*nth (nth just_the_tacs 0) 38*}) |
|
479 |
apply (tactic {*nth (nth just_the_tacs 0) 39*}) |
|
480 |
apply (tactic {*nth (nth just_the_tacs 0) 40*}) |
|
481 |
apply (tactic {*nth (nth just_the_tacs 0) 41*}) |
|
482 |
apply (tactic {*nth (nth just_the_tacs 0) 42*}) |
|
483 |
apply (tactic {*nth (nth just_the_tacs 0) 43*}) |
|
484 |
apply (tactic {*nth (nth just_the_tacs 0) 44*}) |
|
485 |
apply (tactic {*nth (nth just_the_tacs 0) 45*}) |
|
486 |
apply (tactic {*nth (nth just_the_tacs 0) 46*}) |
|
487 |
apply (tactic {*nth (nth just_the_tacs 0) 47*}) |
|
488 |
apply (tactic {*nth (nth just_the_tacs 0) 48*}) |
|
489 |
apply (tactic {*nth (nth just_the_tacs 0) 49*}) |
|
490 |
apply (tactic {*nth (nth just_the_tacs 0) 50*}) |
|
491 |
apply (tactic {*nth (nth just_the_tacs 0) 51*}) |
|
492 |
done |
|
493 |
*) |
|
494 |
||
495 |
(* |
|
496 |
We can use just_the_tacs as follows: |
|
497 |
||
498 |
(this is from SEV012^5.p.out) |
|
499 |
lemma "((\<forall>(Xx :: bool) (Xy :: bool). True \<longrightarrow> True) \<and> |
|
500 |
(\<forall>(Xx :: bool) (Xy :: bool) (Xz :: bool). True \<and> True \<longrightarrow> True)) \<and> |
|
501 |
(\<lambda>(Xx :: bool) (Xy :: bool). True) = (\<lambda>Xx Xy. True)" |
|
502 |
apply (tactic {*nth (nth just_the_tacs 0) 0*}) |
|
503 |
apply (tactic {*nth (nth just_the_tacs 0) 1*}) |
|
504 |
apply (tactic {*nth (nth just_the_tacs 0) 2*}) |
|
505 |
apply (tactic {*nth (nth just_the_tacs 0) 3*}) |
|
506 |
apply (tactic {*nth (nth just_the_tacs 0) 4*}) |
|
507 |
apply (tactic {*nth (nth just_the_tacs 0) 5*}) |
|
508 |
ML_prf "nth (hd the_tactics) 6" |
|
509 |
apply (tactic {*nth (nth just_the_tacs 0) 6*}) |
|
510 |
apply (tactic {*nth (nth just_the_tacs 0) 7*}) |
|
511 |
apply (tactic {*nth (nth just_the_tacs 0) 8*}) |
|
512 |
apply (tactic {*nth (nth just_the_tacs 0) 9*}) |
|
513 |
apply (tactic {*nth (nth just_the_tacs 0) 10*}) |
|
514 |
apply (tactic {*nth (nth just_the_tacs 0) 11*}) |
|
515 |
apply (tactic {*nth (nth just_the_tacs 0) 12*}) |
|
516 |
apply (tactic {*nth (nth just_the_tacs 0) 13*}) |
|
517 |
apply (tactic {*nth (nth just_the_tacs 0) 14*}) |
|
518 |
apply (tactic {*nth (nth just_the_tacs 0) 15*}) |
|
519 |
apply (tactic {*nth (nth just_the_tacs 0) 16*}) |
|
520 |
apply (tactic {*nth (nth just_the_tacs 0) 17*}) |
|
521 |
apply (tactic {*nth (nth just_the_tacs 0) 18*}) |
|
522 |
apply (tactic {*nth (nth just_the_tacs 0) 19*}) |
|
523 |
apply (tactic {*nth (nth just_the_tacs 0) 20*}) |
|
524 |
apply (tactic {*nth (nth just_the_tacs 0) 21*}) |
|
525 |
apply (tactic {*nth (nth just_the_tacs 0) 22*}) |
|
526 |
done |
|
527 |
||
528 |
(* |
|
529 |
We could also use previous definitions directly, |
|
530 |
e.g. the following should prove the goal at a go: |
|
531 |
- apply (tactic {*narrated_tactics |> hd |> hd*}) |
|
532 |
- apply (tactic {* |
|
533 |
TPTP_Reconstruct.naive_reconstruct_tac |
|
534 |
interpret_leo2_inference |
|
535 |
(hd prob_names) |
|
536 |
@{context}*}) |
|
537 |
(Note that the previous two methods don't work in this |
|
538 |
"lemma" testing mode, not sure why. The previous methods |
|
539 |
(producing the thm values directly) should work though.) |
|
540 |
*) |
|
541 |
*) |
|
542 |
||
543 |
||
544 |
section "Testing against benchmark" |
|
545 |
||
63167 | 546 |
ML \<open> |
55596 | 547 |
(*if reconstruction_info value is NONE then a big error must have occurred*) |
548 |
type reconstruction_info = |
|
549 |
((int(*no of failures*) * int(*no of successes*)) * |
|
550 |
(TPTP_Reconstruct.rolling_stock * term option(*inference formula*) * int (*number of times the inference occurs in the skeleton*)) list) option |
|
551 |
||
552 |
datatype proof_contents = |
|
553 |
No_info |
|
554 |
| Empty |
|
555 |
| Nonempty of reconstruction_info |
|
556 |
||
557 |
(*To make output less cluttered in whole-run tests*) |
|
558 |
fun erase_inference_fmlas (Nonempty (SOME (outline, inf_info))) = |
|
559 |
Nonempty (SOME (outline, map (fn (inf_name, _, count) => (inf_name, NONE, count)) inf_info)) |
|
560 |
| erase_inference_fmlas x = x |
|
63167 | 561 |
\<close> |
55596 | 562 |
|
63167 | 563 |
ML \<open> |
55596 | 564 |
(*Report on how many inferences in a proof are reconstructed, and give some |
565 |
info about the inferences for which reconstruction failed.*) |
|
566 |
fun test_partial_reconstruction thy prob_file = |
|
567 |
let |
|
568 |
val prob_name = |
|
65999 | 569 |
prob_file |
69366 | 570 |
|> Path.file_name |
65999 | 571 |
|> TPTP_Problem_Name.Nonstandard |
55596 | 572 |
|
573 |
val thy' = |
|
574 |
try |
|
575 |
(TPTP_Reconstruct.import_thm |
|
576 |
true |
|
577 |
[Path.dir prob_file, Path.explode "$TPTP"] |
|
578 |
prob_file leo2_on_load) |
|
579 |
thy |
|
580 |
||
581 |
val ctxt' = |
|
582 |
if is_some thy' then SOME (Proof_Context.init_global (the thy')) else NONE |
|
583 |
||
584 |
(*to test if proof is empty*) |
|
585 |
val fms = |
|
586 |
if is_some thy' |
|
587 |
then SOME (TPTP_Reconstruct.get_fmlas_of_prob (the thy') prob_name) |
|
588 |
else NONE |
|
589 |
||
590 |
val the_tactics = |
|
591 |
if is_some thy' then |
|
592 |
SOME (TPTP_Reconstruct.naive_reconstruct_tacs (* metis_based_reconstruction_tac *) |
|
593 |
interpret_leo2_inference (* auto_based_reconstruction_tac *) (* oracle_based_reconstruction_tac *) prob_name (the ctxt')) |
|
594 |
else NONE |
|
595 |
||
60925 | 596 |
(* val _ = tracing ("tt=" ^ @{make_string} the_tactics) *) |
55596 | 597 |
|
598 |
val skeleton = |
|
599 |
if is_some thy' then |
|
600 |
SOME (TPTP_Reconstruct.make_skeleton (the ctxt') |
|
601 |
(TPTP_Reconstruct.get_pannot_of_prob (the thy') prob_name)) |
|
602 |
else NONE |
|
603 |
||
604 |
val skeleton_and_tactics = |
|
605 |
if is_some thy' then |
|
606 |
SOME (ListPair.zip (the skeleton, the the_tactics)) |
|
607 |
else NONE |
|
608 |
||
609 |
val result = |
|
610 |
if is_some thy' then |
|
611 |
SOME (evaluate_the_tactics (the skeleton_and_tactics) |
|
612 |
((0, 0), [])) |
|
613 |
else NONE |
|
614 |
||
615 |
(*strip node names*) |
|
616 |
val result' = |
|
617 |
if is_some result then SOME (apsnd (map #2) (the result)) else NONE |
|
618 |
in |
|
619 |
if is_some fms andalso List.null (the fms) then Empty |
|
620 |
else Nonempty result' |
|
621 |
end |
|
63167 | 622 |
\<close> |
55596 | 623 |
|
63167 | 624 |
ML \<open> |
55596 | 625 |
(*default timeout is 1 min*) |
626 |
fun reconstruct timeout light_output file thy = |
|
627 |
let |
|
628 |
val timer = Timer.startRealTimer () |
|
629 |
in |
|
62519 | 630 |
Timeout.apply (Time.fromSeconds (if timeout = 0 then 60 else timeout)) |
55596 | 631 |
(test_partial_reconstruction thy |
632 |
#> light_output ? erase_inference_fmlas |
|
60925 | 633 |
#> @{make_string} (* FIXME *) |
634 |
#> (fn s => report (Proof_Context.init_global thy) (@{make_string} file ^ " === " ^ s ^ |
|
635 |
" t=" ^ (Timer.checkRealTimer timer |> Time.toMilliseconds |> @{make_string})))) |
|
55596 | 636 |
file |
637 |
end |
|
63167 | 638 |
\<close> |
55596 | 639 |
|
63167 | 640 |
ML \<open> |
55596 | 641 |
(*this version of "reconstruct" builds theorems, instead of lists of reconstructed inferences*) |
642 |
(*default timeout is 1 min*) |
|
643 |
fun reconstruct timeout file thy = |
|
644 |
let |
|
645 |
val timer = Timer.startRealTimer () |
|
646 |
val thy' = |
|
647 |
TPTP_Reconstruct.import_thm true |
|
648 |
[Path.dir file, Path.explode "$TPTP"] |
|
649 |
file leo2_on_load thy |
|
650 |
||
651 |
val ctxt = Proof_Context.init_global thy' (*FIXME pass ctxt instead of thy*) |
|
652 |
val prob_name = |
|
653 |
file |
|
69366 | 654 |
|> Path.file_name |
55596 | 655 |
|> TPTP_Problem_Name.Nonstandard |
656 |
in |
|
62519 | 657 |
Timeout.apply (Time.fromSeconds (if timeout = 0 then 60 else timeout)) |
55596 | 658 |
(fn prob_name => |
659 |
(can |
|
660 |
(TPTP_Reconstruct.reconstruct ctxt (fn prob_name => |
|
661 |
TPTP_Reconstruct.naive_reconstruct_tac ctxt interpret_leo2_inference prob_name (* oracle_based_reconstruction_tac *))) prob_name ) |
|
662 |
|> (fn s => report ctxt (Path.print file ^ " === " ^ Bool.toString s ^ |
|
60925 | 663 |
" t=" ^ (Timer.checkRealTimer timer |> Time.toMilliseconds |> @{make_string})))) |
55596 | 664 |
prob_name |
665 |
end |
|
63167 | 666 |
\<close> |
55596 | 667 |
|
63167 | 668 |
ML \<open> |
55596 | 669 |
fun reconstruction_test timeout ctxt = |
670 |
test_fn ctxt |
|
671 |
(fn file => reconstruct timeout file (Proof_Context.theory_of ctxt)) |
|
672 |
"reconstructor" |
|
673 |
() |
|
63167 | 674 |
\<close> |
55596 | 675 |
|
63167 | 676 |
ML \<open> |
55596 | 677 |
datatype examination_results = |
678 |
Whole_proof of string(*filename*) * proof_contents |
|
679 |
| Specific_rule of string(*filename*) * string(*inference rule*) * term option list |
|
680 |
||
681 |
(*Look out for failures reconstructing a particular inference rule*) |
|
682 |
fun filter_failures inference_name (Whole_proof (filename, results)) = |
|
683 |
let |
|
684 |
val filtered_results = |
|
685 |
case results of |
|
686 |
Nonempty (SOME results') => |
|
687 |
#2 results' |
|
58411 | 688 |
|> maps (fn (stock as TPTP_Reconstruct.Annotated_step (_, inf_name), inf_fmla, _) => |
55596 | 689 |
if inf_name = inference_name then [inf_fmla] else []) |
690 |
| _ => [] |
|
691 |
in Specific_rule (filename, inference_name, filtered_results) end |
|
692 |
||
693 |
(*Returns detailed info about a proof-reconstruction attempt. |
|
694 |
If rule_name is specified then the related failed inferences |
|
695 |
are returned, otherwise all failed inferences are returned.*) |
|
696 |
fun examine_failed_inferences ctxt filename rule_name = |
|
697 |
let |
|
698 |
val thy = Proof_Context.theory_of ctxt |
|
699 |
val prob_file = Path.explode filename |
|
700 |
val results = |
|
701 |
if test_all ctxt then No_info |
|
702 |
else test_partial_reconstruction thy prob_file |
|
703 |
in |
|
704 |
Whole_proof (filename, results) |
|
705 |
|> is_some rule_name ? (fn x => |
|
706 |
filter_failures (the rule_name) x) |
|
707 |
end |
|
63167 | 708 |
\<close> |
55596 | 709 |
|
63167 | 710 |
ML \<open> |
55596 | 711 |
exception NONSENSE |
712 |
||
713 |
fun annotation_or_id (TPTP_Reconstruct.Step n) = n |
|
714 |
| annotation_or_id (TPTP_Reconstruct.Annotated_step (n, anno)) = anno |
|
715 |
| annotation_or_id TPTP_Reconstruct.Assumed = "assumption" |
|
716 |
| annotation_or_id TPTP_Reconstruct.Unconjoin = "conjI" |
|
717 |
| annotation_or_id TPTP_Reconstruct.Caboose = "(end)" |
|
718 |
| annotation_or_id (TPTP_Reconstruct.Synth_step s) = s |
|
719 |
| annotation_or_id (TPTP_Reconstruct.Split (split_node, soln_node, _)) = "split_at " ^ split_node ^ " " ^ soln_node; |
|
720 |
||
721 |
fun count_failures (Whole_proof (_, No_info)) = raise NONSENSE |
|
722 |
| count_failures (Whole_proof (_, Empty)) = raise NONSENSE |
|
723 |
| count_failures (Whole_proof (_, Nonempty NONE)) = raise NONSENSE |
|
724 |
| count_failures (Whole_proof (_, Nonempty (SOME (((n, _), _))))) = n |
|
725 |
| count_failures (Specific_rule (_, _, t)) = length t |
|
726 |
||
727 |
fun pre_classify_failures [] alist = alist |
|
728 |
| pre_classify_failures ((stock, _, _) :: xs) alist = |
|
729 |
let |
|
730 |
val inf = annotation_or_id stock |
|
67399 | 731 |
val count = AList.lookup (=) alist inf |
55596 | 732 |
in |
733 |
if is_none count |
|
734 |
then pre_classify_failures xs ((inf, 1) :: alist) |
|
735 |
else |
|
736 |
pre_classify_failures xs |
|
67399 | 737 |
(AList.update (=) (inf, the count + 1) alist) |
55596 | 738 |
end |
739 |
||
740 |
fun classify_failures (Whole_proof (_, Nonempty (SOME (((_, _), inferences))))) = pre_classify_failures inferences [] |
|
741 |
| classify_failures (Specific_rule (_, rule, t)) = [(rule, length t)] |
|
742 |
| classify_failures _ = raise NONSENSE |
|
63167 | 743 |
\<close> |
55596 | 744 |
|
63167 | 745 |
ML \<open> |
55596 | 746 |
val regressions = map (fn s => "$THF_PROOFS/" ^ s) |
747 |
["SEV405^5.p.out", |
|
748 |
(*"SYO377^5.p.out", Always seems to raise Interrupt on my laptop -- probably because node 475 has lots of premises*) |
|
749 |
"PUZ031^5.p.out", |
|
750 |
"ALG001^5.p.out", |
|
751 |
"SYO238^5.p.out", |
|
752 |
(*"SEV158^5.p.out", This is big*) |
|
753 |
"SYO285^5.p.out", |
|
754 |
"../SYO285^5.p.out_reduced", |
|
755 |
(* "SYO225^5.p.out", This is big*) |
|
756 |
"SYO291^5.p.out", |
|
757 |
"SET669^3.p.out", |
|
758 |
"SEV233^5.p.out", |
|
759 |
(*"SEU511^1.p.out", This is big*) |
|
760 |
"SEV161^5.p.out", |
|
761 |
"SEV012^5.p.out", |
|
762 |
"SYO035^1.p.out", |
|
763 |
"SYO291^5.p.out", |
|
764 |
"SET741^4.p.out", (*involves both definitions and contorted splitting. has nice graph.*) |
|
765 |
"SEU548^2.p.out", |
|
766 |
"SEU513^2.p.out", |
|
767 |
"SYO006^1.p.out", |
|
768 |
"SYO371^5.p.out" (*has contorted splitting, like SYO006^1.p.out, but doesn't involve definitions*) |
|
769 |
] |
|
63167 | 770 |
\<close> |
55596 | 771 |
|
63167 | 772 |
ML \<open> |
55596 | 773 |
val experiment = examine_failed_inferences @{context} |
774 |
(List.last regressions) NONE; |
|
775 |
||
776 |
(* |
|
777 |
val experiment_focus = |
|
778 |
filter_failures "extcnf_combined" experiment; |
|
779 |
*) |
|
780 |
||
781 |
(* |
|
782 |
count_failures experiment_focus |
|
783 |
classify_failures experiment |
|
784 |
*) |
|
63167 | 785 |
\<close> |
55596 | 786 |
|
787 |
text "Run reconstruction on all problems in a benchmark (provided via a script) |
|
788 |
and report on partial success." |
|
789 |
||
790 |
declare [[ |
|
791 |
tptp_test_all = true, |
|
792 |
tptp_test_timeout = 10 |
|
793 |
]] |
|
794 |
||
63167 | 795 |
ML \<open> |
55596 | 796 |
(*problem source*) |
797 |
val tptp_probs_dir = |
|
798 |
Path.explode "$THF_PROOFS" |
|
799 |
|> Path.expand; |
|
63167 | 800 |
\<close> |
55596 | 801 |
|
63167 | 802 |
ML \<open> |
55596 | 803 |
if test_all @{context} then |
804 |
(report @{context} "Reconstructing proofs"; |
|
805 |
S timed_test (reconstruction_test (get_timeout @{context})) @{context} (TPTP_Syntax.get_file_list tptp_probs_dir)) |
|
806 |
else () |
|
63167 | 807 |
\<close> |
55596 | 808 |
|
809 |
(* |
|
810 |
Debugging strategy: |
|
811 |
1) get list of all proofs |
|
812 |
2) order by size |
|
813 |
3) try to construct each in turn, given some timeout |
|
814 |
||
815 |
Use this to find the smallest failure, then debug that. |
|
816 |
*) |
|
62390 | 817 |
end |