src/HOL/TPTP/TPTP_Proof_Reconstruction_Test.thy
author wenzelm
Sun Sep 18 20:33:48 2016 +0200 (2016-09-18)
changeset 63915 bab633745c7f
parent 63167 0909deb8059b
child 65999 ee4cf96a9406
permissions -rw-r--r--
tuned proofs;
     1 (*  Title:      HOL/TPTP/TPTP_Proof_Reconstruction_Test.thy
     2     Author:     Nik Sultana, Cambridge University Computer Laboratory
     3 
     4 Various tests for the proof reconstruction module.
     5 
     6 NOTE
     7   - looks for THF proofs in the path indicated by $THF_PROOFS
     8   - these proofs are generated using LEO-II with the following
     9     configuration choices: -po 1 -nux -nuc --expand_extuni
    10     You can simply filter LEO-II's output using the filter_proof
    11     script which is distributed with LEO-II.
    12 *)
    13 
    14 theory TPTP_Proof_Reconstruction_Test
    15 imports TPTP_Test TPTP_Proof_Reconstruction
    16 begin
    17 
    18 section "Main function"
    19 text "This function wraps all the reconstruction-related functions. Simply
    20  point it at a THF proof produced by LEO-II (using the configuration described
    21  above), and it will try to reconstruct it into an Isabelle/HOL theorem.
    22  It also returns the theory (into which the axioms and definitions used in the
    23  proof have been imported), but note that you can get the theory value from
    24  the theorem value."
    25 
    26 ML \<open>
    27   reconstruct_leo2 (Path.explode "$THF_PROOFS/NUM667^1.p.out") @{theory}
    28 \<close>
    29 
    30 section "Prep for testing the component functions"
    31 
    32 declare [[
    33   tptp_trace_reconstruction = false,
    34   tptp_test_all = false,
    35   (* tptp_test_all = true, *)
    36   tptp_test_timeout = 30,
    37   (* tptp_max_term_size = 200 *)
    38   tptp_max_term_size = 0
    39 ]]
    40 
    41 declare [[ML_exception_trace, ML_print_depth = 200]]
    42 
    43 
    44 section "Importing proofs"
    45 
    46 ML \<open>
    47 val probs =
    48   (* "$THF_PROOFS/SYN991^1.p.out" *) (*lacks conjecture*)
    49   (* "$THF_PROOFS/SYO040^2.p.out" *)
    50   (* "$THF_PROOFS/NUM640^1.p.out" *)
    51   (* "$THF_PROOFS/SEU553^2.p.out" *)
    52   (* "$THF_PROOFS/NUM665^1.p.out" *)
    53   (* "$THF_PROOFS/SEV161^5.p.out" *)
    54   (* "$THF_PROOFS/SET014^4.p.out" *)
    55   "$THF_PROOFS/NUM667^1.p.out"
    56   |> Path.explode
    57   |> single
    58 
    59 val prob_names =
    60   probs
    61   |> map (Path.base #> Path.implode #> TPTP_Problem_Name.Nonstandard)
    62 \<close>
    63 
    64 setup \<open>
    65   if test_all @{context} then I
    66   else
    67     fold
    68      (fn path =>
    69        TPTP_Reconstruct.import_thm true [Path.dir path, Path.explode "$THF_PROOFS"] path leo2_on_load)
    70      probs
    71 \<close>
    72 
    73 text "Display nicely."
    74 ML \<open>
    75 fun display_nicely ctxt (fms : TPTP_Reconstruct.formula_meaning list) =
    76   List.app (fn ((n, data) : TPTP_Reconstruct.formula_meaning) =>
    77     Pretty.writeln
    78       (Pretty.block
    79         [Pretty.str (n ^ " "),
    80          Syntax.pretty_term ctxt (#fmla data),
    81          Pretty.str (
    82           if is_none (#source_inf_opt data) then ""
    83           else ("\n\tannotation: " ^
    84            @{make_string} (the (#source_inf_opt data : TPTP_Proof.source_info option))))])
    85     ) (rev fms);
    86 
    87 (*FIXME hack for testing*)
    88 fun test_fmla thy =
    89   TPTP_Reconstruct.get_fmlas_of_prob thy (hd prob_names);
    90 
    91 fun test_pannot thy =
    92   TPTP_Reconstruct.get_pannot_of_prob thy (hd prob_names);
    93 
    94 if test_all @{context} orelse prob_names = [] then ()
    95 else
    96   display_nicely @{context}
    97   (#meta (test_pannot @{theory}))
    98 (* To look at the original proof (i.e., before the proof transformations applied
    99    when the proof is loaded) replace previous line with:
   100    (test_fmla @{theory}
   101     |> map TPTP_Reconstruct.structure_fmla_meaning)
   102 *)
   103 \<close>
   104 
   105 ML \<open>
   106 fun step_range_tester f_x f_exn ctxt prob_name from until =
   107   let
   108     val max =
   109       case until of
   110           SOME x => x
   111         | NONE =>
   112             if is_some Int.maxInt then the Int.maxInt else 999999
   113     fun test_step x =
   114       if x > max then ()
   115       else
   116         (f_x x;
   117          (interpret_leo2_inference ctxt prob_name (Int.toString x); ())
   118          handle e => f_exn e; (*FIXME naive. should let Interrupt through*)
   119          (*assumes that inferences are numbered consecutively*)
   120          test_step (x + 1))
   121   in
   122     test_step from
   123   end
   124 
   125 val step_range_tester_tracing =
   126   step_range_tester
   127    (fn x => tracing ("@step " ^ Int.toString x))
   128    (fn e => tracing ("!!" ^ @{make_string} e))
   129 \<close>
   130 
   131 ML \<open>
   132 (*try to reconstruct each inference step*)
   133 if test_all @{context} orelse prob_names = []
   134 orelse true (*NOTE currently disabled*)
   135 then ()
   136 else
   137   let
   138     (*FIXME not guaranteed to be the right nodes*)
   139     val heur_start = 3
   140     val heur_end =
   141       hd (#meta (test_pannot @{theory}))
   142       |> #1
   143       |> Int.fromString
   144   in
   145     step_range_tester_tracing @{context} (hd prob_names) heur_start heur_end
   146   end
   147 \<close>
   148 
   149 
   150 section "Building metadata and tactics"
   151 
   152 subsection "Building the skeleton"
   153 ML \<open>
   154 if test_all @{context} orelse prob_names = [] then []
   155 else TPTP_Reconstruct.make_skeleton @{context} (test_pannot @{theory});
   156 
   157 length it
   158 \<close>
   159 
   160 
   161 subsection "The 'one shot' tactic approach"
   162 ML \<open>
   163 val the_tactic =
   164   if test_all @{context} then []
   165   else
   166     map (fn prob_name =>
   167       (TPTP_Reconstruct.naive_reconstruct_tac @{context} interpret_leo2_inference (* auto_based_reconstruction_tac *) (* oracle_based_reconstruction_tac *) prob_name))
   168      prob_names;
   169 \<close>
   170 
   171 
   172 subsection "The 'piecemeal' approach"
   173 ML \<open>
   174 val the_tactics =
   175   if test_all @{context} then []
   176   else
   177     map (fn prob_name =>
   178       TPTP_Reconstruct.naive_reconstruct_tacs interpret_leo2_inference (* auto_based_reconstruction_tac *) (* oracle_based_reconstruction_tac *) prob_name @{context})
   179      prob_names;
   180 \<close>
   181 
   182 declare [[ML_print_depth = 2000]]
   183 
   184 ML \<open>
   185 the_tactics
   186 |> map (filter (fn (_, _, x) => is_none x)
   187         #> map (fn (x, SOME y, _) => (x, cterm_of @{theory} y)))
   188 \<close>
   189 
   190 
   191 section "Using metadata and tactics"
   192 text "There are various ways of testing the two ways (whole tactics or lists of tactics) of representing 'reconstructors'."
   193 
   194 
   195 subsection "The 'one shot' tactic approach"
   196 text "First we test whole tactics."
   197 ML \<open>
   198 (*produce thm*)
   199 if test_all @{context} then []
   200 else
   201   map (
   202     (* try *) (TPTP_Reconstruct.reconstruct @{context}
   203      (fn prob_name =>
   204        TPTP_Reconstruct.naive_reconstruct_tac @{context} interpret_leo2_inference prob_name
   205      (* oracle_based_reconstruction_tac *))))
   206    prob_names
   207 \<close>
   208 
   209 
   210 subsection "The 'piecemeal' approach"
   211 ML \<open>
   212 fun attac n = List.nth (List.nth (the_tactics, 0), n) |> #3 |> the |> snd
   213 fun attac_to n 0 = attac n
   214   | attac_to n m = attac n THEN attac_to (n + 1) (m - 1)
   215 fun shotac n = List.nth (List.nth (the_tactics, 0), n) |> #3 |> the |> fst
   216 \<close>
   217 
   218 ML \<open>
   219 (*Given a list of reconstructed inferences (as in "the_tactics" above,
   220   count the number of failures and successes, and list the failed inference
   221   reconstructions.*)
   222 fun evaluate_the_tactics [] acc = acc
   223   | evaluate_the_tactics ((node_no, (inf_name, inf_fmla, NONE)) :: xs) ((fai, suc), inf_list) =
   224       let
   225         val score = (fai + 1, suc)
   226         val index_info = get_index (fn (x, _) => if x = node_no then SOME true else NONE) inf_list
   227         val inf_list' =
   228           case index_info of
   229               NONE => (node_no, (inf_name, inf_fmla, 1)) :: inf_list
   230             | SOME (idx, _) =>
   231                 nth_map idx (fn (node_no, (inf_name, inf_fmla, count)) => (node_no, (inf_name, inf_fmla, count + 1))) inf_list
   232       in
   233         evaluate_the_tactics xs (score, inf_list')
   234       end
   235   | evaluate_the_tactics ((_, (_, _, SOME _)) :: xs) ((fai, suc), inf_list) =
   236       evaluate_the_tactics xs ((fai, suc + 1), inf_list)
   237 \<close>
   238 
   239 
   240 text "Now we build a tactic by combining lists of tactics"
   241 ML \<open>
   242 (*given a list of tactics to be applied in sequence (i.e., they
   243   follow a skeleton), we build a single tactic, interleaving
   244   some tracing info to help with debugging.*)
   245 fun step_by_step_tacs ctxt verbose (thm_tacs : (thm * tactic) list) : tactic =
   246     let
   247       fun interleave_tacs [] [] = all_tac
   248         | interleave_tacs (tac1 :: tacs1) (tac2 :: tacs2) =
   249             EVERY [tac1, tac2]
   250             THEN interleave_tacs tacs1 tacs2
   251       val thms_to_traceprint =
   252         map (fn thm => fn st =>
   253               (*FIXME uses makestring*)
   254               print_tac ctxt (@{make_string} thm) st)
   255 
   256     in
   257       if verbose then
   258         ListPair.unzip thm_tacs
   259         |> apfst (fn thms => enumerate 1 thms)
   260         |> apfst thms_to_traceprint
   261         |> uncurry interleave_tacs
   262       else EVERY (map #2 thm_tacs)
   263     end
   264 \<close>
   265 
   266 ML \<open>
   267 (*apply step_by_step_tacs to all problems under test*)
   268 fun narrated_tactics ctxt =
   269  map (map (#3 #> the)
   270       #> step_by_step_tacs ctxt false)
   271    the_tactics;
   272 
   273 (*produce thm*)
   274 (*use narrated_tactics to reconstruct all problems under test*)
   275 if test_all @{context} then []
   276 else
   277   map (fn (prob_name, tac) =>
   278          TPTP_Reconstruct.reconstruct @{context}
   279            (fn _ => tac) prob_name)
   280     (ListPair.zip (prob_names, narrated_tactics @{context}))
   281 \<close>
   282 
   283 
   284 subsection "Manually using 'piecemeal' approach"
   285 text "Another testing possibility involves manually creating a lemma
   286 and running through the list of tactics generating to prove that lemma. The following code shows the goal of each problem under test, and then for each problem returns the list of tactics which can be invoked individually as shown below."
   287 ML \<open>
   288 fun show_goal ctxt prob_name =
   289   let
   290     val thy = Proof_Context.theory_of ctxt
   291     val pannot = TPTP_Reconstruct.get_pannot_of_prob thy prob_name
   292   in
   293     #meta pannot
   294     |> filter (fn (_, info) =>
   295         #role info = TPTP_Syntax.Role_Conjecture)
   296     |> hd
   297     |> snd |> #fmla
   298     |> cterm_of thy
   299   end;
   300 
   301 if test_all @{context} then []
   302 else
   303   map (show_goal @{context}) prob_names;
   304 \<close>
   305 
   306 ML \<open>
   307 (*project out the list of tactics from "the_tactics"*)
   308 val just_the_tacs  =
   309  map (map (#3 #> the #> #2))
   310    the_tactics;
   311 
   312 map length just_the_tacs
   313 \<close>
   314 
   315 ML \<open>
   316 (*like just_the_tacs, but extract the thms, to inspect their thys*)
   317 val just_the_thms  =
   318  map (map (#3 #> the #> #1))
   319    the_tactics;
   320 
   321 map length just_the_thms;
   322 \<close>
   323 
   324 ML \<open>
   325 val axms_of_thy =
   326   `Theory.axioms_of
   327   #> apsnd cterm_of
   328   #> swap
   329   #> apsnd (map snd)
   330   #> uncurry map
   331 \<close>
   332 
   333 ML \<open>
   334 (*Show the skeleton-level inference which is done by each element of just_the_tacs. This is useful when debugging using the technique shown next*)
   335 if test_all @{context} orelse prob_names = [] then ()
   336 else
   337   the_tactics
   338   |> hd
   339   |> map #1
   340   |> TPTP_Reconstruct_Library.enumerate 0
   341   |> List.app (@{make_string} #> writeln)
   342 \<close>
   343 
   344 ML \<open>
   345 fun leo2_tac_wrap ctxt prob_name step i st =
   346   rtac (interpret_leo2_inference ctxt prob_name step) i st
   347 \<close>
   348 
   349 (*FIXME move these examples elsewhere*)
   350 (*
   351 lemma "\<forall>(Xj::TPTP_Interpret.ind) Xk::TPTP_Interpret.ind.
   352         bnd_cCKB6_BLACK Xj Xk \<longrightarrow>
   353         bnd_cCKB6_BLACK (bnd_s (bnd_s (bnd_s Xj))) (bnd_s Xk)"
   354 apply (tactic {*nth (nth just_the_tacs 0) 0*})
   355 apply (tactic {*nth (nth just_the_tacs 0) 1*})
   356 apply (tactic {*nth (nth just_the_tacs 0) 2*})
   357 apply (tactic {*nth (nth just_the_tacs 0) 3*})
   358 apply (tactic {*nth (nth just_the_tacs 0) 4*})
   359 apply (tactic {*nth (nth just_the_tacs 0) 5*})
   360 ML_prf "nth (hd the_tactics) 6"
   361 apply (tactic {*nth (nth just_the_tacs 0) 6*})
   362 apply (tactic {*nth (nth just_the_tacs 0) 7*})
   363 apply (tactic {*nth (nth just_the_tacs 0) 8*})
   364 apply (tactic {*nth (nth just_the_tacs 0) 9*})
   365 apply (tactic {*nth (nth just_the_tacs 0) 10*})
   366 apply (tactic {*nth (nth just_the_tacs 0) 11*})
   367 apply (tactic {*nth (nth just_the_tacs 0) 12*})
   368 apply (tactic {*nth (nth just_the_tacs 0) 13*})
   369 apply (tactic {*nth (nth just_the_tacs 0) 14*})
   370 apply (tactic {*nth (nth just_the_tacs 0) 15*})
   371 
   372 apply (tactic {*nth (nth just_the_tacs 0) 16*})
   373 
   374 (*
   375 apply (tactic {*
   376 rtac (interpret_leo2_inference @{context} (hd prob_names) "8") 1
   377 *})
   378 apply (tactic {*
   379 rtac (interpret_leo2_inference @{context} (hd prob_names) "7") 1
   380 *})
   381 apply (tactic {*
   382 rtac (interpret_leo2_inference @{context} (hd prob_names) "6") 1
   383 *})
   384 (*
   385 apply (tactic {*
   386 rtac (interpret_leo2_inference @{context} (hd prob_names) "4") 1
   387 *})
   388 *)
   389 *)
   390 
   391 apply (tactic {*nth (nth just_the_tacs 0) 17*})
   392 apply (tactic {*nth (nth just_the_tacs 0) 18*})
   393 apply (tactic {*nth (nth just_the_tacs 0) 19*})
   394 apply (tactic {*nth (nth just_the_tacs 0) 20*})
   395 apply (tactic {*nth (nth just_the_tacs 0) 21*})
   396 
   397 ML_prf "nth (hd the_tactics) 21"
   398 ML_prf "nth (hd the_tactics) 22"
   399 
   400 apply (tactic {*nth (nth just_the_tacs 0) 22*})
   401 apply (tactic {*nth (nth just_the_tacs 0) 23*})
   402 apply (tactic {*nth (nth just_the_tacs 0) 24*})
   403 apply (tactic {*nth (nth just_the_tacs 0) 25*})
   404 
   405 
   406 ML_prf "nth (hd the_tactics) 19"
   407 
   408 apply (tactic {*
   409 interpret_leo2_inference_wrap (hd prob_names) "8" 1
   410 *})
   411 apply (tactic {*
   412 interpret_leo2_inference_wrap (hd prob_names) "7" 1
   413 *})
   414 apply (tactic {*
   415 interpret_leo2_inference_wrap (hd prob_names) "6" 1
   416 *})
   417 apply (tactic {*
   418 interpret_leo2_inference_wrap (hd prob_names) "4" 1
   419 *})
   420 
   421 ML_prf "nth (hd the_tactics) 20"
   422 ML_prf "nth (hd the_tactics) 21"
   423 ML_prf "nth (hd the_tactics) 22"
   424 *)
   425 
   426 (*
   427 lemma "bnd_powersetE1 \<longrightarrow>
   428      bnd_sepInPowerset \<longrightarrow>
   429      (\<forall>A Xphi. bnd_subset (bnd_dsetconstr A Xphi) A)"
   430 apply (tactic {*nth (nth just_the_tacs 0) 0*})
   431 apply (tactic {*nth (nth just_the_tacs 0) 1*})
   432 apply (tactic {*nth (nth just_the_tacs 0) 2*})
   433 apply (tactic {*nth (nth just_the_tacs 0) 3*})
   434 apply (tactic {*nth (nth just_the_tacs 0) 4*})
   435 apply (tactic {*nth (nth just_the_tacs 0) 5*})
   436 ML_prf "nth (hd the_tactics) 6"
   437 apply (tactic {*nth (nth just_the_tacs 0) 6*})
   438 apply (tactic {*nth (nth just_the_tacs 0) 7*})
   439 apply (tactic {*nth (nth just_the_tacs 0) 8*})
   440 apply (tactic {*nth (nth just_the_tacs 0) 9*})
   441 apply (tactic {*nth (nth just_the_tacs 0) 10*})
   442 apply (tactic {*nth (nth just_the_tacs 0) 11*})
   443 apply (tactic {*nth (nth just_the_tacs 0) 12*})
   444 apply (tactic {*nth (nth just_the_tacs 0) 13*})
   445 apply (tactic {*nth (nth just_the_tacs 0) 14*})
   446 apply (tactic {*nth (nth just_the_tacs 0) 15*})
   447 apply (tactic {*nth (nth just_the_tacs 0) 16*})
   448 apply (tactic {*nth (nth just_the_tacs 0) 17*})
   449 apply (tactic {*nth (nth just_the_tacs 0) 18*})
   450 apply (tactic {*nth (nth just_the_tacs 0) 19*})
   451 apply (tactic {*nth (nth just_the_tacs 0) 20*})
   452 apply (tactic {*nth (nth just_the_tacs 0) 21*})
   453 apply (tactic {*nth (nth just_the_tacs 0) 22*})
   454 apply (tactic {*nth (nth just_the_tacs 0) 23*})
   455 apply (tactic {*nth (nth just_the_tacs 0) 24*})
   456 apply (tactic {*nth (nth just_the_tacs 0) 25*})
   457 (* apply (tactic {*nth (nth just_the_tacs 0) 26*}) *)
   458 ML_prf "nth (hd the_tactics) 26"
   459 apply (subgoal_tac "(\<not> (\<forall>A Xphi. bnd_subset (bnd_dsetconstr A Xphi) A)) =
   460        True \<Longrightarrow>
   461        (\<not> bnd_subset (bnd_dsetconstr bnd_sK1 bnd_sK2) bnd_sK1) =
   462        True")
   463 prefer 2
   464 apply (thin_tac "(bnd_powersetE1 \<longrightarrow>
   465       bnd_sepInPowerset \<longrightarrow>
   466       (\<forall>A Xphi. bnd_subset (bnd_dsetconstr A Xphi) A)) =
   467      False")
   468 apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*})
   469 apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*})
   470 apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*})
   471 apply (tactic {*extcnf_combined_simulator_tac (hd prob_names) 1*})
   472 
   473 apply simp
   474 
   475 (* apply (tactic {*nth (nth just_the_tacs 0) 26*}) *)
   476 apply (tactic {*nth (nth just_the_tacs 0) 27*})
   477 apply (tactic {*nth (nth just_the_tacs 0) 28*})
   478 apply (tactic {*nth (nth just_the_tacs 0) 29*})
   479 apply (tactic {*nth (nth just_the_tacs 0) 30*})
   480 apply (tactic {*nth (nth just_the_tacs 0) 31*})
   481 apply (tactic {*nth (nth just_the_tacs 0) 32*})
   482 apply (tactic {*nth (nth just_the_tacs 0) 33*})
   483 apply (tactic {*nth (nth just_the_tacs 0) 34*})
   484 apply (tactic {*nth (nth just_the_tacs 0) 35*})
   485 apply (tactic {*nth (nth just_the_tacs 0) 36*})
   486 apply (tactic {*nth (nth just_the_tacs 0) 37*})
   487 apply (tactic {*nth (nth just_the_tacs 0) 38*})
   488 apply (tactic {*nth (nth just_the_tacs 0) 39*})
   489 apply (tactic {*nth (nth just_the_tacs 0) 40*})
   490 apply (tactic {*nth (nth just_the_tacs 0) 41*})
   491 apply (tactic {*nth (nth just_the_tacs 0) 42*})
   492 apply (tactic {*nth (nth just_the_tacs 0) 43*})
   493 apply (tactic {*nth (nth just_the_tacs 0) 44*})
   494 apply (tactic {*nth (nth just_the_tacs 0) 45*})
   495 apply (tactic {*nth (nth just_the_tacs 0) 46*})
   496 apply (tactic {*nth (nth just_the_tacs 0) 47*})
   497 apply (tactic {*nth (nth just_the_tacs 0) 48*})
   498 apply (tactic {*nth (nth just_the_tacs 0) 49*})
   499 apply (tactic {*nth (nth just_the_tacs 0) 50*})
   500 apply (tactic {*nth (nth just_the_tacs 0) 51*})
   501 done
   502 *)
   503 
   504 (*
   505 We can use just_the_tacs as follows:
   506 
   507 (this is from SEV012^5.p.out)
   508 lemma "((\<forall>(Xx :: bool) (Xy :: bool). True \<longrightarrow> True) \<and>
   509       (\<forall>(Xx :: bool) (Xy :: bool) (Xz :: bool). True \<and> True \<longrightarrow> True)) \<and>
   510      (\<lambda>(Xx :: bool) (Xy :: bool). True) = (\<lambda>Xx Xy. True)"
   511 apply (tactic {*nth (nth just_the_tacs 0) 0*})
   512 apply (tactic {*nth (nth just_the_tacs 0) 1*})
   513 apply (tactic {*nth (nth just_the_tacs 0) 2*})
   514 apply (tactic {*nth (nth just_the_tacs 0) 3*})
   515 apply (tactic {*nth (nth just_the_tacs 0) 4*})
   516 apply (tactic {*nth (nth just_the_tacs 0) 5*})
   517 ML_prf "nth (hd the_tactics) 6"
   518 apply (tactic {*nth (nth just_the_tacs 0) 6*})
   519 apply (tactic {*nth (nth just_the_tacs 0) 7*})
   520 apply (tactic {*nth (nth just_the_tacs 0) 8*})
   521 apply (tactic {*nth (nth just_the_tacs 0) 9*})
   522 apply (tactic {*nth (nth just_the_tacs 0) 10*})
   523 apply (tactic {*nth (nth just_the_tacs 0) 11*})
   524 apply (tactic {*nth (nth just_the_tacs 0) 12*})
   525 apply (tactic {*nth (nth just_the_tacs 0) 13*})
   526 apply (tactic {*nth (nth just_the_tacs 0) 14*})
   527 apply (tactic {*nth (nth just_the_tacs 0) 15*})
   528 apply (tactic {*nth (nth just_the_tacs 0) 16*})
   529 apply (tactic {*nth (nth just_the_tacs 0) 17*})
   530 apply (tactic {*nth (nth just_the_tacs 0) 18*})
   531 apply (tactic {*nth (nth just_the_tacs 0) 19*})
   532 apply (tactic {*nth (nth just_the_tacs 0) 20*})
   533 apply (tactic {*nth (nth just_the_tacs 0) 21*})
   534 apply (tactic {*nth (nth just_the_tacs 0) 22*})
   535 done
   536 
   537 (*
   538 We could also use previous definitions directly,
   539 e.g. the following should prove the goal at a go:
   540 - apply (tactic {*narrated_tactics |> hd |> hd*})
   541 - apply (tactic {*
   542     TPTP_Reconstruct.naive_reconstruct_tac
   543      interpret_leo2_inference
   544      (hd prob_names)
   545      @{context}*})
   546 (Note that the previous two methods don't work in this
   547  "lemma" testing mode, not sure why. The previous methods
   548  (producing the thm values directly) should work though.)
   549 *)
   550 *)
   551 
   552 
   553 section "Testing against benchmark"
   554 
   555 ML \<open>
   556 (*if reconstruction_info value is NONE then a big error must have occurred*)
   557 type reconstruction_info =
   558   ((int(*no of failures*) * int(*no of successes*)) *
   559   (TPTP_Reconstruct.rolling_stock * term option(*inference formula*) * int (*number of times the inference occurs in the skeleton*)) list) option
   560 
   561 datatype proof_contents =
   562     No_info
   563   | Empty
   564   | Nonempty of reconstruction_info
   565 
   566 (*To make output less cluttered in whole-run tests*)
   567 fun erase_inference_fmlas (Nonempty (SOME (outline, inf_info))) =
   568       Nonempty (SOME (outline, map (fn (inf_name, _, count) => (inf_name, NONE, count)) inf_info))
   569   | erase_inference_fmlas x = x
   570 \<close>
   571 
   572 ML \<open>
   573 (*Report on how many inferences in a proof are reconstructed, and give some
   574   info about the inferences for which reconstruction failed.*)
   575 fun test_partial_reconstruction thy prob_file =
   576   let
   577     val prob_name =
   578       (Path.base #> Path.implode #> TPTP_Problem_Name.Nonstandard) prob_file
   579 
   580     val thy' =
   581       try
   582        (TPTP_Reconstruct.import_thm
   583         true
   584         [Path.dir prob_file, Path.explode "$TPTP"]
   585         prob_file leo2_on_load)
   586        thy
   587 
   588     val ctxt' =
   589       if is_some thy' then SOME (Proof_Context.init_global (the thy')) else NONE
   590 
   591     (*to test if proof is empty*)
   592     val fms =
   593       if is_some thy'
   594       then SOME (TPTP_Reconstruct.get_fmlas_of_prob (the thy') prob_name)
   595       else NONE
   596 
   597     val the_tactics =
   598       if is_some thy' then
   599         SOME (TPTP_Reconstruct.naive_reconstruct_tacs (* metis_based_reconstruction_tac *)
   600 interpret_leo2_inference (* auto_based_reconstruction_tac *) (* oracle_based_reconstruction_tac *) prob_name (the ctxt'))
   601       else NONE
   602 
   603 (* val _ = tracing ("tt=" ^ @{make_string} the_tactics) *)
   604 
   605     val skeleton =
   606       if is_some thy' then
   607         SOME (TPTP_Reconstruct.make_skeleton (the ctxt')
   608               (TPTP_Reconstruct.get_pannot_of_prob (the thy') prob_name))
   609       else NONE
   610 
   611     val skeleton_and_tactics =
   612       if is_some thy' then
   613         SOME (ListPair.zip (the skeleton, the the_tactics))
   614       else NONE
   615 
   616     val result =
   617       if is_some thy' then
   618         SOME (evaluate_the_tactics (the skeleton_and_tactics)
   619               ((0, 0), []))
   620       else NONE
   621 
   622     (*strip node names*)
   623     val result' =
   624       if is_some result then SOME (apsnd (map #2) (the result)) else NONE
   625   in
   626     if is_some fms andalso List.null (the fms) then Empty
   627     else Nonempty result'
   628   end
   629 \<close>
   630 
   631 ML \<open>
   632   (*default timeout is 1 min*)
   633   fun reconstruct timeout light_output file thy =
   634     let
   635       val timer = Timer.startRealTimer ()
   636     in
   637       Timeout.apply (Time.fromSeconds (if timeout = 0 then 60 else timeout))
   638        (test_partial_reconstruction thy
   639         #> light_output ? erase_inference_fmlas
   640         #> @{make_string} (* FIXME *)
   641         #> (fn s => report (Proof_Context.init_global thy) (@{make_string} file ^ " === " ^ s ^
   642              " t=" ^ (Timer.checkRealTimer timer |> Time.toMilliseconds |> @{make_string}))))
   643        file
   644     end
   645 \<close>
   646 
   647 ML \<open>
   648   (*this version of "reconstruct" builds theorems, instead of lists of reconstructed inferences*)
   649   (*default timeout is 1 min*)
   650   fun reconstruct timeout file thy =
   651     let
   652       val timer = Timer.startRealTimer ()
   653       val thy' =
   654         TPTP_Reconstruct.import_thm true
   655          [Path.dir file, Path.explode "$TPTP"]
   656          file leo2_on_load thy
   657 
   658       val ctxt = Proof_Context.init_global thy' (*FIXME pass ctxt instead of thy*)
   659       val prob_name =
   660         file
   661         |> Path.base
   662         |> Path.implode
   663         |> TPTP_Problem_Name.Nonstandard
   664     in
   665       Timeout.apply (Time.fromSeconds (if timeout = 0 then 60 else timeout))
   666        (fn prob_name =>
   667         (can
   668           (TPTP_Reconstruct.reconstruct ctxt (fn prob_name =>
   669             TPTP_Reconstruct.naive_reconstruct_tac ctxt interpret_leo2_inference prob_name (* oracle_based_reconstruction_tac *))) prob_name )
   670        |> (fn s => report ctxt (Path.print file ^ " === " ^ Bool.toString s ^
   671              " t=" ^ (Timer.checkRealTimer timer |> Time.toMilliseconds |> @{make_string}))))
   672        prob_name
   673     end
   674 \<close>
   675 
   676 ML \<open>
   677   fun reconstruction_test timeout ctxt =
   678     test_fn ctxt
   679      (fn file => reconstruct timeout file (Proof_Context.theory_of ctxt))
   680      "reconstructor"
   681      ()
   682 \<close>
   683 
   684 ML \<open>
   685 datatype examination_results =
   686     Whole_proof of string(*filename*) * proof_contents
   687   | Specific_rule of string(*filename*) * string(*inference rule*) * term option list
   688 
   689 (*Look out for failures reconstructing a particular inference rule*)
   690 fun filter_failures inference_name (Whole_proof (filename, results)) =
   691   let
   692     val filtered_results =
   693       case results of
   694           Nonempty (SOME results') =>
   695             #2 results'
   696             |> maps (fn (stock as TPTP_Reconstruct.Annotated_step (_, inf_name), inf_fmla, _) =>
   697                  if inf_name = inference_name then [inf_fmla] else [])
   698         | _ => []
   699   in Specific_rule (filename, inference_name, filtered_results) end
   700 
   701 (*Returns detailed info about a proof-reconstruction attempt.
   702   If rule_name is specified then the related failed inferences
   703   are returned, otherwise all failed inferences are returned.*)
   704 fun examine_failed_inferences ctxt filename rule_name =
   705   let
   706     val thy = Proof_Context.theory_of ctxt
   707     val prob_file = Path.explode filename
   708     val results =
   709       if test_all ctxt then No_info
   710       else test_partial_reconstruction thy prob_file
   711   in
   712     Whole_proof (filename, results)
   713     |> is_some rule_name ? (fn x =>
   714                              filter_failures (the rule_name) x)
   715   end
   716 \<close>
   717 
   718 ML \<open>
   719 exception NONSENSE
   720 
   721 fun annotation_or_id (TPTP_Reconstruct.Step n) = n
   722   | annotation_or_id (TPTP_Reconstruct.Annotated_step (n, anno)) = anno
   723   | annotation_or_id TPTP_Reconstruct.Assumed = "assumption"
   724   | annotation_or_id TPTP_Reconstruct.Unconjoin = "conjI"
   725   | annotation_or_id TPTP_Reconstruct.Caboose = "(end)"
   726   | annotation_or_id (TPTP_Reconstruct.Synth_step s) = s
   727   | annotation_or_id (TPTP_Reconstruct.Split (split_node, soln_node, _)) = "split_at " ^ split_node ^ " " ^ soln_node;
   728 
   729 fun count_failures (Whole_proof (_, No_info)) = raise NONSENSE
   730   | count_failures (Whole_proof (_, Empty)) = raise NONSENSE
   731   | count_failures (Whole_proof (_, Nonempty NONE)) = raise NONSENSE
   732   | count_failures (Whole_proof (_, Nonempty (SOME (((n, _), _))))) = n
   733   | count_failures (Specific_rule (_, _, t)) = length t
   734 
   735 fun pre_classify_failures [] alist = alist
   736   | pre_classify_failures ((stock, _, _) :: xs) alist =
   737       let
   738         val inf = annotation_or_id stock
   739         val count = AList.lookup (op =) alist inf
   740       in
   741         if is_none count
   742         then pre_classify_failures xs ((inf, 1) :: alist)
   743         else
   744           pre_classify_failures xs
   745            (AList.update (op =) (inf, the count + 1) alist)
   746       end
   747 
   748 fun classify_failures (Whole_proof (_, Nonempty (SOME (((_, _), inferences))))) = pre_classify_failures inferences []
   749   | classify_failures (Specific_rule (_, rule, t)) = [(rule, length t)]
   750   | classify_failures _ = raise NONSENSE
   751 \<close>
   752 
   753 ML \<open>
   754 val regressions = map (fn s => "$THF_PROOFS/" ^ s)
   755   ["SEV405^5.p.out",
   756    (*"SYO377^5.p.out", Always seems to raise Interrupt on my laptop -- probably because node 475 has lots of premises*)
   757    "PUZ031^5.p.out",
   758    "ALG001^5.p.out",
   759    "SYO238^5.p.out",
   760    (*"SEV158^5.p.out", This is big*)
   761    "SYO285^5.p.out",
   762    "../SYO285^5.p.out_reduced",
   763    (* "SYO225^5.p.out", This is big*)
   764    "SYO291^5.p.out",
   765    "SET669^3.p.out",
   766    "SEV233^5.p.out",
   767    (*"SEU511^1.p.out", This is big*)
   768    "SEV161^5.p.out",
   769    "SEV012^5.p.out",
   770    "SYO035^1.p.out",
   771    "SYO291^5.p.out",
   772    "SET741^4.p.out", (*involves both definitions and contorted splitting. has nice graph.*)
   773    "SEU548^2.p.out",
   774    "SEU513^2.p.out",
   775    "SYO006^1.p.out",
   776    "SYO371^5.p.out" (*has contorted splitting, like SYO006^1.p.out, but doesn't involve definitions*)
   777   ]
   778 \<close>
   779 
   780 ML \<open>
   781 val experiment = examine_failed_inferences @{context}
   782   (List.last regressions) NONE;
   783 
   784 (*
   785 val experiment_focus =
   786   filter_failures "extcnf_combined" experiment;
   787 *)
   788 
   789 (*
   790 count_failures experiment_focus
   791 classify_failures experiment
   792 *)
   793 \<close>
   794 
   795 text "Run reconstruction on all problems in a benchmark (provided via a script)
   796 and report on partial success."
   797 
   798 declare [[
   799   tptp_test_all = true,
   800   tptp_test_timeout = 10
   801 ]]
   802 
   803 ML \<open>
   804   (*problem source*)
   805   val tptp_probs_dir =
   806     Path.explode "$THF_PROOFS"
   807     |> Path.expand;
   808 \<close>
   809 
   810 ML \<open>
   811   if test_all @{context} then
   812     (report @{context} "Reconstructing proofs";
   813     S timed_test (reconstruction_test (get_timeout @{context})) @{context} (TPTP_Syntax.get_file_list tptp_probs_dir))
   814   else ()
   815 \<close>
   816 
   817 (*
   818 Debugging strategy:
   819   1) get list of all proofs
   820   2) order by size
   821   3) try to construct each in turn, given some timeout
   822 
   823 Use this to find the smallest failure, then debug that.
   824 *)
   825 end