diff -r d26348b667f2 -r 25153c08655e src/HOL/Probability/Information.thy --- a/src/HOL/Probability/Information.thy Mon May 03 14:35:10 2010 +0200 +++ b/src/HOL/Probability/Information.thy Mon May 03 14:35:10 2010 +0200 @@ -2,168 +2,263 @@ imports Probability_Space Product_Measure "../Multivariate_Analysis/Convex" begin -lemma pos_neg_part_abs: - fixes f :: "'a \ real" - shows "pos_part f x + neg_part f x = \f x\" -unfolding real_abs_def pos_part_def neg_part_def by auto +section "Convex theory" -lemma pos_part_abs: - fixes f :: "'a \ real" - shows "pos_part (\ x. \f x\) y = \f y\" -unfolding pos_part_def real_abs_def by auto - -lemma neg_part_abs: - fixes f :: "'a \ real" - shows "neg_part (\ x. \f x\) y = 0" -unfolding neg_part_def real_abs_def by auto +lemma log_setsum: + assumes "finite s" "s \ {}" + assumes "b > 1" + assumes "(\ i \ s. a i) = 1" + assumes "\ i. i \ s \ a i \ 0" + assumes "\ i. i \ s \ y i \ {0 <..}" + shows "log b (\ i \ s. a i * y i) \ (\ i \ s. a i * log b (y i))" +proof - + have "convex_on {0 <..} (\ x. - log b x)" + by (rule minus_log_convex[OF `b > 1`]) + hence "- log b (\ i \ s. a i * y i) \ (\ i \ s. a i * - log b (y i))" + using convex_on_setsum[of _ _ "\ x. - log b x"] assms pos_is_convex by fastsimp + thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le) +qed -lemma (in measure_space) int_abs: - assumes "integrable f" - shows "integrable (\ x. \f x\)" -using assms +lemma log_setsum': + assumes "finite s" "s \ {}" + assumes "b > 1" + assumes "(\ i \ s. a i) = 1" + assumes pos: "\ i. i \ s \ 0 \ a i" + "\ i. \ i \ s ; 0 < a i \ \ 0 < y i" + shows "log b (\ i \ s. a i * y i) \ (\ i \ s. a i * log b (y i))" proof - - from assms obtain p q where pq: "p \ nnfis (pos_part f)" "q \ nnfis (neg_part f)" - unfolding integrable_def by auto - hence "p + q \ nnfis (\ x. pos_part f x + neg_part f x)" - using nnfis_add by auto - hence "p + q \ nnfis (\ x. \f x\)" using pos_neg_part_abs[of f] by simp - thus ?thesis unfolding integrable_def - using ext[OF pos_part_abs[of f], of "\ y. y"] - ext[OF neg_part_abs[of f], of "\ y. y"] - using nnfis_0 by auto + have "\y. (\ i \ s - {i. a i = 0}. a i * y i) = (\ i \ s. a i * y i)" + using assms by (auto intro!: setsum_mono_zero_cong_left) + moreover have "log b (\ i \ s - {i. a i = 0}. a i * y i) \ (\ i \ s - {i. a i = 0}. a i * log b (y i))" + proof (rule log_setsum) + have "setsum a (s - {i. a i = 0}) = setsum a s" + using assms(1) by (rule setsum_mono_zero_cong_left) auto + thus sum_1: "setsum a (s - {i. a i = 0}) = 1" + "finite (s - {i. a i = 0})" using assms by simp_all + + show "s - {i. a i = 0} \ {}" + proof + assume *: "s - {i. a i = 0} = {}" + hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty) + with sum_1 show False by simp +qed + + fix i assume "i \ s - {i. a i = 0}" + hence "i \ s" "a i \ 0" by simp_all + thus "0 \ a i" "y i \ {0<..}" using pos[of i] by auto + qed fact+ + ultimately show ?thesis by simp qed -lemma (in measure_space) measure_mono: - assumes "a \ b" "a \ sets M" "b \ sets M" - shows "measure M a \ measure M b" +section "Information theory" + +lemma (in finite_prob_space) sum_over_space_distrib: + "(\x\X`space M. distribution X {x}) = 1" + unfolding distribution_def prob_space[symmetric] using finite_space + by (subst measure_finitely_additive'') + (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=prob]) + +locale finite_information_space = finite_prob_space + + fixes b :: real assumes b_gt_1: "1 < b" + +definition + "KL_divergence b M X Y = + measure_space.integral (M\measure := X\) + (\x. log b ((measure_space.RN_deriv (M \measure := Y\ ) X) x))" + +lemma (in finite_prob_space) distribution_mono: + assumes "\t. \ t \ space M ; X t \ x \ \ Y t \ y" + shows "distribution X x \ distribution Y y" + unfolding distribution_def + using assms by (auto simp: sets_eq_Pow intro!: measure_mono) + +lemma (in prob_space) distribution_remove_const: + shows "joint_distribution X (\x. ()) {(x, ())} = distribution X {x}" + and "joint_distribution (\x. ()) X {((), x)} = distribution X {x}" + and "joint_distribution X (\x. (Y x, ())) {(x, y, ())} = joint_distribution X Y {(x, y)}" + and "joint_distribution X (\x. ((), Y x)) {(x, (), y)} = joint_distribution X Y {(x, y)}" + and "distribution (\x. ()) {()} = 1" + unfolding prob_space[symmetric] + by (auto intro!: arg_cong[where f=prob] simp: distribution_def) + + +context finite_information_space +begin + +lemma distribution_mono_gt_0: + assumes gt_0: "0 < distribution X x" + assumes *: "\t. \ t \ space M ; X t \ x \ \ Y t \ y" + shows "0 < distribution Y y" + by (rule less_le_trans[OF gt_0 distribution_mono]) (rule *) + +lemma + assumes "0 \ A" and pos: "0 < A \ 0 < B" "0 < A \ 0 < C" + shows mult_log_mult: "A * log b (B * C) = A * log b B + A * log b C" (is "?mult") + and mult_log_divide: "A * log b (B / C) = A * log b B - A * log b C" (is "?div") proof - - have "b = a \ (b - a)" using assms by auto - moreover have "{} = a \ (b - a)" by auto - ultimately have "measure M b = measure M a + measure M (b - a)" - using measure_additive[of a "b - a"] local.Diff[of b a] assms by auto - moreover have "measure M (b - a) \ 0" using positive assms by auto - ultimately show "measure M a \ measure M b" by auto + have "?mult \ ?div" +proof (cases "A = 0") + case False + hence "0 < A" using `0 \ A` by auto + with pos[OF this] show "?mult \ ?div" using b_gt_1 + by (auto simp: log_divide log_mult field_simps) +qed simp + thus ?mult and ?div by auto qed -lemma (in measure_space) integral_0: - fixes f :: "'a \ real" - assumes "integrable f" "integral f = 0" "nonneg f" and borel: "f \ borel_measurable M" - shows "measure M ({x. f x \ 0} \ space M) = 0" -proof - - have "{x. f x \ 0} = {x. \f x\ > 0}" by auto - moreover - { fix y assume "y \ {x. \ f x \ > 0}" - hence "\ f y \ > 0" by auto - hence "\ n. \f y\ \ inverse (real (Suc n))" - using ex_inverse_of_nat_Suc_less[of "\f y\"] less_imp_le unfolding real_of_nat_def by auto - hence "y \ (\ n. {x. \f x\ \ inverse (real (Suc n))})" - by auto } - moreover - { fix y assume "y \ (\ n. {x. \f x\ \ inverse (real (Suc n))})" - then obtain n where n: "y \ {x. \f x\ \ inverse (real (Suc n))}" by auto - hence "\f y\ \ inverse (real (Suc n))" by auto - hence "\f y\ > 0" - using real_of_nat_Suc_gt_zero - positive_imp_inverse_positive[of "real_of_nat (Suc n)"] by fastsimp - hence "y \ {x. \f x\ > 0}" by auto } - ultimately have fneq0_UN: "{x. f x \ 0} = (\ n. {x. \f x\ \ inverse (real (Suc n))})" - by blast - { fix n - have int_one: "integrable (\ x. \f x\ ^ 1)" using int_abs assms by auto - have "measure M (f -` {inverse (real (Suc n))..} \ space M) - \ integral (\ x. \f x\ ^ 1) / (inverse (real (Suc n)) ^ 1)" - using markov_ineq[OF `integrable f` _ int_one] real_of_nat_Suc_gt_zero by auto - hence le0: "measure M (f -` {inverse (real (Suc n))..} \ space M) \ 0" - using assms unfolding nonneg_def by auto - have "{x. f x \ inverse (real (Suc n))} \ space M \ sets M" - apply (subst Int_commute) unfolding Int_def - using borel[unfolded borel_measurable_ge_iff] by simp - hence m0: "measure M ({x. f x \ inverse (real (Suc n))} \ space M) = 0 \ - {x. f x \ inverse (real (Suc n))} \ space M \ sets M" - using positive le0 unfolding atLeast_def by fastsimp } - moreover hence "range (\ n. {x. f x \ inverse (real (Suc n))} \ space M) \ sets M" - by auto - moreover - { fix n - have "inverse (real (Suc n)) \ inverse (real (Suc (Suc n)))" - using less_imp_inverse_less real_of_nat_Suc_gt_zero[of n] by fastsimp - hence "\ x. f x \ inverse (real (Suc n)) \ f x \ inverse (real (Suc (Suc n)))" by (rule order_trans) - hence "{x. f x \ inverse (real (Suc n))} \ space M - \ {x. f x \ inverse (real (Suc (Suc n)))} \ space M" by auto } - ultimately have "(\ x. 0) ----> measure M (\ n. {x. f x \ inverse (real (Suc n))} \ space M)" - using monotone_convergence[of "\ n. {x. f x \ inverse (real (Suc n))} \ space M"] - unfolding o_def by (simp del: of_nat_Suc) - hence "measure M (\ n. {x. f x \ inverse (real (Suc n))} \ space M) = 0" - using LIMSEQ_const[of 0] LIMSEQ_unique by simp - hence "measure M ((\ n. {x. \f x\ \ inverse (real (Suc n))}) \ space M) = 0" - using assms unfolding nonneg_def by auto - thus "measure M ({x. f x \ 0} \ space M) = 0" using fneq0_UN by simp +lemma split_pairs: + shows + "((A, B) = X) \ (fst X = A \ snd X = B)" and + "(X = (A, B)) \ (fst X = A \ snd X = B)" by auto + +ML {* + + (* tactic to solve equations of the form @{term "W * log b (X / (Y * Z)) = W * log b X - W * log b (Y * Z)"} + where @{term W} is a joint distribution of @{term X}, @{term Y}, and @{term Z}. *) + + val mult_log_intros = [@{thm mult_log_divide}, @{thm mult_log_mult}] + val intros = [@{thm divide_pos_pos}, @{thm mult_pos_pos}, @{thm positive_distribution}] + + val distribution_gt_0_tac = (rtac @{thm distribution_mono_gt_0} + THEN' assume_tac + THEN' clarsimp_tac (clasimpset_of @{context} addsimps2 @{thms split_pairs})) + + val distr_mult_log_eq_tac = REPEAT_ALL_NEW (CHANGED o TRY o + (resolve_tac (mult_log_intros @ intros) + ORELSE' distribution_gt_0_tac + ORELSE' clarsimp_tac (clasimpset_of @{context}))) + + fun instanciate_term thy redex intro = + let + val intro_concl = Thm.concl_of intro + + val lhs = intro_concl |> HOLogic.dest_Trueprop |> HOLogic.dest_eq |> fst + + val m = SOME (Pattern.match thy (lhs, redex) (Vartab.empty, Vartab.empty)) + handle Pattern.MATCH => NONE + + in + Option.map (fn m => Envir.subst_term m intro_concl) m + end + + fun mult_log_simproc simpset redex = + let + val ctxt = Simplifier.the_context simpset + val thy = ProofContext.theory_of ctxt + fun prove (SOME thm) = (SOME + (Goal.prove ctxt [] [] thm (K (distr_mult_log_eq_tac 1)) + |> mk_meta_eq) + handle THM _ => NONE) + | prove NONE = NONE + in + get_first (instanciate_term thy (term_of redex) #> prove) mult_log_intros + end +*} + +simproc_setup mult_log ("distribution X x * log b (A * B)" | + "distribution X x * log b (A / B)") = {* K mult_log_simproc *} + +end + +lemma KL_divergence_eq_finite: + assumes u: "finite_measure_space (M\measure := u\)" + assumes v: "finite_measure_space (M\measure := v\)" + assumes u_0: "\x. \ x \ space M ; v {x} = 0 \ \ u {x} = 0" + shows "KL_divergence b M u v = (\x\space M. u {x} * log b (u {x} / v {x}))" (is "_ = ?sum") +proof (simp add: KL_divergence_def, subst finite_measure_space.integral_finite_singleton, simp_all add: u) + have ms_u: "measure_space (M\measure := u\)" + using u unfolding finite_measure_space_def by simp + + show "(\x \ space M. log b (measure_space.RN_deriv (M\measure := v\) u x) * u {x}) = ?sum" + apply (rule setsum_cong[OF refl]) + apply simp + apply (safe intro!: arg_cong[where f="log b"] ) + apply (subst finite_measure_space.RN_deriv_finite_singleton) + using assms ms_u by auto qed -definition - "KL_divergence b M u v = - measure_space.integral (M\measure := u\) - (\x. log b ((measure_space.RN_deriv (M \measure := v\ ) u) x))" - -lemma (in finite_prob_space) finite_measure_space: - shows "finite_measure_space \ space = X ` space M, sets = Pow (X ` space M), measure = distribution X\" - (is "finite_measure_space ?S") -proof (rule finite_Pow_additivity_sufficient, simp_all) - show "finite (X ` space M)" using finite_space by simp - - show "positive ?S (distribution X)" unfolding distribution_def - unfolding positive_def using positive'[unfolded positive_def] sets_eq_Pow by auto +lemma log_setsum_divide: + assumes "finite S" and "S \ {}" and "1 < b" + assumes "(\x\S. g x) = 1" + assumes pos: "\x. x \ S \ g x \ 0" "\x. x \ S \ f x \ 0" + assumes g_pos: "\x. \ x \ S ; 0 < g x \ \ 0 < f x" + shows "- (\x\S. g x * log b (g x / f x)) \ log b (\x\S. f x)" +proof - + have log_mono: "\x y. 0 < x \ x \ y \ log b x \ log b y" + using `1 < b` by (subst log_le_cancel_iff) auto - show "additive ?S (distribution X)" unfolding additive_def distribution_def - proof (simp, safe) - fix x y - have x: "(X -` x) \ space M \ sets M" - and y: "(X -` y) \ space M \ sets M" using sets_eq_Pow by auto - assume "x \ y = {}" - from additive[unfolded additive_def, rule_format, OF x y] this - have "prob (((X -` x) \ (X -` y)) \ space M) = - prob ((X -` x) \ space M) + prob ((X -` y) \ space M)" - apply (subst Int_Un_distrib2) - by auto - thus "prob ((X -` x \ X -` y) \ space M) = prob (X -` x \ space M) + prob (X -` y \ space M)" - by auto + have "- (\x\S. g x * log b (g x / f x)) = (\x\S. g x * log b (f x / g x))" + proof (unfold setsum_negf[symmetric], rule setsum_cong) + fix x assume x: "x \ S" + show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)" + proof (cases "g x = 0") + case False + with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all + thus ?thesis using `1 < b` by (simp add: log_divide field_simps) + qed simp + qed rule + also have "... \ log b (\x\S. g x * (f x / g x))" + proof (rule log_setsum') + fix x assume x: "x \ S" "0 < g x" + with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos) + qed fact+ + also have "... = log b (\x\S - {x. g x = 0}. f x)" using `finite S` + by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"] + split: split_if_asm) + also have "... \ log b (\x\S. f x)" + proof (rule log_mono) + have "0 = (\x\S - {x. g x = 0}. 0)" by simp + also have "... < (\x\S - {x. g x = 0}. f x)" (is "_ < ?sum") + proof (rule setsum_strict_mono) + show "finite (S - {x. g x = 0})" using `finite S` by simp + show "S - {x. g x = 0} \ {}" + proof + assume "S - {x. g x = 0} = {}" + hence "(\x\S. g x) = 0" by (subst setsum_0') auto + with `(\x\S. g x) = 1` show False by simp + qed + fix x assume "x \ S - {x. g x = 0}" + thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto + qed + finally show "0 < ?sum" . + show "(\x\S - {x. g x = 0}. f x) \ (\x\S. f x)" + using `finite S` pos by (auto intro!: setsum_mono2) qed + finally show ?thesis . qed -lemma (in finite_prob_space) finite_prob_space: - "finite_prob_space \ space = X ` space M, sets = Pow (X ` space M), measure = distribution X\" - (is "finite_prob_space ?S") - unfolding finite_prob_space_def prob_space_def prob_space_axioms_def -proof safe - show "finite_measure_space ?S" by (rule finite_measure_space) - thus "measure_space ?S" by (simp add: finite_measure_space_def) +lemma KL_divergence_positive_finite: + assumes u: "finite_prob_space (M\measure := u\)" + assumes v: "finite_prob_space (M\measure := v\)" + assumes u_0: "\x. \ x \ space M ; v {x} = 0 \ \ u {x} = 0" + and "1 < b" + shows "0 \ KL_divergence b M u v" +proof - + interpret u: finite_prob_space "M\measure := u\" using u . + interpret v: finite_prob_space "M\measure := v\" using v . - have "X -` X ` space M \ space M = space M" by auto - thus "measure ?S (space ?S) = 1" - by (simp add: distribution_def prob_space) -qed + have *: "space M \ {}" using u.not_empty by simp -lemma (in finite_prob_space) finite_measure_space_image_prod: - "finite_measure_space \space = X ` space M \ Y ` space M, - sets = Pow (X ` space M \ Y ` space M), measure_space.measure = distribution (\x. (X x, Y x))\" - (is "finite_measure_space ?Z") -proof (rule finite_Pow_additivity_sufficient, simp_all) - show "finite (X ` space M \ Y ` space M)" using finite_space by simp + have "- (KL_divergence b M u v) \ log b (\x\space M. v {x})" + proof (subst KL_divergence_eq_finite, safe intro!: log_setsum_divide *) + show "finite_measure_space (M\measure := u\)" + "finite_measure_space (M\measure := v\)" + using u v unfolding finite_prob_space_eq by simp_all - let ?d = "distribution (\x. (X x, Y x))" + show "finite (space M)" using u.finite_space by simp + show "1 < b" by fact + show "(\x\space M. u {x}) = 1" using u.sum_over_space_eq_1 by simp - show "positive ?Z ?d" - using sets_eq_Pow by (auto simp: positive_def distribution_def intro!: positive) + fix x assume x: "x \ space M" + thus pos: "0 \ u {x}" "0 \ v {x}" + using u.positive u.sets_eq_Pow v.positive v.sets_eq_Pow by simp_all - show "additive ?Z ?d" unfolding additive_def - proof safe - fix x y assume "x \ sets ?Z" and "y \ sets ?Z" - assume "x \ y = {}" - thus "?d (x \ y) = ?d x + ?d y" - apply (simp add: distribution_def) - apply (subst measure_additive[unfolded sets_eq_Pow, simplified]) - by (auto simp: Un_Int_distrib Un_Int_distrib2 intro!: arg_cong[where f=prob]) + { assume "v {x} = 0" from u_0[OF x this] show "u {x} = 0" . } + { assume "0 < u {x}" + hence "v {x} \ 0" using u_0[OF x] by auto + with pos show "0 < v {x}" by simp } qed + thus "0 \ KL_divergence b M u v" using v.sum_over_space_eq_1 by simp qed definition (in prob_space) @@ -174,346 +269,142 @@ in KL_divergence b prod_space (joint_distribution X Y) (measure prod_space)" -abbreviation (in finite_prob_space) - finite_mutual_information ("\\<^bsub>_\<^esub>'(_ ; _')") where - "\\<^bsub>b\<^esub>(X ; Y) \ mutual_information b +abbreviation (in finite_information_space) + finite_mutual_information ("\'(_ ; _')") where + "\(X ; Y) \ mutual_information b \ space = X`space M, sets = Pow (X`space M) \ \ space = Y`space M, sets = Pow (Y`space M) \ X Y" -abbreviation (in finite_prob_space) - finite_mutual_information_2 :: "('a \ 'c) \ ('a \ 'd) \ real" ("\'(_ ; _')") where - "\(X ; Y) \ \\<^bsub>2\<^esub>(X ; Y)" +lemma (in finite_measure_space) measure_spaceI: "measure_space M" + by unfold_locales -lemma (in prob_space) mutual_information_cong: - assumes [simp]: "space S1 = space S3" "sets S1 = sets S3" - "space S2 = space S4" "sets S2 = sets S4" - shows "mutual_information b S1 S2 X Y = mutual_information b S3 S4 X Y" - unfolding mutual_information_def by simp +lemma prod_measure_times_finite: + assumes fms: "finite_measure_space M" "finite_measure_space M'" and a: "a \ space M \ space M'" + shows "prod_measure M M' {a} = measure M {fst a} * measure M' {snd a}" +proof (cases a) + case (Pair b c) + hence a_eq: "{a} = {b} \ {c}" by simp -lemma (in prob_space) joint_distribution: - "joint_distribution X Y = distribution (\x. (X x, Y x))" - unfolding joint_distribution_def_raw distribution_def_raw .. + with fms[THEN finite_measure_space.measure_spaceI] + fms[THEN finite_measure_space.sets_eq_Pow] a Pair + show ?thesis unfolding a_eq + by (subst prod_measure_times) simp_all +qed -lemma (in finite_prob_space) finite_mutual_information_reduce: - "\\<^bsub>b\<^esub>(X;Y) = (\ (x,y) \ X ` space M \ Y ` space M. - distribution (\x. (X x, Y x)) {(x,y)} * log b (distribution (\x. (X x, Y x)) {(x,y)} / - (distribution X {x} * distribution Y {y})))" - (is "_ = setsum ?log ?prod") - unfolding Let_def mutual_information_def KL_divergence_def -proof (subst finite_measure_space.integral_finite_singleton, simp_all add: joint_distribution) - let ?X = "\space = X ` space M, sets = Pow (X ` space M), measure_space.measure = distribution X\" - let ?Y = "\space = Y ` space M, sets = Pow (Y ` space M), measure_space.measure = distribution Y\" - let ?P = "prod_measure_space ?X ?Y" +lemma setsum_cartesian_product': + "(\x\A \ B. f x) = (\x\A. setsum (\y. f (x, y)) B)" + unfolding setsum_cartesian_product by simp - interpret X: finite_measure_space "?X" by (rule finite_measure_space) - moreover interpret Y: finite_measure_space "?Y" by (rule finite_measure_space) - ultimately have ms_X: "measure_space ?X" and ms_Y: "measure_space ?Y" by unfold_locales - - interpret P: finite_measure_space "?P" by (rule finite_measure_space_finite_prod_measure) (fact+) - - let ?P' = "measure_update (\_. distribution (\x. (X x, Y x))) ?P" - from finite_measure_space_image_prod[of X Y] - sigma_prod_sets_finite[OF X.finite_space Y.finite_space] - show "finite_measure_space ?P'" - by (simp add: X.sets_eq_Pow Y.sets_eq_Pow joint_distribution finite_measure_space_def prod_measure_space_def) +lemma (in finite_information_space) + assumes MX: "finite_prob_space \ space = space MX, sets = sets MX, measure = distribution X\" + (is "finite_prob_space ?MX") + assumes MY: "finite_prob_space \ space = space MY, sets = sets MY, measure = distribution Y\" + (is "finite_prob_space ?MY") + and X_space: "X ` space M \ space MX" and Y_space: "Y ` space M \ space MY" + shows mutual_information_eq_generic: + "mutual_information b MX MY X Y = (\ (x,y) \ space MX \ space MY. + joint_distribution X Y {(x,y)} * + log b (joint_distribution X Y {(x,y)} / + (distribution X {x} * distribution Y {y})))" + (is "?equality") + and mutual_information_positive_generic: + "0 \ mutual_information b MX MY X Y" (is "?positive") +proof - + let ?P = "prod_measure_space ?MX ?MY" + let ?measure = "joint_distribution X Y" + let ?P' = "measure_update (\_. ?measure) ?P" - show "(\x \ space ?P. log b (measure_space.RN_deriv ?P (distribution (\x. (X x, Y x))) x) * distribution (\x. (X x, Y x)) {x}) - = setsum ?log ?prod" - proof (rule setsum_cong) - show "space ?P = ?prod" unfolding prod_measure_space_def by simp - next - fix x assume x: "x \ X ` space M \ Y ` space M" - then obtain d e where x_Pair: "x = (d, e)" - and d: "d \ X ` space M" - and e: "e \ Y ` space M" by auto - - { fix x assume m_0: "measure ?P {x} = 0" - have "distribution (\x. (X x, Y x)) {x} = 0" - proof (cases x) - case (Pair a b) - hence "(\x. (X x, Y x)) -` {x} \ space M = (X -` {a} \ space M) \ (Y -` {b} \ space M)" - and x_prod: "{x} = {a} \ {b}" by auto + interpret X: finite_prob_space "?MX" using MX . + moreover interpret Y: finite_prob_space "?MY" using MY . + ultimately have ms_X: "measure_space ?MX" + and ms_Y: "measure_space ?MY" by unfold_locales - let ?PROD = "(\x. (X x, Y x)) -` {x} \ space M" + have fms_P: "finite_measure_space ?P" + by (rule finite_measure_space_finite_prod_measure) fact+ + + have fms_P': "finite_measure_space ?P'" + using finite_product_measure_space[of "space MX" "space MY"] + X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space] + X.sets_eq_Pow Y.sets_eq_Pow + by (simp add: prod_measure_space_def) - show ?thesis - proof (cases "{a} \ X ` space M \ {b} \ Y ` space M") - case False - hence "?PROD = {}" - unfolding Pair by auto - thus ?thesis by (auto simp: distribution_def) - next - have [intro]: "prob ?PROD \ 0 \ prob ?PROD = 0" - using sets_eq_Pow by (auto intro!: positive real_le_antisym[of _ 0]) + { fix x assume "x \ space ?P" + hence x_in_MX: "{fst x} \ sets MX" using X.sets_eq_Pow + by (auto simp: prod_measure_space_def) + + assume "measure ?P {x} = 0" + with prod_measure_times[OF ms_X ms_Y, of "{fst x}" "{snd x}"] x_in_MX + have "distribution X {fst x} = 0 \ distribution Y {snd x} = 0" + by (simp add: prod_measure_space_def) + + hence "joint_distribution X Y {x} = 0" + by (cases x) (auto simp: distribution_order) } + note measure_0 = this - case True - with prod_measure_times[OF ms_X ms_Y, simplified, of "{a}" "{b}"] - have "prob (X -` {a} \ space M) = 0 \ prob (Y -` {b} \ space M) = 0" (is "?X_0 \ ?Y_0") using m_0 - by (simp add: prod_measure_space_def distribution_def Pair) - thus ?thesis - proof (rule disjE) - assume ?X_0 - have "prob ?PROD \ prob (X -` {a} \ space M)" - using sets_eq_Pow Pair by (auto intro!: measure_mono) - thus ?thesis using `?X_0` by (auto simp: distribution_def) - next - assume ?Y_0 - have "prob ?PROD \ prob (Y -` {b} \ space M)" - using sets_eq_Pow Pair by (auto intro!: measure_mono) - thus ?thesis using `?Y_0` by (auto simp: distribution_def) - qed - qed - qed } - note measure_zero_joint_distribution = this + show ?equality + unfolding Let_def mutual_information_def using fms_P fms_P' measure_0 MX MY + by (subst KL_divergence_eq_finite) + (simp_all add: prod_measure_space_def prod_measure_times_finite + finite_prob_space_eq setsum_cartesian_product') - show "log b (measure_space.RN_deriv ?P (distribution (\x. (X x, Y x))) x) * distribution (\x. (X x, Y x)) {x} = ?log x" - apply (cases "distribution (\x. (X x, Y x)) {x} \ 0") - apply (subst P.RN_deriv_finite_singleton) - proof (simp_all add: x_Pair) - from `finite_measure_space ?P'` show "measure_space ?P'" by (simp add: finite_measure_space_def) - next - fix x assume m_0: "measure ?P {x} = 0" thus "distribution (\x. (X x, Y x)) {x} = 0" by fact - next - show "(d,e) \ space ?P" unfolding prod_measure_space_def using x x_Pair by simp - next - assume jd_0: "distribution (\x. (X x, Y x)) {(d, e)} \ 0" - show "measure ?P {(d,e)} \ 0" - proof - assume "measure ?P {(d,e)} = 0" - from measure_zero_joint_distribution[OF this] jd_0 - show False by simp - qed - next - assume jd_0: "distribution (\x. (X x, Y x)) {(d, e)} \ 0" - with prod_measure_times[OF ms_X ms_Y, simplified, of "{d}" "{e}"] d - show "log b (distribution (\x. (X x, Y x)) {(d, e)} / measure ?P {(d, e)}) = - log b (distribution (\x. (X x, Y x)) {(d, e)} / (distribution X {d} * distribution Y {e}))" - by (auto intro!: arg_cong[where f="log b"] simp: prod_measure_space_def) - qed + show ?positive + unfolding Let_def mutual_information_def using measure_0 b_gt_1 + proof (safe intro!: KL_divergence_positive_finite, simp_all) + from ms_X ms_Y X.top Y.top X.prob_space Y.prob_space + have "measure ?P (space ?P) = 1" + by (simp add: prod_measure_space_def, subst prod_measure_times, simp_all) + with fms_P show "finite_prob_space ?P" + by (simp add: finite_prob_space_eq) + + from ms_X ms_Y X.top Y.top X.prob_space Y.prob_space Y.not_empty X_space Y_space + have "measure ?P' (space ?P') = 1" unfolding prob_space[symmetric] + by (auto simp add: prod_measure_space_def distribution_def vimage_Times comp_def + intro!: arg_cong[where f=prob]) + with fms_P' show "finite_prob_space ?P'" + by (simp add: finite_prob_space_eq) qed qed -lemma (in finite_prob_space) distribution_log_split: - assumes "1 < b" - shows - "distribution (\x. (X x, Z x)) {(X x, z)} * log b (distribution (\x. (X x, Z x)) {(X x, z)} / - (distribution X {X x} * distribution Z {z})) = - distribution (\x. (X x, Z x)) {(X x, z)} * log b (distribution (\x. (X x, Z x)) {(X x, z)} / - distribution Z {z}) - - distribution (\x. (X x, Z x)) {(X x, z)} * log b (distribution X {X x})" - (is "?lhs = ?rhs") -proof (cases "distribution (\x. (X x, Z x)) {(X x, z)} = 0") - case True thus ?thesis by simp -next - case False - - let ?dZ = "distribution Z" - let ?dX = "distribution X" - let ?dXZ = "distribution (\x. (X x, Z x))" - - have dist_nneg: "\x X. 0 \ distribution X x" - unfolding distribution_def using sets_eq_Pow by (auto intro: positive) - - have "?lhs = ?dXZ {(X x, z)} * (log b (?dXZ {(X x, z)} / ?dZ {z}) - log b (?dX {X x}))" - proof - - have pos_dXZ: "0 < ?dXZ {(X x, z)}" - using False dist_nneg[of "\x. (X x, Z x)" "{(X x, z)}"] by auto - moreover - have "((\x. (X x, Z x)) -` {(X x, z)}) \ space M \ (X -` {X x}) \ space M" by auto - hence "?dXZ {(X x, z)} \ ?dX {X x}" - unfolding distribution_def - by (rule measure_mono) (simp_all add: sets_eq_Pow) - with pos_dXZ have "0 < ?dX {X x}" by (rule less_le_trans) - moreover - have "((\x. (X x, Z x)) -` {(X x, z)}) \ space M \ (Z -` {z}) \ space M" by auto - hence "?dXZ {(X x, z)} \ ?dZ {z}" - unfolding distribution_def - by (rule measure_mono) (simp_all add: sets_eq_Pow) - with pos_dXZ have "0 < ?dZ {z}" by (rule less_le_trans) - moreover have "0 < b" by (rule less_trans[OF _ `1 < b`]) simp - moreover have "b \ 1" by (rule ccontr) (insert `1 < b`, simp) - ultimately show ?thesis - using pos_dXZ - apply (subst (2) mult_commute) - apply (subst divide_divide_eq_left[symmetric]) - apply (subst log_divide) - by (auto intro: divide_pos_pos) - qed - also have "... = ?rhs" - by (simp add: field_simps) - finally show ?thesis . -qed - -lemma (in finite_prob_space) finite_mutual_information_reduce_prod: - "mutual_information b - \ space = X ` space M, sets = Pow (X ` space M) \ - \ space = Y ` space M \ Z ` space M, sets = Pow (Y ` space M \ Z ` space M) \ - X (\x. (Y x,Z x)) = - (\ (x, y, z) \ X`space M \ Y`space M \ Z`space M. - distribution (\x. (X x, Y x,Z x)) {(x, y, z)} * - log b (distribution (\x. (X x, Y x,Z x)) {(x, y, z)} / - (distribution X {x} * distribution (\x. (Y x,Z x)) {(y,z)})))" (is "_ = setsum ?log ?space") - unfolding Let_def mutual_information_def KL_divergence_def using finite_space -proof (subst finite_measure_space.integral_finite_singleton, - simp_all add: prod_measure_space_def sigma_prod_sets_finite joint_distribution) - let ?sets = "Pow (X ` space M \ Y ` space M \ Z ` space M)" - and ?measure = "distribution (\x. (X x, Y x, Z x))" - let ?P = "\ space = ?space, sets = ?sets, measure = ?measure\" - - show "finite_measure_space ?P" - proof (rule finite_Pow_additivity_sufficient, simp_all) - show "finite ?space" using finite_space by auto - - show "positive ?P ?measure" - using sets_eq_Pow by (auto simp: positive_def distribution_def intro!: positive) - - show "additive ?P ?measure" - proof (simp add: additive_def distribution_def, safe) - fix x y assume "x \ ?space" and "y \ ?space" - assume "x \ y = {}" - thus "prob (((\x. (X x, Y x, Z x)) -` x \ (\x. (X x, Y x, Z x)) -` y) \ space M) = - prob ((\x. (X x, Y x, Z x)) -` x \ space M) + prob ((\x. (X x, Y x, Z x)) -` y \ space M)" - apply (subst measure_additive[unfolded sets_eq_Pow, simplified]) - by (auto simp: Un_Int_distrib Un_Int_distrib2 intro!: arg_cong[where f=prob]) - qed - qed +lemma (in finite_information_space) mutual_information_eq: + "\(X;Y) = (\ (x,y) \ X ` space M \ Y ` space M. + distribution (\x. (X x, Y x)) {(x,y)} * log b (distribution (\x. (X x, Y x)) {(x,y)} / + (distribution X {x} * distribution Y {y})))" + by (subst mutual_information_eq_generic) (simp_all add: finite_prob_space_of_images) - let ?X = "\space = X ` space M, sets = Pow (X ` space M), measure = distribution X\" - and ?YZ = "\space = Y ` space M \ Z ` space M, sets = Pow (Y ` space M \ Z ` space M), measure = distribution (\x. (Y x, Z x))\" - let ?u = "prod_measure ?X ?YZ" - - from finite_measure_space[of X] finite_measure_space_image_prod[of Y Z] - have ms_X: "measure_space ?X" and ms_YZ: "measure_space ?YZ" - by (simp_all add: finite_measure_space_def) - - show "(\x \ ?space. log b (measure_space.RN_deriv \space=?space, sets=?sets, measure=?u\ - (distribution (\x. (X x, Y x, Z x))) x) * distribution (\x. (X x, Y x, Z x)) {x}) - = setsum ?log ?space" - proof (rule setsum_cong) - fix x assume x: "x \ ?space" - then obtain d e f where x_Pair: "x = (d, e, f)" - and d: "d \ X ` space M" - and e: "e \ Y ` space M" - and f: "f \ Z ` space M" by auto - - { fix x assume m_0: "?u {x} = 0" - - let ?PROD = "(\x. (X x, Y x, Z x)) -` {x} \ space M" - obtain a b c where Pair: "x = (a, b, c)" by (cases x) - hence "?PROD = (X -` {a} \ space M) \ ((\x. (Y x, Z x)) -` {(b, c)} \ space M)" - and x_prod: "{x} = {a} \ {(b, c)}" by auto - - have "distribution (\x. (X x, Y x, Z x)) {x} = 0" - proof (cases "{a} \ X ` space M") - case False - hence "?PROD = {}" - unfolding Pair by auto - thus ?thesis by (auto simp: distribution_def) - next - have [intro]: "prob ?PROD \ 0 \ prob ?PROD = 0" - using sets_eq_Pow by (auto intro!: positive real_le_antisym[of _ 0]) - - case True - with prod_measure_times[OF ms_X ms_YZ, simplified, of "{a}" "{(b,c)}"] - have "prob (X -` {a} \ space M) = 0 \ prob ((\x. (Y x, Z x)) -` {(b, c)} \ space M) = 0" - (is "prob ?X = 0 \ prob ?Y = 0") using m_0 - by (simp add: prod_measure_space_def distribution_def Pair) - thus ?thesis - proof (rule disjE) - assume "prob ?X = 0" - have "prob ?PROD \ prob ?X" - using sets_eq_Pow Pair by (auto intro!: measure_mono) - thus ?thesis using `prob ?X = 0` by (auto simp: distribution_def) - next - assume "prob ?Y = 0" - have "prob ?PROD \ prob ?Y" - using sets_eq_Pow Pair by (auto intro!: measure_mono) - thus ?thesis using `prob ?Y = 0` by (auto simp: distribution_def) - qed - qed } - note measure_zero_joint_distribution = this - - from x_Pair d e f finite_space - show "log b (measure_space.RN_deriv \space=?space, sets=?sets, measure=?u\ - (distribution (\x. (X x, Y x, Z x))) x) * distribution (\x. (X x, Y x, Z x)) {x} = ?log x" - apply (cases "distribution (\x. (X x, Y x, Z x)) {x} \ 0") - apply (subst finite_measure_space.RN_deriv_finite_singleton) - proof simp_all - show "measure_space ?P" using `finite_measure_space ?P` by (simp add: finite_measure_space_def) - - from finite_measure_space_finite_prod_measure[OF finite_measure_space[of X] - finite_measure_space_image_prod[of Y Z]] finite_space - show "finite_measure_space \space=?space, sets=?sets, measure=?u\" - by (simp add: prod_measure_space_def sigma_prod_sets_finite) - next - fix x assume "?u {x} = 0" thus "distribution (\x. (X x, Y x, Z x)) {x} = 0" by fact - next - assume jd_0: "distribution (\x. (X x, Y x, Z x)) {(d, e, f)} \ 0" - show "?u {(d,e,f)} \ 0" - proof - assume "?u {(d, e, f)} = 0" - from measure_zero_joint_distribution[OF this] jd_0 - show False by simp - qed - next - assume jd_0: "distribution (\x. (X x, Y x, Z x)) {(d, e, f)} \ 0" - with prod_measure_times[OF ms_X ms_YZ, simplified, of "{d}" "{(e,f)}"] d - show "log b (distribution (\x. (X x, Y x, Z x)) {(d, e, f)} / ?u {(d, e, f)}) = - log b (distribution (\x. (X x, Y x, Z x)) {(d, e, f)} / (distribution X {d} * distribution (\x. (Y x, Z x)) {(e,f)}))" - by (auto intro!: arg_cong[where f="log b"] simp: prod_measure_space_def) - qed - qed simp -qed +lemma (in finite_information_space) mutual_information_positive: "0 \ \(X;Y)" + by (subst mutual_information_positive_generic) (simp_all add: finite_prob_space_of_images) definition (in prob_space) "entropy b s X = mutual_information b s s X X" -abbreviation (in finite_prob_space) - finite_entropy ("\\<^bsub>_\<^esub>'(_')") where - "\\<^bsub>b\<^esub>(X) \ entropy b \ space = X`space M, sets = Pow (X`space M) \ X" - -abbreviation (in finite_prob_space) - finite_entropy_2 ("\'(_')") where - "\(X) \ \\<^bsub>2\<^esub>(X)" +abbreviation (in finite_information_space) + finite_entropy ("\'(_')") where + "\(X) \ entropy b \ space = X`space M, sets = Pow (X`space M) \ X" -lemma (in finite_prob_space) finite_entropy_reduce: - assumes "1 < b" - shows "\\<^bsub>b\<^esub>(X) = -(\ x \ X ` space M. distribution X {x} * log b (distribution X {x}))" +lemma (in finite_information_space) joint_distribution_remove[simp]: + "joint_distribution X X {(x, x)} = distribution X {x}" + unfolding distribution_def by (auto intro!: arg_cong[where f=prob]) + +lemma (in finite_information_space) entropy_eq: + "\(X) = -(\ x \ X ` space M. distribution X {x} * log b (distribution X {x}))" proof - - have fin: "finite (X ` space M)" using finite_space by simp - - have If_mult_distr: "\A B C D. If A B C * D = If A (B * D) (C * D)" by auto - + { fix f { fix x y have "(\x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto - hence "distribution (\x. (X x, X x)) {(x,y)} = (if x = y then distribution X {x} else 0)" + hence "distribution (\x. (X x, X x)) {(x,y)} * f x y = (if x = y then distribution X {x} * f x y else 0)" unfolding distribution_def by auto } - moreover - have "\x. 0 \ distribution X x" - unfolding distribution_def using finite_space sets_eq_Pow by (auto intro: positive) - hence "\x. distribution X x \ 0 \ 0 < distribution X x" by (auto simp: le_less) - ultimately - show ?thesis using `1 < b` - by (auto intro!: setsum_cong - simp: log_inverse If_mult_distr setsum_cases[OF fin] inverse_eq_divide[symmetric] - entropy_def setsum_negf[symmetric] joint_distribution finite_mutual_information_reduce - setsum_cartesian_product[symmetric]) + hence "(\(x, y) \ X ` space M \ X ` space M. joint_distribution X X {(x, y)} * f x y) = + (\x \ X ` space M. distribution X {x} * f x x)" + unfolding setsum_cartesian_product' by (simp add: setsum_cases finite_space) } + note remove_cartesian_product = this + + show ?thesis + unfolding entropy_def mutual_information_eq setsum_negf[symmetric] remove_cartesian_product + by (auto intro!: setsum_cong) qed -lemma log_inj: assumes "1 < b" shows "inj_on (log b) {0 <..}" -proof (rule inj_onI, simp) - fix x y assume pos: "0 < x" "0 < y" and *: "log b x = log b y" - show "x = y" - proof (cases rule: linorder_cases) - assume "x < y" hence "log b x < log b y" - using log_less_cancel_iff[OF `1 < b`] pos by simp - thus ?thesis using * by simp - next - assume "y < x" hence "log b y < log b x" - using log_less_cancel_iff[OF `1 < b`] pos by simp - thus ?thesis using * by simp - qed simp -qed +lemma (in finite_information_space) entropy_positive: "0 \ \(X)" + unfolding entropy_def using mutual_information_positive . definition (in prob_space) "conditional_mutual_information b s1 s2 s3 X Y Z \ @@ -524,160 +415,181 @@ mutual_information b s1 prod_space X (\x. (Y x, Z x)) - mutual_information b s1 s3 X Z" -abbreviation (in finite_prob_space) - finite_conditional_mutual_information ("\\<^bsub>_\<^esub>'( _ ; _ | _ ')") where - "\\<^bsub>b\<^esub>(X ; Y | Z) \ conditional_mutual_information b +abbreviation (in finite_information_space) + finite_conditional_mutual_information ("\'( _ ; _ | _ ')") where + "\(X ; Y | Z) \ conditional_mutual_information b \ space = X`space M, sets = Pow (X`space M) \ \ space = Y`space M, sets = Pow (Y`space M) \ \ space = Z`space M, sets = Pow (Z`space M) \ X Y Z" -abbreviation (in finite_prob_space) - finite_conditional_mutual_information_2 ("\'( _ ; _ | _ ')") where - "\(X ; Y | Z) \ \\<^bsub>2\<^esub>(X ; Y | Z)" +lemma (in finite_information_space) setsum_distribution_gen: + assumes "Z -` {c} \ space M = (\x \ X`space M. Y -` {f x}) \ space M" + and "inj_on f (X`space M)" + shows "(\x \ X`space M. distribution Y {f x}) = distribution Z {c}" + unfolding distribution_def assms + using finite_space assms + by (subst measure_finitely_additive'') + (auto simp add: disjoint_family_on_def sets_eq_Pow inj_on_def + intro!: arg_cong[where f=prob]) + +lemma (in finite_information_space) setsum_distribution: + "(\x \ X`space M. joint_distribution X Y {(x, y)}) = distribution Y {y}" + "(\y \ Y`space M. joint_distribution X Y {(x, y)}) = distribution X {x}" + "(\x \ X`space M. joint_distribution X (\x. (Y x, Z x)) {(x, y, z)}) = joint_distribution Y Z {(y, z)}" + "(\y \ Y`space M. joint_distribution X (\x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Z {(x, z)}" + "(\z \ Z`space M. joint_distribution X (\x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Y {(x, y)}" + by (auto intro!: inj_onI setsum_distribution_gen) -lemma image_pair_eq_Sigma: - "(\x. (f x, g x)) ` A = Sigma (f ` A) (\x. g ` (f -` {x} \ A))" -proof (safe intro!: imageI vimageI, simp_all) - fix a b assume *: "a \ A" "b \ A" and eq: "f a = f b" - show "(f b, g a) \ (\x. (f x, g x)) ` A" unfolding eq[symmetric] - using * by auto +lemma (in finite_information_space) conditional_mutual_information_eq_sum: + "\(X ; Y | Z) = + (\(x, y, z)\X ` space M \ (\x. (Y x, Z x)) ` space M. + distribution (\x. (X x, Y x, Z x)) {(x, y, z)} * + log b (distribution (\x. (X x, Y x, Z x)) {(x, y, z)}/ + distribution (\x. (Y x, Z x)) {(y, z)})) - + (\(x, z)\X ` space M \ Z ` space M. + distribution (\x. (X x, Z x)) {(x,z)} * log b (distribution (\x. (X x, Z x)) {(x,z)} / distribution Z {z}))" + (is "_ = ?rhs") +proof - + have setsum_product: + "\f x. (\v\(\x. (Y x, Z x)) ` space M. joint_distribution X (\x. (Y x, Z x)) {(x,v)} * f v) + = (\v\Y ` space M \ Z ` space M. joint_distribution X (\x. (Y x, Z x)) {(x,v)} * f v)" + proof (safe intro!: setsum_mono_zero_cong_left imageI) + fix x y z f + assume *: "(Y y, Z z) \ (\x. (Y x, Z x)) ` space M" and "y \ space M" "z \ space M" + hence "(\x. (X x, Y x, Z x)) -` {(x, Y y, Z z)} \ space M = {}" + proof safe + fix x' assume x': "x' \ space M" and eq: "Y x' = Y y" "Z x' = Z z" + have "(Y y, Z z) \ (\x. (Y x, Z x)) ` space M" using eq[symmetric] x' by auto + thus "x' \ {}" using * by auto + qed + thus "joint_distribution X (\x. (Y x, Z x)) {(x, Y y, Z z)} * f (Y y) (Z z) = 0" + unfolding distribution_def by simp + qed (simp add: finite_space) + + thus ?thesis + unfolding conditional_mutual_information_def Let_def mutual_information_eq + apply (subst mutual_information_eq_generic) + by (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space + finite_prob_space_of_images finite_product_prob_space_of_images + setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf + setsum_left_distrib[symmetric] setsum_distribution + cong: setsum_cong) qed -lemma inj_on_swap: "inj_on (\(x,y). (y,x)) A" by (auto intro!: inj_onI) - -lemma (in finite_prob_space) finite_conditional_mutual_information_reduce: - assumes "1 < b" - shows "\\<^bsub>b\<^esub>(X ; Y | Z) = - - (\ (x, z) \ (X ` space M \ Z ` space M). - distribution (\x. (X x, Z x)) {(x,z)} * log b (distribution (\x. (X x, Z x)) {(x,z)} / distribution Z {z})) - + (\ (x, y, z) \ (X ` space M \ (\x. (Y x, Z x)) ` space M). +lemma (in finite_information_space) conditional_mutual_information_eq: + "\(X ; Y | Z) = (\(x, y, z) \ X ` space M \ Y ` space M \ Z ` space M. distribution (\x. (X x, Y x, Z x)) {(x, y, z)} * log b (distribution (\x. (X x, Y x, Z x)) {(x, y, z)}/ - distribution (\x. (Y x, Z x)) {(y, z)}))" (is "_ = ?rhs") -unfolding conditional_mutual_information_def Let_def using finite_space -apply (simp add: prod_measure_space_def sigma_prod_sets_finite) -apply (subst mutual_information_cong[of _ "\space = X ` space M, sets = Pow (X ` space M)\" - _ "\space = Y ` space M \ Z ` space M, sets = Pow (Y ` space M \ Z ` space M)\"], simp_all) -apply (subst finite_mutual_information_reduce_prod, simp_all) -apply (subst finite_mutual_information_reduce, simp_all) + (joint_distribution X Z {(x, z)} * joint_distribution Y Z {(y,z)} / distribution Z {z})))" + unfolding conditional_mutual_information_def Let_def mutual_information_eq + apply (subst mutual_information_eq_generic) + by (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space + finite_prob_space_of_images finite_product_prob_space_of_images + setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf + setsum_left_distrib[symmetric] setsum_distribution setsum_commute[where A="Y`space M"] + cong: setsum_cong) + +lemma (in finite_information_space) conditional_mutual_information_eq_mutual_information: + "\(X ; Y) = \(X ; Y | (\x. ()))" +proof - + have [simp]: "(\x. ()) ` space M = {()}" using not_empty by auto + + show ?thesis + unfolding conditional_mutual_information_eq mutual_information_eq + by (simp add: setsum_cartesian_product' distribution_remove_const) +qed + +lemma (in finite_information_space) conditional_mutual_information_positive: + "0 \ \(X ; Y | Z)" proof - let ?dXYZ = "distribution (\x. (X x, Y x, Z x))" - let ?dXZ = "distribution (\x. (X x, Z x))" - let ?dYZ = "distribution (\x. (Y x, Z x))" + let ?dXZ = "joint_distribution X Z" + let ?dYZ = "joint_distribution Y Z" let ?dX = "distribution X" - let ?dY = "distribution Y" let ?dZ = "distribution Z" + let ?M = "X ` space M \ Y ` space M \ Z ` space M" + + have split_beta: "\f. split f = (\x. f (fst x) (snd x))" by (simp add: expand_fun_eq) - have If_mult_distr: "\A B C D. If A B C * D = If A (B * D) (C * D)" by auto - { fix x y - have "(\x. (X x, Y x, Z x)) -` {(X x, y)} \ space M = - (if y \ (\x. (Y x, Z x)) ` space M then (\x. (X x, Y x, Z x)) -` {(X x, y)} \ space M else {})" by auto - hence "?dXYZ {(X x, y)} = (if y \ (\x. (Y x, Z x)) ` space M then ?dXYZ {(X x, y)} else 0)" - unfolding distribution_def by auto } - note split_measure = this - - have sets: "Y ` space M \ Z ` space M \ (\x. (Y x, Z x)) ` space M = (\x. (Y x, Z x)) ` space M" by auto - - have cong: "\A B C D. \ A = C ; B = D \ \ A + B = C + D" by auto - - { fix A f have "setsum f A = setsum (\(x, y). f (y, x)) ((\(x, y). (y, x)) ` A)" - using setsum_reindex[OF inj_on_swap, of "\(x, y). f (y, x)" A] by (simp add: split_twice) } - note setsum_reindex_swap = this - - { fix A B f assume *: "finite A" "\x\A. finite (B x)" - have "(\x\Sigma A B. f x) = (\x\A. setsum (\y. f (x, y)) (B x))" - unfolding setsum_Sigma[OF *] by simp } - note setsum_Sigma = this + have "- (\(x, y, z) \ ?M. ?dXYZ {(x, y, z)} * + log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}))) + \ log b (\(x, y, z) \ ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})" + unfolding split_beta + proof (rule log_setsum_divide) + show "?M \ {}" using not_empty by simp + show "1 < b" using b_gt_1 . - { fix x - have "(\z\Z ` space M. ?dXZ {(X x, z)}) = (\yz\(\x. (Y x, Z x)) ` space M. ?dXYZ {(X x, yz)})" - apply (subst setsum_reindex_swap) - apply (simp add: image_image distribution_def) - unfolding image_pair_eq_Sigma - apply (subst setsum_Sigma) - using finite_space apply simp_all - apply (rule setsum_cong[OF refl]) - apply (subst measure_finitely_additive'') - by (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=prob]) } + fix x assume "x \ ?M" + show "0 \ ?dXYZ {(fst x, fst (snd x), snd (snd x))}" using positive_distribution . + show "0 \ ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}" + by (auto intro!: mult_nonneg_nonneg positive_distribution simp: zero_le_divide_iff) - thus "(\(x, y, z)\X ` space M \ Y ` space M \ Z ` space M. - ?dXYZ {(x, y, z)} * log b (?dXYZ {(x, y, z)} / (?dX {x} * ?dYZ {(y, z)}))) - - (\(x, y)\X ` space M \ Z ` space M. - ?dXZ {(x, y)} * log b (?dXZ {(x, y)} / (?dX {x} * ?dZ {y}))) = - - (\ (x, z) \ (X ` space M \ Z ` space M). - ?dXZ {(x,z)} * log b (?dXZ {(x,z)} / ?dZ {z})) + - (\ (x, y, z) \ (X ` space M \ (\x. (Y x, Z x)) ` space M). - ?dXYZ {(x, y, z)} * log b (?dXYZ {(x, y, z)} / ?dYZ {(y, z)}))" - using finite_space - apply (auto simp: setsum_cartesian_product[symmetric] setsum_negf[symmetric] - setsum_addf[symmetric] diff_minus - intro!: setsum_cong[OF refl]) - apply (subst split_measure) - apply (simp add: If_mult_distr setsum_cases sets distribution_log_split[OF assms, of X]) - apply (subst add_commute) - by (simp add: setsum_subtractf setsum_negf field_simps setsum_right_distrib[symmetric] sets_eq_Pow) + assume *: "0 < ?dXYZ {(fst x, fst (snd x), snd (snd x))}" + thus "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}" + by (auto intro!: divide_pos_pos mult_pos_pos + intro: distribution_order(6) distribution_mono_gt_0) + qed (simp_all add: setsum_cartesian_product' sum_over_space_distrib setsum_distribution finite_space) + also have "(\(x, y, z) \ ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\z\Z`space M. ?dZ {z})" + apply (simp add: setsum_cartesian_product') + apply (subst setsum_commute) + apply (subst (2) setsum_commute) + by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] setsum_distribution + intro!: setsum_cong) + finally show ?thesis + unfolding conditional_mutual_information_eq sum_over_space_distrib by simp qed + definition (in prob_space) "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y" -abbreviation (in finite_prob_space) - finite_conditional_entropy ("\\<^bsub>_\<^esub>'(_ | _')") where - "\\<^bsub>b\<^esub>(X | Y) \ conditional_entropy b +abbreviation (in finite_information_space) + finite_conditional_entropy ("\'(_ | _')") where + "\(X | Y) \ conditional_entropy b \ space = X`space M, sets = Pow (X`space M) \ \ space = Y`space M, sets = Pow (Y`space M) \ X Y" -abbreviation (in finite_prob_space) - finite_conditional_entropy_2 ("\'(_ | _')") where - "\(X | Y) \ \\<^bsub>2\<^esub>(X | Y)" +lemma (in finite_information_space) conditional_entropy_positive: + "0 \ \(X | Y)" unfolding conditional_entropy_def using conditional_mutual_information_positive . -lemma (in finite_prob_space) finite_conditional_entropy_reduce: - assumes "1 < b" - shows "\\<^bsub>b\<^esub>(X | Z) = +lemma (in finite_information_space) conditional_entropy_eq: + "\(X | Z) = - (\(x, z)\X ` space M \ Z ` space M. joint_distribution X Z {(x, z)} * log b (joint_distribution X Z {(x, z)} / distribution Z {z}))" proof - have *: "\x y z. (\x. (X x, X x, Z x)) -` {(x, y, z)} = (if x = y then (\x. (X x, Z x)) -` {(x, z)} else {})" by auto show ?thesis - unfolding finite_conditional_mutual_information_reduce[OF assms] - conditional_entropy_def joint_distribution_def distribution_def * + unfolding conditional_mutual_information_eq_sum + conditional_entropy_def distribution_def * by (auto intro!: setsum_0') qed -lemma (in finite_prob_space) finite_mutual_information_eq_entropy_conditional_entropy: - assumes "1 < b" shows "\\<^bsub>b\<^esub>(X ; Z) = \\<^bsub>b\<^esub>(X) - \\<^bsub>b\<^esub>(X | Z)" (is "mutual_information b ?X ?Z X Z = _") - unfolding finite_mutual_information_reduce - finite_entropy_reduce[OF assms] - finite_conditional_entropy_reduce[OF assms] - joint_distribution diff_minus_eq_add +lemma (in finite_information_space) mutual_information_eq_entropy_conditional_entropy: + "\(X ; Z) = \(X) - \(X | Z)" + unfolding mutual_information_eq entropy_eq conditional_entropy_eq using finite_space - apply (auto simp add: setsum_addf[symmetric] setsum_subtractf - setsum_Sigma[symmetric] distribution_log_split[OF assms] setsum_negf[symmetric] - intro!: setsum_cong[OF refl]) - apply (simp add: setsum_negf setsum_left_distrib[symmetric]) -proof (rule disjI2) - let ?dX = "distribution X" - and ?dXZ = "distribution (\x. (X x, Z x))" + by (auto simp add: setsum_addf setsum_subtractf setsum_cartesian_product' + setsum_left_distrib[symmetric] setsum_addf setsum_distribution) - fix x assume "x \ space M" - have "\z. (\x. (X x, Z x)) -` {(X x, z)} \ space M = (X -` {X x} \ space M) \ (Z -` {z} \ space M)" by auto - thus "(\z\Z ` space M. distribution (\x. (X x, Z x)) {(X x, z)}) = distribution X {X x}" - unfolding distribution_def - apply (subst prob_real_sum_image_fn[where e="X -` {X x} \ space M" and s = "Z`space M" and f="\z. Z -` {z} \ space M"]) - using finite_space sets_eq_Pow by auto +lemma (in finite_information_space) conditional_entropy_less_eq_entropy: + "\(X | Z) \ \(X)" +proof - + have "\(X ; Z) = \(X) - \(X | Z)" using mutual_information_eq_entropy_conditional_entropy . + with mutual_information_positive[of X Z] entropy_positive[of X] + show ?thesis by auto qed (* -------------Entropy of a RV with a certain event is zero---------------- *) -lemma (in finite_prob_space) finite_entropy_certainty_eq_0: - assumes "x \ X ` space M" and "distribution X {x} = 1" and "b > 1" - shows "\\<^bsub>b\<^esub>(X) = 0" +lemma (in finite_information_space) finite_entropy_certainty_eq_0: + assumes "x \ X ` space M" and "distribution X {x} = 1" + shows "\(X) = 0" proof - interpret X: finite_prob_space "\ space = X ` space M, sets = Pow (X ` space M), - measure = distribution X\" by (rule finite_prob_space) + measure = distribution X\" by (rule finite_prob_space_of_images) have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}" using X.measure_compl[of "{x}"] assms by auto @@ -694,34 +606,18 @@ have y: "\y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp - show ?thesis - unfolding finite_entropy_reduce[OF `b > 1`] by (auto simp: y fi) + show ?thesis unfolding entropy_eq by (auto simp: y fi) qed (* --------------- upper bound on entropy for a rv ------------------------- *) -lemma log_setsum: - assumes "finite s" "s \ {}" - assumes "b > 1" - assumes "(\ i \ s. a i) = 1" - assumes "\ i. i \ s \ a i \ 0" - assumes "\ i. i \ s \ y i \ {0 <..}" - shows "log b (\ i \ s. a i * y i) \ (\ i \ s. a i * log b (y i))" -proof - - have "convex_on {0 <..} (\ x. - log b x)" - by (rule minus_log_convex[OF `b > 1`]) - hence "- log b (\ i \ s. a i * y i) \ (\ i \ s. a i * - log b (y i))" - using convex_on_setsum[of _ _ "\ x. - log b x"] assms pos_is_convex by fastsimp - thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le) -qed - -lemma (in finite_prob_space) finite_entropy_le_card: - assumes "1 < b" - shows "\\<^bsub>b\<^esub>(X) \ log b (real (card (X ` space M \ {x . distribution X {x} \ 0})))" +lemma (in finite_information_space) finite_entropy_le_card: + "\(X) \ log b (real (card (X ` space M \ {x . distribution X {x} \ 0})))" proof - interpret X: finite_prob_space "\space = X ` space M, sets = Pow (X ` space M), measure = distribution X\" - using finite_prob_space by auto + using finite_prob_space_of_images by auto + have triv: "\ x. (if distribution X {x} \ 0 then distribution X {x} else 0) = distribution X {x}" by auto hence sum1: "(\ x \ X ` space M \ {y. distribution X {y} \ 0}. distribution X {x}) = 1" @@ -753,7 +649,7 @@ also have "\ = (if distribution X {x} \ 0 then distribution X {x} * log b (inverse (distribution X {x})) else 0)" - using log_inverse `1 < b` X.positive[of "{x}"] asm by auto + using log_inverse b_gt_1 X.positive[of "{x}"] asm by auto finally have "- distribution X {x} * log b (distribution X {x}) = (if distribution X {x} \ 0 then distribution X {x} * log b (inverse (distribution X {x})) @@ -769,7 +665,7 @@ unfolding setsum_restrict_set[OF finite_imageI[OF finite_space, of X]] by auto also have "\ \ log b (\ x \ X ` space M \ {y. distribution X {y} \ 0}. distribution X {x} * (inverse (distribution X {x})))" - apply (subst log_setsum[OF _ _ `b > 1` sum1, + apply (subst log_setsum[OF _ _ b_gt_1 sum1, unfolded greaterThan_iff, OF _ _ _]) using pos sets_eq_Pow X.finite_space assms X.positive not_empty by auto also have "\ = log b (\ x \ X ` space M \ {y. distribution X {y} \ 0}. 1)" @@ -778,7 +674,7 @@ by auto finally have "- (\x\X ` space M. distribution X {x} * log b (distribution X {x})) \ log b (real_of_nat (card (X ` space M \ {y. distribution X {y} \ 0})))" by simp - thus ?thesis unfolding finite_entropy_reduce[OF assms] real_eq_of_nat by auto + thus ?thesis unfolding entropy_eq real_eq_of_nat by auto qed (* --------------- entropy is maximal for a uniform rv --------------------- *) @@ -808,34 +704,31 @@ by (auto simp:field_simps) qed -lemma (in finite_prob_space) finite_entropy_uniform_max: - assumes "b > 1" +lemma (in finite_information_space) finite_entropy_uniform_max: assumes "\x y. \ x \ X ` space M ; y \ X ` space M \ \ distribution X {x} = distribution X {y}" - shows "\\<^bsub>b\<^esub>(X) = log b (real (card (X ` space M)))" + shows "\(X) = log b (real (card (X ` space M)))" proof - interpret X: finite_prob_space "\space = X ` space M, sets = Pow (X ` space M), measure = distribution X\" - using finite_prob_space by auto + using finite_prob_space_of_images by auto + { fix x assume xasm: "x \ X ` space M" hence card_gt0: "real (card (X ` space M)) > 0" using card_gt_0_iff X.finite_space by auto from xasm have "\ y. y \ X ` space M \ distribution X {y} = distribution X {x}" using assms by blast hence "- (\x\X ` space M. distribution X {x} * log b (distribution X {x})) - = - (\ y \ X ` space M. distribution X {x} * log b (distribution X {x}))" - by auto - also have "\ = - real_of_nat (card (X ` space M)) * distribution X {x} * log b (distribution X {x})" - by auto + = - real (card (X ` space M)) * distribution X {x} * log b (distribution X {x})" + unfolding real_eq_of_nat by auto also have "\ = - real (card (X ` space M)) * (1 / real (card (X ` space M))) * log b (1 / real (card (X ` space M)))" - unfolding real_eq_of_nat[symmetric] - by (auto simp: X.uniform_prob[simplified, OF xasm assms(2)]) + by (auto simp: X.uniform_prob[simplified, OF xasm assms]) also have "\ = log b (real (card (X ` space M)))" unfolding inverse_eq_divide[symmetric] - using card_gt0 log_inverse `b > 1` + using card_gt0 log_inverse b_gt_1 by (auto simp add:field_simps card_gt0) finally have ?thesis - unfolding finite_entropy_reduce[OF `b > 1`] by auto } + unfolding entropy_eq by auto } moreover { assume "X ` space M = {}" hence "distribution X (X ` space M) = 0" @@ -844,4 +737,199 @@ ultimately show ?thesis by auto qed +definition "subvimage A f g \ (\x \ A. f -` {f x} \ A \ g -` {g x} \ A)" + +lemma subvimageI: + assumes "\x y. \ x \ A ; y \ A ; f x = f y \ \ g x = g y" + shows "subvimage A f g" + using assms unfolding subvimage_def by blast + +lemma subvimageE[consumes 1]: + assumes "subvimage A f g" + obtains "\x y. \ x \ A ; y \ A ; f x = f y \ \ g x = g y" + using assms unfolding subvimage_def by blast + +lemma subvimageD: + "\ subvimage A f g ; x \ A ; y \ A ; f x = f y \ \ g x = g y" + using assms unfolding subvimage_def by blast + +lemma subvimage_subset: + "\ subvimage B f g ; A \ B \ \ subvimage A f g" + unfolding subvimage_def by auto + +lemma subvimage_idem[intro]: "subvimage A g g" + by (safe intro!: subvimageI) + +lemma subvimage_comp_finer[intro]: + assumes svi: "subvimage A g h" + shows "subvimage A g (f \ h)" +proof (rule subvimageI, simp) + fix x y assume "x \ A" "y \ A" "g x = g y" + from svi[THEN subvimageD, OF this] + show "f (h x) = f (h y)" by simp +qed + +lemma subvimage_comp_gran: + assumes svi: "subvimage A g h" + assumes inj: "inj_on f (g ` A)" + shows "subvimage A (f \ g) h" + by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj]) + +lemma subvimage_comp: + assumes svi: "subvimage (f ` A) g h" + shows "subvimage A (g \ f) (h \ f)" + by (rule subvimageI) (auto intro!: svi[THEN subvimageD]) + +lemma subvimage_trans: + assumes fg: "subvimage A f g" + assumes gh: "subvimage A g h" + shows "subvimage A f h" + by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD]) + +lemma subvimage_translator: + assumes svi: "subvimage A f g" + shows "\h. \x \ A. h (f x) = g x" +proof (safe intro!: exI[of _ "\x. (THE z. z \ (g ` (f -` {x} \ A)))"]) + fix x assume "x \ A" + show "(THE x'. x' \ (g ` (f -` {f x} \ A))) = g x" + by (rule theI2[of _ "g x"]) + (insert `x \ A`, auto intro!: svi[THEN subvimageD]) +qed + +lemma subvimage_translator_image: + assumes svi: "subvimage A f g" + shows "\h. h ` f ` A = g ` A" +proof - + from subvimage_translator[OF svi] + obtain h where "\x. x \ A \ h (f x) = g x" by auto + thus ?thesis + by (auto intro!: exI[of _ h] + simp: image_compose[symmetric] comp_def cong: image_cong) +qed + +lemma subvimage_finite: + assumes svi: "subvimage A f g" and fin: "finite (f`A)" + shows "finite (g`A)" +proof - + from subvimage_translator_image[OF svi] + obtain h where "g`A = h`f`A" by fastsimp + with fin show "finite (g`A)" by simp +qed + +lemma subvimage_disj: + assumes svi: "subvimage A f g" + shows "f -` {x} \ A \ g -` {y} \ A \ + f -` {x} \ g -` {y} \ A = {}" (is "?sub \ ?dist") +proof (rule disjCI) + assume "\ ?dist" + then obtain z where "z \ A" and "x = f z" and "y = g z" by auto + thus "?sub" using svi unfolding subvimage_def by auto +qed + +lemma setsum_image_split: + assumes svi: "subvimage A f g" and fin: "finite (f ` A)" + shows "(\x\f`A. h x) = (\y\g`A. \x\f`(g -` {y} \ A). h x)" + (is "?lhs = ?rhs") +proof - + have "f ` A = + snd ` (SIGMA x : g ` A. f ` (g -` {x} \ A))" + (is "_ = snd ` ?SIGMA") + unfolding image_split_eq_Sigma[symmetric] + by (simp add: image_compose[symmetric] comp_def) + moreover + have snd_inj: "inj_on snd ?SIGMA" + unfolding image_split_eq_Sigma[symmetric] + by (auto intro!: inj_onI subvimageD[OF svi]) + ultimately + have "(\x\f`A. h x) = (\(x,y)\?SIGMA. h y)" + by (auto simp: setsum_reindex intro: setsum_cong) + also have "... = ?rhs" + using subvimage_finite[OF svi fin] fin + apply (subst setsum_Sigma[symmetric]) + by (auto intro!: finite_subset[of _ "f`A"]) + finally show ?thesis . +qed + +lemma (in finite_information_space) entropy_partition: + assumes svi: "subvimage (space M) X P" + shows "\(X) = \(P) + \(X|P)" +proof - + have "(\x\X ` space M. distribution X {x} * log b (distribution X {x})) = + (\y\P `space M. \x\X ` space M. + joint_distribution X P {(x, y)} * log b (joint_distribution X P {(x, y)}))" + proof (subst setsum_image_split[OF svi], + safe intro!: finite_imageI finite_space setsum_mono_zero_cong_left imageI) + fix p x assume in_space: "p \ space M" "x \ space M" + assume "joint_distribution X P {(X x, P p)} * log b (joint_distribution X P {(X x, P p)}) \ 0" + hence "(\x. (X x, P x)) -` {(X x, P p)} \ space M \ {}" by (auto simp: distribution_def) + with svi[unfolded subvimage_def, rule_format, OF `x \ space M`] + show "x \ P -` {P p}" by auto + next + fix p x assume in_space: "p \ space M" "x \ space M" + assume "P x = P p" + from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \ space M`] + have "X -` {X x} \ space M \ P -` {P p} \ space M" + by auto + hence "(\x. (X x, P x)) -` {(X x, P p)} \ space M = X -` {X x} \ space M" + by auto + thus "distribution X {X x} * log b (distribution X {X x}) = + joint_distribution X P {(X x, P p)} * + log b (joint_distribution X P {(X x, P p)})" + by (auto simp: distribution_def) + qed + thus ?thesis + unfolding entropy_eq conditional_entropy_eq + by (simp add: setsum_cartesian_product' setsum_subtractf setsum_distribution + setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"]) +qed + +corollary (in finite_information_space) entropy_data_processing: + "\(f \ X) \ \(X)" + by (subst (2) entropy_partition[of _ "f \ X"]) (auto intro: conditional_entropy_positive) + +lemma (in prob_space) distribution_cong: + assumes "\x. x \ space M \ X x = Y x" + shows "distribution X = distribution Y" + unfolding distribution_def expand_fun_eq + using assms by (auto intro!: arg_cong[where f=prob]) + +lemma (in prob_space) joint_distribution_cong: + assumes "\x. x \ space M \ X x = X' x" + assumes "\x. x \ space M \ Y x = Y' x" + shows "joint_distribution X Y = joint_distribution X' Y'" + unfolding distribution_def expand_fun_eq + using assms by (auto intro!: arg_cong[where f=prob]) + +lemma image_cong: + "\ \x. x \ S \ X x = X' x \ \ X ` S = X' ` S" + by (auto intro!: image_eqI) + +lemma (in finite_information_space) mutual_information_cong: + assumes X: "\x. x \ space M \ X x = X' x" + assumes Y: "\x. x \ space M \ Y x = Y' x" + shows "\(X ; Y) = \(X' ; Y')" +proof - + have "X ` space M = X' ` space M" using X by (rule image_cong) + moreover have "Y ` space M = Y' ` space M" using Y by (rule image_cong) + ultimately show ?thesis + unfolding mutual_information_eq + using + assms[THEN distribution_cong] + joint_distribution_cong[OF assms] + by (auto intro!: setsum_cong) +qed + +corollary (in finite_information_space) entropy_of_inj: + assumes "inj_on f (X`space M)" + shows "\(f \ X) = \(X)" +proof (rule antisym) + show "\(f \ X) \ \(X)" using entropy_data_processing . +next + have "\(X) = \(the_inv_into (X`space M) f \ (f \ X))" + by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF assms]) + also have "... \ \(f \ X)" + using entropy_data_processing . + finally show "\(X) \ \(f \ X)" . +qed + end