rule to show that conditional mutual information is non-negative in the continuous case
authorhoelzl
Wed, 10 Oct 2012 12:12:25 +0200
changeset 49787 d8de705b48d4
parent 49786 f33d5f009627
child 49788 3c10763f5cb4
rule to show that conditional mutual information is non-negative in the continuous case
src/HOL/Probability/Information.thy
--- a/src/HOL/Probability/Information.thy	Wed Oct 10 12:12:25 2012 +0200
+++ b/src/HOL/Probability/Information.thy	Wed Oct 10 12:12:25 2012 +0200
@@ -22,104 +22,6 @@
   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
   unfolding setsum_cartesian_product by simp
 
-section "Convex theory"
-
-lemma log_setsum:
-  assumes "finite s" "s \<noteq> {}"
-  assumes "b > 1"
-  assumes "(\<Sum> i \<in> s. a i) = 1"
-  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
-  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
-  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
-proof -
-  have "convex_on {0 <..} (\<lambda> x. - log b x)"
-    by (rule minus_log_convex[OF `b > 1`])
-  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
-    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastforce
-  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
-qed
-
-lemma log_setsum':
-  assumes "finite s" "s \<noteq> {}"
-  assumes "b > 1"
-  assumes "(\<Sum> i \<in> s. a i) = 1"
-  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
-          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
-  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
-proof -
-  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
-    using assms by (auto intro!: setsum_mono_zero_cong_left)
-  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
-  proof (rule log_setsum)
-    have "setsum a (s - {i. a i = 0}) = setsum a s"
-      using assms(1) by (rule setsum_mono_zero_cong_left) auto
-    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
-      "finite (s - {i. a i = 0})" using assms by simp_all
-
-    show "s - {i. a i = 0} \<noteq> {}"
-    proof
-      assume *: "s - {i. a i = 0} = {}"
-      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
-      with sum_1 show False by simp
-    qed
-
-    fix i assume "i \<in> s - {i. a i = 0}"
-    hence "i \<in> s" "a i \<noteq> 0" by simp_all
-    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
-  qed fact+
-  ultimately show ?thesis by simp
-qed
-
-lemma log_setsum_divide:
-  assumes "finite S" and "S \<noteq> {}" and "1 < b"
-  assumes "(\<Sum>x\<in>S. g x) = 1"
-  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
-  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
-  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
-proof -
-  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
-    using `1 < b` by (subst log_le_cancel_iff) auto
-
-  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
-  proof (unfold setsum_negf[symmetric], rule setsum_cong)
-    fix x assume x: "x \<in> S"
-    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
-    proof (cases "g x = 0")
-      case False
-      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
-      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
-    qed simp
-  qed rule
-  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
-  proof (rule log_setsum')
-    fix x assume x: "x \<in> S" "0 < g x"
-    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
-  qed fact+
-  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
-    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
-        split: split_if_asm)
-  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
-  proof (rule log_mono)
-    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
-    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
-    proof (rule setsum_strict_mono)
-      show "finite (S - {x. g x = 0})" using `finite S` by simp
-      show "S - {x. g x = 0} \<noteq> {}"
-      proof
-        assume "S - {x. g x = 0} = {}"
-        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
-        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
-      qed
-      fix x assume "x \<in> S - {x. g x = 0}"
-      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
-    qed
-    finally show "0 < ?sum" .
-    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
-      using `finite S` pos by (auto intro!: setsum_mono2)
-  qed
-  finally show ?thesis .
-qed
-
 lemma split_pairs:
   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
@@ -831,39 +733,25 @@
   assumes X: "simple_distributed M X f"
   shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
 proof -
-  have "\<H>(X) = (\<Sum>x\<in>X`space M. f x * log b (1 / f x))"
-    unfolding entropy_simple_distributed[OF X] setsum_negf[symmetric]
-    using X by (auto dest: simple_distributed_nonneg intro!: setsum_cong simp: log_simps less_le)
-  also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. f x * (1 / f x))"
-    using not_empty b_gt_1 `simple_distributed M X f`
-    by (intro log_setsum') (auto simp: simple_distributed_nonneg simple_distributed_setsum_space)
-  also have "\<dots> = log b (\<Sum>x\<in>X`space M. if f x \<noteq> 0 then 1 else 0)"
-    by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) auto
-  finally show ?thesis
-    using `simple_distributed M X f` by (auto simp: setsum_cases real_eq_of_nat)
+  let ?X = "count_space (X`space M)"
+  have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
+    by (rule entropy_le[OF simple_distributed[OF X]])
+       (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
+  also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
+    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
+  finally show ?thesis .
 qed
 
 lemma (in information_space) entropy_le_card:
-  assumes "simple_distributed M X f"
+  assumes X: "simple_distributed M X f"
   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
-proof cases
-  assume "X ` space M \<inter> {x. f x \<noteq> 0} = {}"
-  then have "\<And>x. x\<in>X`space M \<Longrightarrow> f x = 0" by auto
-  moreover
-  have "0 < card (X`space M)"
-    using `simple_distributed M X f` not_empty by (auto simp: card_gt_0_iff)
-  then have "log b 1 \<le> log b (real (card (X`space M)))"
-    using b_gt_1 by (intro log_le) auto
-  ultimately show ?thesis using assms by (simp add: entropy_simple_distributed)
-next
-  assume False: "X ` space M \<inter> {x. f x \<noteq> 0} \<noteq> {}"
-  have "card (X ` space M \<inter> {x. f x \<noteq> 0}) \<le> card (X ` space M)"
-    (is "?A \<le> ?B") using assms not_empty
-    by (auto intro!: card_mono simp: simple_function_def simple_distributed_def)
-  note entropy_le_card_not_0[OF assms]
-  also have "log b (real ?A) \<le> log b (real ?B)"
-    using b_gt_1 False not_empty `?A \<le> ?B` assms
-    by (auto intro!: log_le simp: card_gt_0_iff simp: simple_distributed_def)
+proof -
+  let ?X = "count_space (X`space M)"
+  have "\<H>(X) \<le> log b (measure ?X (space ?X))"
+    by (rule entropy_le_space[OF simple_distributed[OF X]])
+       (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
+  also have "measure ?X (space ?X) = card (X ` space M)"
+    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
   finally show ?thesis .
 qed
 
@@ -879,7 +767,18 @@
   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
     (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
 
-lemma (in information_space) conditional_mutual_information_generic_eq:
+lemma (in pair_sigma_finite) borel_measurable_positive_integral_fst:
+  "(\<lambda>(x, y). f x y) \<in> borel_measurable (M1 \<Otimes>\<^isub>M M2) \<Longrightarrow> (\<lambda>x. \<integral>\<^isup>+ y. f x y \<partial>M2) \<in> borel_measurable M1"
+  using positive_integral_fst_measurable(1)[of "\<lambda>(x, y). f x y"] by simp
+
+lemma (in pair_sigma_finite) borel_measurable_positive_integral_snd:
+  assumes "(\<lambda>(x, y). f x y) \<in> borel_measurable (M2 \<Otimes>\<^isub>M M1)" shows "(\<lambda>x. \<integral>\<^isup>+ y. f x y \<partial>M1) \<in> borel_measurable M2"
+proof -
+  interpret Q: pair_sigma_finite M2 M1 by default
+  from Q.borel_measurable_positive_integral_fst assms show ?thesis by simp
+qed
+
+lemma (in information_space)
   assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
   assumes Px: "distributed M S X Px"
   assumes Pz: "distributed M P Z Pz"
@@ -888,16 +787,19 @@
   assumes Pxyz: "distributed M (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
   assumes I1: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
   assumes I2: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
-  shows "conditional_mutual_information b S T P X Y Z
-    = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))"
+  shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
+    = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" (is "?eq")
+    and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
 proof -
   interpret S: sigma_finite_measure S by fact
   interpret T: sigma_finite_measure T by fact
   interpret P: sigma_finite_measure P by fact
   interpret TP: pair_sigma_finite T P ..
   interpret SP: pair_sigma_finite S P ..
+  interpret ST: pair_sigma_finite S T ..
   interpret SPT: pair_sigma_finite "S \<Otimes>\<^isub>M P" T ..
   interpret STP: pair_sigma_finite S "T \<Otimes>\<^isub>M P" ..
+  interpret TPS: pair_sigma_finite "T \<Otimes>\<^isub>M P" S ..
   have TP: "sigma_finite_measure (T \<Otimes>\<^isub>M P)" ..
   have SP: "sigma_finite_measure (S \<Otimes>\<^isub>M P)" ..
   have YZ: "random_variable (T \<Otimes>\<^isub>M P) (\<lambda>x. (Y x, Z x))"
@@ -933,27 +835,27 @@
   finally have mi_eq:
     "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" .
   
-  have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
+  have ae1: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
     by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
+  moreover have ae2: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
     by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) (auto intro: measurable_snd')
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
+  moreover have ae3: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
     by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto intro: measurable_Pair measurable_snd')
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
+  moreover have ae4: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
     by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto intro: measurable_Pair)
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)"
+  moreover have ae5: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)"
     using Px by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable)
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)"
+  moreover have ae6: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)"
     using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))"
+  moreover have ae7: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))"
     using Pz Pz[THEN distributed_real_measurable] by (auto intro!: measurable_snd'' TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
-  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
+  moreover have ae8: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
     using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
     using measurable_comp[OF measurable_Pair[OF measurable_fst measurable_comp[OF measurable_snd measurable_snd]] Pxz[THEN distributed_real_measurable], of T]
     using measurable_comp[OF measurable_snd measurable_Pair2[OF Pxz[THEN distributed_real_measurable]], of _ T]
     by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def)
   moreover note Pxyz[THEN distributed_real_AE]
-  ultimately have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P.
+  ultimately have ae: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P.
     Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
     Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
     Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
@@ -968,13 +870,164 @@
         using b_gt_1 by (simp add: log_simps mult_pos_pos less_imp_le field_simps)
     qed simp
   qed
-  with I1 I2 show ?thesis
+  with I1 I2 show ?eq
     unfolding conditional_mutual_information_def
     apply (subst mi_eq)
     apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
     apply (subst integral_diff(2)[symmetric])
     apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
     done
+
+  let ?P = "density (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) Pxyz"
+  interpret P: prob_space ?P
+    unfolding distributed_distr_eq_density[OF Pxyz, symmetric]
+    using distributed_measurable[OF Pxyz] by (rule prob_space_distr)
+
+  let ?Q = "density (T \<Otimes>\<^isub>M P) Pyz"
+  interpret Q: prob_space ?Q
+    unfolding distributed_distr_eq_density[OF Pyz, symmetric]
+    using distributed_measurable[OF Pyz] by (rule prob_space_distr)
+
+  let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
+
+  from subdensity_real[of snd, OF _ Pyz Pz]
+  have aeX1: "AE x in T \<Otimes>\<^isub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
+  have aeX2: "AE x in T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd x)"
+    using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
+
+  have aeX3: "AE y in T \<Otimes>\<^isub>M P. (\<integral>\<^isup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
+    using Pz distributed_marginal_eq_joint[OF P S Px Pz Pxz]
+    apply (intro TP.AE_pair_measure)
+    apply (auto simp: comp_def measurable_split_conv
+                intro!: measurable_snd'' borel_measurable_ereal_eq borel_measurable_ereal
+                        SP.borel_measurable_positive_integral_snd measurable_compose[OF _ Pxz[THEN distributed_real_measurable]]
+                        measurable_Pair
+                dest: distributed_real_AE distributed_real_measurable)
+    done
+
+  note M = borel_measurable_divide borel_measurable_diff borel_measurable_times borel_measurable_ereal
+           measurable_compose[OF _ measurable_snd]
+           measurable_Pair
+           measurable_compose[OF _ Pxyz[THEN distributed_real_measurable]]
+           measurable_compose[OF _ Pxz[THEN distributed_real_measurable]]
+           measurable_compose[OF _ Pyz[THEN distributed_real_measurable]]
+           measurable_compose[OF _ Pz[THEN distributed_real_measurable]]
+           measurable_compose[OF _ Px[THEN distributed_real_measurable]]
+           STP.borel_measurable_positive_integral_snd
+  have "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^isup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))"
+    apply (subst positive_integral_density)
+    apply (rule distributed_borel_measurable[OF Pxyz])
+    apply (rule distributed_AE[OF Pxyz])
+    apply (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M) []
+    apply (rule positive_integral_mono_AE)
+    using ae5 ae6 ae7 ae8
+    apply eventually_elim
+    apply (auto intro!: divide_nonneg_nonneg mult_nonneg_nonneg)
+    done
+  also have "\<dots> = (\<integral>\<^isup>+(y, z). \<integral>\<^isup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^isub>M P)"
+    by (subst STP.positive_integral_snd_measurable[symmetric])
+       (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M)
+  also have "\<dots> = (\<integral>\<^isup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^isub>M P)"
+    apply (rule positive_integral_cong_AE)
+    using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
+    apply eventually_elim
+  proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
+    fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
+      "(\<integral>\<^isup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)" 
+    then show "(\<integral>\<^isup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
+      apply (subst positive_integral_multc)
+      apply (auto intro!: borel_measurable_ereal divide_nonneg_nonneg
+                          measurable_compose[OF _ Pxz[THEN distributed_real_measurable]] measurable_Pair
+                  split: prod.split)
+      done
+  qed
+  also have "\<dots> = 1"
+    using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
+    by (subst positive_integral_density[symmetric]) (auto intro!: M)
+  finally have le1: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<le> 1" .
+  also have "\<dots> < \<infinity>" by simp
+  finally have fin: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
+
+  have pos: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<noteq> 0"
+    apply (subst positive_integral_density)
+    apply (rule distributed_borel_measurable[OF Pxyz])
+    apply (rule distributed_AE[OF Pxyz])
+    apply (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M) []
+    apply (simp add: split_beta')
+  proof
+    let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
+    assume "(\<integral>\<^isup>+ x. ?g x \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P)) = 0"
+    then have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. ?g x \<le> 0"
+      by (intro positive_integral_0_iff_AE[THEN iffD1]) (auto intro!: M borel_measurable_ereal measurable_If)
+    then have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxyz x = 0"
+      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
+      by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
+    then have "(\<integral>\<^isup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) = 0"
+      by (subst positive_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
+    with P.emeasure_space_1 show False
+      by (subst (asm) emeasure_density) (auto intro!: M cong: positive_integral_cong)
+  qed
+
+  have neg: "(\<integral>\<^isup>+ x. - ?f x \<partial>?P) = 0"
+    apply (rule positive_integral_0_iff_AE[THEN iffD2])
+    apply (auto intro!: M simp: split_beta') []
+    apply (subst AE_density)
+    apply (auto intro!: M simp: split_beta') []
+    using ae5 ae6 ae7 ae8
+    apply eventually_elim
+    apply (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg)
+    done
+
+  have I3: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
+    apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integral_diff(1)[OF I1 I2]])
+    using ae
+    apply (auto intro!: M simp: split_beta')
+    done
+
+  have "- log b 1 \<le> - log b (integral\<^isup>L ?P ?f)"
+  proof (intro le_imp_neg_le log_le[OF b_gt_1])
+    show "0 < integral\<^isup>L ?P ?f"
+      using neg pos fin positive_integral_positive[of ?P ?f]
+      by (cases "(\<integral>\<^isup>+ x. ?f x \<partial>?P)") (auto simp add: lebesgue_integral_def less_le split_beta')
+    show "integral\<^isup>L ?P ?f \<le> 1"
+      using neg le1 fin positive_integral_positive[of ?P ?f]
+      by (cases "(\<integral>\<^isup>+ x. ?f x \<partial>?P)") (auto simp add: lebesgue_integral_def split_beta' one_ereal_def)
+  qed
+  also have "- log b (integral\<^isup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
+  proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
+    show "AE x in ?P. ?f x \<in> {0<..}"
+      unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
+      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
+      by eventually_elim (auto simp: divide_pos_pos mult_pos_pos)
+    show "integrable ?P ?f"
+      unfolding integrable_def 
+      using fin neg by (auto intro!: M simp: split_beta')
+    show "integrable ?P (\<lambda>x. - log b (?f x))"
+      apply (subst integral_density)
+      apply (auto intro!: M) []
+      apply (auto intro!: M distributed_real_AE[OF Pxyz]) []
+      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
+      apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
+      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
+      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
+      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
+      apply eventually_elim
+      apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
+      done
+  qed (auto simp: b_gt_1 minus_log_convex)
+  also have "\<dots> = conditional_mutual_information b S T P X Y Z"
+    unfolding `?eq`
+    apply (subst integral_density)
+    apply (auto intro!: M) []
+    apply (auto intro!: M distributed_real_AE[OF Pxyz]) []
+    apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
+    apply (intro integral_cong_AE)
+    using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
+    apply eventually_elim
+    apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
+    done
+  finally show ?nonneg
+    by simp
 qed
 
 lemma (in information_space) conditional_mutual_information_eq:
@@ -1031,86 +1084,21 @@
   assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
   shows "0 \<le> \<I>(X ; Y | Z)"
 proof -
-  def Pz \<equiv> "\<lambda>x. if x \<in> Z`space M then measure M (Z -` {x} \<inter> space M) else 0"
-  def Pxz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x))`space M then measure M ((\<lambda>x. (X x, Z x)) -` {x} \<inter> space M) else 0"
-  def Pyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x))`space M then measure M ((\<lambda>x. (Y x, Z x)) -` {x} \<inter> space M) else 0"
-  def Pxyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x))`space M then measure M ((\<lambda>x. (X x, Y x, Z x)) -` {x} \<inter> space M) else 0"
-  let ?M = "X`space M \<times> Y`space M \<times> Z`space M"
-
-  note XZ = simple_function_Pair[OF X Z]
-  note YZ = simple_function_Pair[OF Y Z]
-  note XYZ = simple_function_Pair[OF X simple_function_Pair[OF Y Z]]
-  have Pz: "simple_distributed M Z Pz"
-    using Z by (rule simple_distributedI) (auto simp: Pz_def)
-  have Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
-    using XZ by (rule simple_distributedI) (auto simp: Pxz_def)
-  have Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
-    using YZ by (rule simple_distributedI) (auto simp: Pyz_def)
-  have Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
-    using XYZ by (rule simple_distributedI) (auto simp: Pxyz_def)
-
-  { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>x\<in>X ` space M. Pxz (x, z)) = Pz z"
-      using distributed_marginal_eq_joint_simple[OF X Pz Pxz z]
-      by (auto intro!: setsum_cong simp: Pxz_def) }
-  note marginal1 = this
-
-  { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>y\<in>Y ` space M. Pyz (y, z)) = Pz z"
-      using distributed_marginal_eq_joint_simple[OF Y Pz Pyz z]
-      by (auto intro!: setsum_cong simp: Pyz_def) }
-  note marginal2 = this
-
-  have "- \<I>(X ; Y | Z) = - (\<Sum>(x, y, z) \<in> ?M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
-    unfolding conditional_mutual_information_eq[OF Pz Pyz Pxz Pxyz]
-    using X Y Z by (auto intro!: setsum_mono_zero_left simp: Pxyz_def simple_functionD)
-  also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z))"
-    unfolding split_beta'
-  proof (rule log_setsum_divide)
-    show "?M \<noteq> {}" using not_empty by simp
-    show "1 < b" using b_gt_1 .
-
-    show "finite ?M" using X Y Z by (auto simp: simple_functionD)
-
-    then show "(\<Sum>x\<in>?M. Pxyz (fst x, fst (snd x), snd (snd x))) = 1"
-      apply (subst Pxyz[THEN simple_distributed_setsum_space, symmetric])
-      apply simp
-      apply (intro setsum_mono_zero_right)
-      apply (auto simp: Pxyz_def)
-      done
-    let ?N = "(\<lambda>x. (X x, Y x, Z x)) ` space M"
-    fix x assume x: "x \<in> ?M"
-    let ?Q = "Pxyz (fst x, fst (snd x), snd (snd x))"
-    let ?P = "Pxz (fst x, snd (snd x)) * (Pyz (fst (snd x), snd (snd x)) / Pz (snd (snd x)))"
-    from x show "0 \<le> ?Q" "0 \<le> ?P"
-      using Pxyz[THEN simple_distributed, THEN distributed_real_AE]
-      using Pxz[THEN simple_distributed, THEN distributed_real_AE]
-      using Pyz[THEN simple_distributed, THEN distributed_real_AE]
-      using Pz[THEN simple_distributed, THEN distributed_real_AE]
-      by (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg simp: AE_count_space Pxyz_def Pxz_def Pyz_def Pz_def)
-    moreover assume "0 < ?Q"
-    moreover have "AE x in count_space ?N. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
-      by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz[THEN simple_distributed] Pz[THEN simple_distributed]]) (auto intro: measurable_snd')
-    then have "\<And>x. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
-      by (auto simp: Pz_def Pxyz_def AE_count_space)
-    moreover have "AE x in count_space ?N. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
-      by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz[THEN simple_distributed] Pxz[THEN simple_distributed]]) (auto intro: measurable_Pair measurable_snd')
-    then have "\<And>x. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
-      by (auto simp: Pz_def Pxyz_def AE_count_space)
-    moreover have "AE x in count_space ?N. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
-      by (intro subdensity_real[of snd, OF _ Pxyz[THEN simple_distributed] Pyz[THEN simple_distributed]]) (auto intro: measurable_Pair)
-    then have "\<And>x. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
-      by (auto simp: Pz_def Pxyz_def AE_count_space)
-    ultimately show "0 < ?P" using x by (auto intro!: divide_pos_pos mult_pos_pos simp: less_le)
-  qed
-  also have "(\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z)) = (\<Sum>z\<in>Z`space M. Pz z)"
-    apply (simp add: setsum_cartesian_product')
-    apply (subst setsum_commute)
-    apply (subst (2) setsum_commute)
-    apply (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] marginal1 marginal2
-          intro!: setsum_cong)
-    done
-  also have "log b (\<Sum>z\<in>Z`space M. Pz z) = 0"
-    using Pz[THEN simple_distributed_setsum_space] by simp
-  finally show ?thesis by simp
+  have [simp]: "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) \<Otimes>\<^isub>M count_space (Z ` space M) =
+      count_space (X`space M \<times> Y`space M \<times> Z`space M)"
+    by (simp add: pair_measure_count_space X Y Z simple_functionD)
+  note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
+  note sd = simple_distributedI[OF _ refl]
+  note sp = simple_function_Pair
+  show ?thesis
+   apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
+   apply (rule simple_distributed[OF sd[OF X]])
+   apply (rule simple_distributed[OF sd[OF Z]])
+   apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
+   apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
+   apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
+   apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
+   done
 qed
 
 subsection {* Conditional Entropy *}