rule to show that conditional mutual information is non-negative in the continuous case
authorhoelzl
Wed Oct 10 12:12:25 2012 +0200 (2012-10-10)
changeset 49787d8de705b48d4
parent 49786 f33d5f009627
child 49788 3c10763f5cb4
rule to show that conditional mutual information is non-negative in the continuous case
src/HOL/Probability/Information.thy
     1.1 --- a/src/HOL/Probability/Information.thy	Wed Oct 10 12:12:25 2012 +0200
     1.2 +++ b/src/HOL/Probability/Information.thy	Wed Oct 10 12:12:25 2012 +0200
     1.3 @@ -22,104 +22,6 @@
     1.4    "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
     1.5    unfolding setsum_cartesian_product by simp
     1.6  
     1.7 -section "Convex theory"
     1.8 -
     1.9 -lemma log_setsum:
    1.10 -  assumes "finite s" "s \<noteq> {}"
    1.11 -  assumes "b > 1"
    1.12 -  assumes "(\<Sum> i \<in> s. a i) = 1"
    1.13 -  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
    1.14 -  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
    1.15 -  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    1.16 -proof -
    1.17 -  have "convex_on {0 <..} (\<lambda> x. - log b x)"
    1.18 -    by (rule minus_log_convex[OF `b > 1`])
    1.19 -  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    1.20 -    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastforce
    1.21 -  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
    1.22 -qed
    1.23 -
    1.24 -lemma log_setsum':
    1.25 -  assumes "finite s" "s \<noteq> {}"
    1.26 -  assumes "b > 1"
    1.27 -  assumes "(\<Sum> i \<in> s. a i) = 1"
    1.28 -  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
    1.29 -          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
    1.30 -  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    1.31 -proof -
    1.32 -  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    1.33 -    using assms by (auto intro!: setsum_mono_zero_cong_left)
    1.34 -  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
    1.35 -  proof (rule log_setsum)
    1.36 -    have "setsum a (s - {i. a i = 0}) = setsum a s"
    1.37 -      using assms(1) by (rule setsum_mono_zero_cong_left) auto
    1.38 -    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
    1.39 -      "finite (s - {i. a i = 0})" using assms by simp_all
    1.40 -
    1.41 -    show "s - {i. a i = 0} \<noteq> {}"
    1.42 -    proof
    1.43 -      assume *: "s - {i. a i = 0} = {}"
    1.44 -      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
    1.45 -      with sum_1 show False by simp
    1.46 -    qed
    1.47 -
    1.48 -    fix i assume "i \<in> s - {i. a i = 0}"
    1.49 -    hence "i \<in> s" "a i \<noteq> 0" by simp_all
    1.50 -    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
    1.51 -  qed fact+
    1.52 -  ultimately show ?thesis by simp
    1.53 -qed
    1.54 -
    1.55 -lemma log_setsum_divide:
    1.56 -  assumes "finite S" and "S \<noteq> {}" and "1 < b"
    1.57 -  assumes "(\<Sum>x\<in>S. g x) = 1"
    1.58 -  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
    1.59 -  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
    1.60 -  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
    1.61 -proof -
    1.62 -  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
    1.63 -    using `1 < b` by (subst log_le_cancel_iff) auto
    1.64 -
    1.65 -  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
    1.66 -  proof (unfold setsum_negf[symmetric], rule setsum_cong)
    1.67 -    fix x assume x: "x \<in> S"
    1.68 -    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
    1.69 -    proof (cases "g x = 0")
    1.70 -      case False
    1.71 -      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
    1.72 -      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
    1.73 -    qed simp
    1.74 -  qed rule
    1.75 -  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
    1.76 -  proof (rule log_setsum')
    1.77 -    fix x assume x: "x \<in> S" "0 < g x"
    1.78 -    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
    1.79 -  qed fact+
    1.80 -  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
    1.81 -    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
    1.82 -        split: split_if_asm)
    1.83 -  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
    1.84 -  proof (rule log_mono)
    1.85 -    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
    1.86 -    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
    1.87 -    proof (rule setsum_strict_mono)
    1.88 -      show "finite (S - {x. g x = 0})" using `finite S` by simp
    1.89 -      show "S - {x. g x = 0} \<noteq> {}"
    1.90 -      proof
    1.91 -        assume "S - {x. g x = 0} = {}"
    1.92 -        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
    1.93 -        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
    1.94 -      qed
    1.95 -      fix x assume "x \<in> S - {x. g x = 0}"
    1.96 -      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
    1.97 -    qed
    1.98 -    finally show "0 < ?sum" .
    1.99 -    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
   1.100 -      using `finite S` pos by (auto intro!: setsum_mono2)
   1.101 -  qed
   1.102 -  finally show ?thesis .
   1.103 -qed
   1.104 -
   1.105  lemma split_pairs:
   1.106    "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   1.107    "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   1.108 @@ -831,39 +733,25 @@
   1.109    assumes X: "simple_distributed M X f"
   1.110    shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
   1.111  proof -
   1.112 -  have "\<H>(X) = (\<Sum>x\<in>X`space M. f x * log b (1 / f x))"
   1.113 -    unfolding entropy_simple_distributed[OF X] setsum_negf[symmetric]
   1.114 -    using X by (auto dest: simple_distributed_nonneg intro!: setsum_cong simp: log_simps less_le)
   1.115 -  also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. f x * (1 / f x))"
   1.116 -    using not_empty b_gt_1 `simple_distributed M X f`
   1.117 -    by (intro log_setsum') (auto simp: simple_distributed_nonneg simple_distributed_setsum_space)
   1.118 -  also have "\<dots> = log b (\<Sum>x\<in>X`space M. if f x \<noteq> 0 then 1 else 0)"
   1.119 -    by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) auto
   1.120 -  finally show ?thesis
   1.121 -    using `simple_distributed M X f` by (auto simp: setsum_cases real_eq_of_nat)
   1.122 +  let ?X = "count_space (X`space M)"
   1.123 +  have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
   1.124 +    by (rule entropy_le[OF simple_distributed[OF X]])
   1.125 +       (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
   1.126 +  also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
   1.127 +    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
   1.128 +  finally show ?thesis .
   1.129  qed
   1.130  
   1.131  lemma (in information_space) entropy_le_card:
   1.132 -  assumes "simple_distributed M X f"
   1.133 +  assumes X: "simple_distributed M X f"
   1.134    shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   1.135 -proof cases
   1.136 -  assume "X ` space M \<inter> {x. f x \<noteq> 0} = {}"
   1.137 -  then have "\<And>x. x\<in>X`space M \<Longrightarrow> f x = 0" by auto
   1.138 -  moreover
   1.139 -  have "0 < card (X`space M)"
   1.140 -    using `simple_distributed M X f` not_empty by (auto simp: card_gt_0_iff)
   1.141 -  then have "log b 1 \<le> log b (real (card (X`space M)))"
   1.142 -    using b_gt_1 by (intro log_le) auto
   1.143 -  ultimately show ?thesis using assms by (simp add: entropy_simple_distributed)
   1.144 -next
   1.145 -  assume False: "X ` space M \<inter> {x. f x \<noteq> 0} \<noteq> {}"
   1.146 -  have "card (X ` space M \<inter> {x. f x \<noteq> 0}) \<le> card (X ` space M)"
   1.147 -    (is "?A \<le> ?B") using assms not_empty
   1.148 -    by (auto intro!: card_mono simp: simple_function_def simple_distributed_def)
   1.149 -  note entropy_le_card_not_0[OF assms]
   1.150 -  also have "log b (real ?A) \<le> log b (real ?B)"
   1.151 -    using b_gt_1 False not_empty `?A \<le> ?B` assms
   1.152 -    by (auto intro!: log_le simp: card_gt_0_iff simp: simple_distributed_def)
   1.153 +proof -
   1.154 +  let ?X = "count_space (X`space M)"
   1.155 +  have "\<H>(X) \<le> log b (measure ?X (space ?X))"
   1.156 +    by (rule entropy_le_space[OF simple_distributed[OF X]])
   1.157 +       (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
   1.158 +  also have "measure ?X (space ?X) = card (X ` space M)"
   1.159 +    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
   1.160    finally show ?thesis .
   1.161  qed
   1.162  
   1.163 @@ -879,7 +767,18 @@
   1.164    "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   1.165      (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
   1.166  
   1.167 -lemma (in information_space) conditional_mutual_information_generic_eq:
   1.168 +lemma (in pair_sigma_finite) borel_measurable_positive_integral_fst:
   1.169 +  "(\<lambda>(x, y). f x y) \<in> borel_measurable (M1 \<Otimes>\<^isub>M M2) \<Longrightarrow> (\<lambda>x. \<integral>\<^isup>+ y. f x y \<partial>M2) \<in> borel_measurable M1"
   1.170 +  using positive_integral_fst_measurable(1)[of "\<lambda>(x, y). f x y"] by simp
   1.171 +
   1.172 +lemma (in pair_sigma_finite) borel_measurable_positive_integral_snd:
   1.173 +  assumes "(\<lambda>(x, y). f x y) \<in> borel_measurable (M2 \<Otimes>\<^isub>M M1)" shows "(\<lambda>x. \<integral>\<^isup>+ y. f x y \<partial>M1) \<in> borel_measurable M2"
   1.174 +proof -
   1.175 +  interpret Q: pair_sigma_finite M2 M1 by default
   1.176 +  from Q.borel_measurable_positive_integral_fst assms show ?thesis by simp
   1.177 +qed
   1.178 +
   1.179 +lemma (in information_space)
   1.180    assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
   1.181    assumes Px: "distributed M S X Px"
   1.182    assumes Pz: "distributed M P Z Pz"
   1.183 @@ -888,16 +787,19 @@
   1.184    assumes Pxyz: "distributed M (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
   1.185    assumes I1: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
   1.186    assumes I2: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
   1.187 -  shows "conditional_mutual_information b S T P X Y Z
   1.188 -    = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))"
   1.189 +  shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
   1.190 +    = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" (is "?eq")
   1.191 +    and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
   1.192  proof -
   1.193    interpret S: sigma_finite_measure S by fact
   1.194    interpret T: sigma_finite_measure T by fact
   1.195    interpret P: sigma_finite_measure P by fact
   1.196    interpret TP: pair_sigma_finite T P ..
   1.197    interpret SP: pair_sigma_finite S P ..
   1.198 +  interpret ST: pair_sigma_finite S T ..
   1.199    interpret SPT: pair_sigma_finite "S \<Otimes>\<^isub>M P" T ..
   1.200    interpret STP: pair_sigma_finite S "T \<Otimes>\<^isub>M P" ..
   1.201 +  interpret TPS: pair_sigma_finite "T \<Otimes>\<^isub>M P" S ..
   1.202    have TP: "sigma_finite_measure (T \<Otimes>\<^isub>M P)" ..
   1.203    have SP: "sigma_finite_measure (S \<Otimes>\<^isub>M P)" ..
   1.204    have YZ: "random_variable (T \<Otimes>\<^isub>M P) (\<lambda>x. (Y x, Z x))"
   1.205 @@ -933,27 +835,27 @@
   1.206    finally have mi_eq:
   1.207      "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" .
   1.208    
   1.209 -  have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
   1.210 +  have ae1: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
   1.211      by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
   1.212 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.213 +  moreover have ae2: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.214      by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) (auto intro: measurable_snd')
   1.215 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.216 +  moreover have ae3: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.217      by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto intro: measurable_Pair measurable_snd')
   1.218 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
   1.219 +  moreover have ae4: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
   1.220      by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto intro: measurable_Pair)
   1.221 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)"
   1.222 +  moreover have ae5: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)"
   1.223      using Px by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable)
   1.224 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)"
   1.225 +  moreover have ae6: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)"
   1.226      using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
   1.227 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))"
   1.228 +  moreover have ae7: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))"
   1.229      using Pz Pz[THEN distributed_real_measurable] by (auto intro!: measurable_snd'' TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
   1.230 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
   1.231 +  moreover have ae8: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
   1.232      using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
   1.233      using measurable_comp[OF measurable_Pair[OF measurable_fst measurable_comp[OF measurable_snd measurable_snd]] Pxz[THEN distributed_real_measurable], of T]
   1.234      using measurable_comp[OF measurable_snd measurable_Pair2[OF Pxz[THEN distributed_real_measurable]], of _ T]
   1.235      by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def)
   1.236    moreover note Pxyz[THEN distributed_real_AE]
   1.237 -  ultimately have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P.
   1.238 +  ultimately have ae: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P.
   1.239      Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
   1.240      Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
   1.241      Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
   1.242 @@ -968,13 +870,164 @@
   1.243          using b_gt_1 by (simp add: log_simps mult_pos_pos less_imp_le field_simps)
   1.244      qed simp
   1.245    qed
   1.246 -  with I1 I2 show ?thesis
   1.247 +  with I1 I2 show ?eq
   1.248      unfolding conditional_mutual_information_def
   1.249      apply (subst mi_eq)
   1.250      apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
   1.251      apply (subst integral_diff(2)[symmetric])
   1.252      apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
   1.253      done
   1.254 +
   1.255 +  let ?P = "density (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) Pxyz"
   1.256 +  interpret P: prob_space ?P
   1.257 +    unfolding distributed_distr_eq_density[OF Pxyz, symmetric]
   1.258 +    using distributed_measurable[OF Pxyz] by (rule prob_space_distr)
   1.259 +
   1.260 +  let ?Q = "density (T \<Otimes>\<^isub>M P) Pyz"
   1.261 +  interpret Q: prob_space ?Q
   1.262 +    unfolding distributed_distr_eq_density[OF Pyz, symmetric]
   1.263 +    using distributed_measurable[OF Pyz] by (rule prob_space_distr)
   1.264 +
   1.265 +  let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
   1.266 +
   1.267 +  from subdensity_real[of snd, OF _ Pyz Pz]
   1.268 +  have aeX1: "AE x in T \<Otimes>\<^isub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
   1.269 +  have aeX2: "AE x in T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd x)"
   1.270 +    using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
   1.271 +
   1.272 +  have aeX3: "AE y in T \<Otimes>\<^isub>M P. (\<integral>\<^isup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
   1.273 +    using Pz distributed_marginal_eq_joint[OF P S Px Pz Pxz]
   1.274 +    apply (intro TP.AE_pair_measure)
   1.275 +    apply (auto simp: comp_def measurable_split_conv
   1.276 +                intro!: measurable_snd'' borel_measurable_ereal_eq borel_measurable_ereal
   1.277 +                        SP.borel_measurable_positive_integral_snd measurable_compose[OF _ Pxz[THEN distributed_real_measurable]]
   1.278 +                        measurable_Pair
   1.279 +                dest: distributed_real_AE distributed_real_measurable)
   1.280 +    done
   1.281 +
   1.282 +  note M = borel_measurable_divide borel_measurable_diff borel_measurable_times borel_measurable_ereal
   1.283 +           measurable_compose[OF _ measurable_snd]
   1.284 +           measurable_Pair
   1.285 +           measurable_compose[OF _ Pxyz[THEN distributed_real_measurable]]
   1.286 +           measurable_compose[OF _ Pxz[THEN distributed_real_measurable]]
   1.287 +           measurable_compose[OF _ Pyz[THEN distributed_real_measurable]]
   1.288 +           measurable_compose[OF _ Pz[THEN distributed_real_measurable]]
   1.289 +           measurable_compose[OF _ Px[THEN distributed_real_measurable]]
   1.290 +           STP.borel_measurable_positive_integral_snd
   1.291 +  have "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^isup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))"
   1.292 +    apply (subst positive_integral_density)
   1.293 +    apply (rule distributed_borel_measurable[OF Pxyz])
   1.294 +    apply (rule distributed_AE[OF Pxyz])
   1.295 +    apply (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M) []
   1.296 +    apply (rule positive_integral_mono_AE)
   1.297 +    using ae5 ae6 ae7 ae8
   1.298 +    apply eventually_elim
   1.299 +    apply (auto intro!: divide_nonneg_nonneg mult_nonneg_nonneg)
   1.300 +    done
   1.301 +  also have "\<dots> = (\<integral>\<^isup>+(y, z). \<integral>\<^isup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^isub>M P)"
   1.302 +    by (subst STP.positive_integral_snd_measurable[symmetric])
   1.303 +       (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M)
   1.304 +  also have "\<dots> = (\<integral>\<^isup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^isub>M P)"
   1.305 +    apply (rule positive_integral_cong_AE)
   1.306 +    using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
   1.307 +    apply eventually_elim
   1.308 +  proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
   1.309 +    fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
   1.310 +      "(\<integral>\<^isup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)" 
   1.311 +    then show "(\<integral>\<^isup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
   1.312 +      apply (subst positive_integral_multc)
   1.313 +      apply (auto intro!: borel_measurable_ereal divide_nonneg_nonneg
   1.314 +                          measurable_compose[OF _ Pxz[THEN distributed_real_measurable]] measurable_Pair
   1.315 +                  split: prod.split)
   1.316 +      done
   1.317 +  qed
   1.318 +  also have "\<dots> = 1"
   1.319 +    using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
   1.320 +    by (subst positive_integral_density[symmetric]) (auto intro!: M)
   1.321 +  finally have le1: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<le> 1" .
   1.322 +  also have "\<dots> < \<infinity>" by simp
   1.323 +  finally have fin: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
   1.324 +
   1.325 +  have pos: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<noteq> 0"
   1.326 +    apply (subst positive_integral_density)
   1.327 +    apply (rule distributed_borel_measurable[OF Pxyz])
   1.328 +    apply (rule distributed_AE[OF Pxyz])
   1.329 +    apply (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M) []
   1.330 +    apply (simp add: split_beta')
   1.331 +  proof
   1.332 +    let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
   1.333 +    assume "(\<integral>\<^isup>+ x. ?g x \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P)) = 0"
   1.334 +    then have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. ?g x \<le> 0"
   1.335 +      by (intro positive_integral_0_iff_AE[THEN iffD1]) (auto intro!: M borel_measurable_ereal measurable_If)
   1.336 +    then have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxyz x = 0"
   1.337 +      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
   1.338 +      by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
   1.339 +    then have "(\<integral>\<^isup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) = 0"
   1.340 +      by (subst positive_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
   1.341 +    with P.emeasure_space_1 show False
   1.342 +      by (subst (asm) emeasure_density) (auto intro!: M cong: positive_integral_cong)
   1.343 +  qed
   1.344 +
   1.345 +  have neg: "(\<integral>\<^isup>+ x. - ?f x \<partial>?P) = 0"
   1.346 +    apply (rule positive_integral_0_iff_AE[THEN iffD2])
   1.347 +    apply (auto intro!: M simp: split_beta') []
   1.348 +    apply (subst AE_density)
   1.349 +    apply (auto intro!: M simp: split_beta') []
   1.350 +    using ae5 ae6 ae7 ae8
   1.351 +    apply eventually_elim
   1.352 +    apply (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg)
   1.353 +    done
   1.354 +
   1.355 +  have I3: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
   1.356 +    apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integral_diff(1)[OF I1 I2]])
   1.357 +    using ae
   1.358 +    apply (auto intro!: M simp: split_beta')
   1.359 +    done
   1.360 +
   1.361 +  have "- log b 1 \<le> - log b (integral\<^isup>L ?P ?f)"
   1.362 +  proof (intro le_imp_neg_le log_le[OF b_gt_1])
   1.363 +    show "0 < integral\<^isup>L ?P ?f"
   1.364 +      using neg pos fin positive_integral_positive[of ?P ?f]
   1.365 +      by (cases "(\<integral>\<^isup>+ x. ?f x \<partial>?P)") (auto simp add: lebesgue_integral_def less_le split_beta')
   1.366 +    show "integral\<^isup>L ?P ?f \<le> 1"
   1.367 +      using neg le1 fin positive_integral_positive[of ?P ?f]
   1.368 +      by (cases "(\<integral>\<^isup>+ x. ?f x \<partial>?P)") (auto simp add: lebesgue_integral_def split_beta' one_ereal_def)
   1.369 +  qed
   1.370 +  also have "- log b (integral\<^isup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
   1.371 +  proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
   1.372 +    show "AE x in ?P. ?f x \<in> {0<..}"
   1.373 +      unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
   1.374 +      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
   1.375 +      by eventually_elim (auto simp: divide_pos_pos mult_pos_pos)
   1.376 +    show "integrable ?P ?f"
   1.377 +      unfolding integrable_def 
   1.378 +      using fin neg by (auto intro!: M simp: split_beta')
   1.379 +    show "integrable ?P (\<lambda>x. - log b (?f x))"
   1.380 +      apply (subst integral_density)
   1.381 +      apply (auto intro!: M) []
   1.382 +      apply (auto intro!: M distributed_real_AE[OF Pxyz]) []
   1.383 +      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
   1.384 +      apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
   1.385 +      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
   1.386 +      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
   1.387 +      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
   1.388 +      apply eventually_elim
   1.389 +      apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
   1.390 +      done
   1.391 +  qed (auto simp: b_gt_1 minus_log_convex)
   1.392 +  also have "\<dots> = conditional_mutual_information b S T P X Y Z"
   1.393 +    unfolding `?eq`
   1.394 +    apply (subst integral_density)
   1.395 +    apply (auto intro!: M) []
   1.396 +    apply (auto intro!: M distributed_real_AE[OF Pxyz]) []
   1.397 +    apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
   1.398 +    apply (intro integral_cong_AE)
   1.399 +    using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
   1.400 +    apply eventually_elim
   1.401 +    apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
   1.402 +    done
   1.403 +  finally show ?nonneg
   1.404 +    by simp
   1.405  qed
   1.406  
   1.407  lemma (in information_space) conditional_mutual_information_eq:
   1.408 @@ -1031,86 +1084,21 @@
   1.409    assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
   1.410    shows "0 \<le> \<I>(X ; Y | Z)"
   1.411  proof -
   1.412 -  def Pz \<equiv> "\<lambda>x. if x \<in> Z`space M then measure M (Z -` {x} \<inter> space M) else 0"
   1.413 -  def Pxz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x))`space M then measure M ((\<lambda>x. (X x, Z x)) -` {x} \<inter> space M) else 0"
   1.414 -  def Pyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x))`space M then measure M ((\<lambda>x. (Y x, Z x)) -` {x} \<inter> space M) else 0"
   1.415 -  def Pxyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x))`space M then measure M ((\<lambda>x. (X x, Y x, Z x)) -` {x} \<inter> space M) else 0"
   1.416 -  let ?M = "X`space M \<times> Y`space M \<times> Z`space M"
   1.417 -
   1.418 -  note XZ = simple_function_Pair[OF X Z]
   1.419 -  note YZ = simple_function_Pair[OF Y Z]
   1.420 -  note XYZ = simple_function_Pair[OF X simple_function_Pair[OF Y Z]]
   1.421 -  have Pz: "simple_distributed M Z Pz"
   1.422 -    using Z by (rule simple_distributedI) (auto simp: Pz_def)
   1.423 -  have Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
   1.424 -    using XZ by (rule simple_distributedI) (auto simp: Pxz_def)
   1.425 -  have Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
   1.426 -    using YZ by (rule simple_distributedI) (auto simp: Pyz_def)
   1.427 -  have Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
   1.428 -    using XYZ by (rule simple_distributedI) (auto simp: Pxyz_def)
   1.429 -
   1.430 -  { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>x\<in>X ` space M. Pxz (x, z)) = Pz z"
   1.431 -      using distributed_marginal_eq_joint_simple[OF X Pz Pxz z]
   1.432 -      by (auto intro!: setsum_cong simp: Pxz_def) }
   1.433 -  note marginal1 = this
   1.434 -
   1.435 -  { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>y\<in>Y ` space M. Pyz (y, z)) = Pz z"
   1.436 -      using distributed_marginal_eq_joint_simple[OF Y Pz Pyz z]
   1.437 -      by (auto intro!: setsum_cong simp: Pyz_def) }
   1.438 -  note marginal2 = this
   1.439 -
   1.440 -  have "- \<I>(X ; Y | Z) = - (\<Sum>(x, y, z) \<in> ?M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
   1.441 -    unfolding conditional_mutual_information_eq[OF Pz Pyz Pxz Pxyz]
   1.442 -    using X Y Z by (auto intro!: setsum_mono_zero_left simp: Pxyz_def simple_functionD)
   1.443 -  also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z))"
   1.444 -    unfolding split_beta'
   1.445 -  proof (rule log_setsum_divide)
   1.446 -    show "?M \<noteq> {}" using not_empty by simp
   1.447 -    show "1 < b" using b_gt_1 .
   1.448 -
   1.449 -    show "finite ?M" using X Y Z by (auto simp: simple_functionD)
   1.450 -
   1.451 -    then show "(\<Sum>x\<in>?M. Pxyz (fst x, fst (snd x), snd (snd x))) = 1"
   1.452 -      apply (subst Pxyz[THEN simple_distributed_setsum_space, symmetric])
   1.453 -      apply simp
   1.454 -      apply (intro setsum_mono_zero_right)
   1.455 -      apply (auto simp: Pxyz_def)
   1.456 -      done
   1.457 -    let ?N = "(\<lambda>x. (X x, Y x, Z x)) ` space M"
   1.458 -    fix x assume x: "x \<in> ?M"
   1.459 -    let ?Q = "Pxyz (fst x, fst (snd x), snd (snd x))"
   1.460 -    let ?P = "Pxz (fst x, snd (snd x)) * (Pyz (fst (snd x), snd (snd x)) / Pz (snd (snd x)))"
   1.461 -    from x show "0 \<le> ?Q" "0 \<le> ?P"
   1.462 -      using Pxyz[THEN simple_distributed, THEN distributed_real_AE]
   1.463 -      using Pxz[THEN simple_distributed, THEN distributed_real_AE]
   1.464 -      using Pyz[THEN simple_distributed, THEN distributed_real_AE]
   1.465 -      using Pz[THEN simple_distributed, THEN distributed_real_AE]
   1.466 -      by (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg simp: AE_count_space Pxyz_def Pxz_def Pyz_def Pz_def)
   1.467 -    moreover assume "0 < ?Q"
   1.468 -    moreover have "AE x in count_space ?N. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.469 -      by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz[THEN simple_distributed] Pz[THEN simple_distributed]]) (auto intro: measurable_snd')
   1.470 -    then have "\<And>x. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.471 -      by (auto simp: Pz_def Pxyz_def AE_count_space)
   1.472 -    moreover have "AE x in count_space ?N. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.473 -      by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz[THEN simple_distributed] Pxz[THEN simple_distributed]]) (auto intro: measurable_Pair measurable_snd')
   1.474 -    then have "\<And>x. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
   1.475 -      by (auto simp: Pz_def Pxyz_def AE_count_space)
   1.476 -    moreover have "AE x in count_space ?N. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
   1.477 -      by (intro subdensity_real[of snd, OF _ Pxyz[THEN simple_distributed] Pyz[THEN simple_distributed]]) (auto intro: measurable_Pair)
   1.478 -    then have "\<And>x. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
   1.479 -      by (auto simp: Pz_def Pxyz_def AE_count_space)
   1.480 -    ultimately show "0 < ?P" using x by (auto intro!: divide_pos_pos mult_pos_pos simp: less_le)
   1.481 -  qed
   1.482 -  also have "(\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z)) = (\<Sum>z\<in>Z`space M. Pz z)"
   1.483 -    apply (simp add: setsum_cartesian_product')
   1.484 -    apply (subst setsum_commute)
   1.485 -    apply (subst (2) setsum_commute)
   1.486 -    apply (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] marginal1 marginal2
   1.487 -          intro!: setsum_cong)
   1.488 -    done
   1.489 -  also have "log b (\<Sum>z\<in>Z`space M. Pz z) = 0"
   1.490 -    using Pz[THEN simple_distributed_setsum_space] by simp
   1.491 -  finally show ?thesis by simp
   1.492 +  have [simp]: "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) \<Otimes>\<^isub>M count_space (Z ` space M) =
   1.493 +      count_space (X`space M \<times> Y`space M \<times> Z`space M)"
   1.494 +    by (simp add: pair_measure_count_space X Y Z simple_functionD)
   1.495 +  note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
   1.496 +  note sd = simple_distributedI[OF _ refl]
   1.497 +  note sp = simple_function_Pair
   1.498 +  show ?thesis
   1.499 +   apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
   1.500 +   apply (rule simple_distributed[OF sd[OF X]])
   1.501 +   apply (rule simple_distributed[OF sd[OF Z]])
   1.502 +   apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
   1.503 +   apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
   1.504 +   apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
   1.505 +   apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
   1.506 +   done
   1.507  qed
   1.508  
   1.509  subsection {* Conditional Entropy *}