author hoelzl Wed Oct 10 12:12:25 2012 +0200 (2012-10-10) changeset 49787 d8de705b48d4 parent 49786 f33d5f009627 child 49788 3c10763f5cb4
rule to show that conditional mutual information is non-negative in the continuous case
```     1.1 --- a/src/HOL/Probability/Information.thy	Wed Oct 10 12:12:25 2012 +0200
1.2 +++ b/src/HOL/Probability/Information.thy	Wed Oct 10 12:12:25 2012 +0200
1.3 @@ -22,104 +22,6 @@
1.4    "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
1.5    unfolding setsum_cartesian_product by simp
1.6
1.7 -section "Convex theory"
1.8 -
1.9 -lemma log_setsum:
1.10 -  assumes "finite s" "s \<noteq> {}"
1.11 -  assumes "b > 1"
1.12 -  assumes "(\<Sum> i \<in> s. a i) = 1"
1.13 -  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
1.14 -  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
1.15 -  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
1.16 -proof -
1.17 -  have "convex_on {0 <..} (\<lambda> x. - log b x)"
1.18 -    by (rule minus_log_convex[OF `b > 1`])
1.19 -  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
1.20 -    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastforce
1.21 -  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
1.22 -qed
1.23 -
1.24 -lemma log_setsum':
1.25 -  assumes "finite s" "s \<noteq> {}"
1.26 -  assumes "b > 1"
1.27 -  assumes "(\<Sum> i \<in> s. a i) = 1"
1.28 -  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
1.29 -          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
1.30 -  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
1.31 -proof -
1.32 -  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
1.33 -    using assms by (auto intro!: setsum_mono_zero_cong_left)
1.34 -  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
1.35 -  proof (rule log_setsum)
1.36 -    have "setsum a (s - {i. a i = 0}) = setsum a s"
1.37 -      using assms(1) by (rule setsum_mono_zero_cong_left) auto
1.38 -    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
1.39 -      "finite (s - {i. a i = 0})" using assms by simp_all
1.40 -
1.41 -    show "s - {i. a i = 0} \<noteq> {}"
1.42 -    proof
1.43 -      assume *: "s - {i. a i = 0} = {}"
1.44 -      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
1.45 -      with sum_1 show False by simp
1.46 -    qed
1.47 -
1.48 -    fix i assume "i \<in> s - {i. a i = 0}"
1.49 -    hence "i \<in> s" "a i \<noteq> 0" by simp_all
1.50 -    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
1.51 -  qed fact+
1.52 -  ultimately show ?thesis by simp
1.53 -qed
1.54 -
1.55 -lemma log_setsum_divide:
1.56 -  assumes "finite S" and "S \<noteq> {}" and "1 < b"
1.57 -  assumes "(\<Sum>x\<in>S. g x) = 1"
1.58 -  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
1.59 -  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
1.60 -  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
1.61 -proof -
1.62 -  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
1.63 -    using `1 < b` by (subst log_le_cancel_iff) auto
1.64 -
1.65 -  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
1.66 -  proof (unfold setsum_negf[symmetric], rule setsum_cong)
1.67 -    fix x assume x: "x \<in> S"
1.68 -    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
1.69 -    proof (cases "g x = 0")
1.70 -      case False
1.71 -      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
1.72 -      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
1.73 -    qed simp
1.74 -  qed rule
1.75 -  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
1.76 -  proof (rule log_setsum')
1.77 -    fix x assume x: "x \<in> S" "0 < g x"
1.78 -    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
1.79 -  qed fact+
1.80 -  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
1.81 -    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
1.82 -        split: split_if_asm)
1.83 -  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
1.84 -  proof (rule log_mono)
1.85 -    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
1.86 -    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
1.87 -    proof (rule setsum_strict_mono)
1.88 -      show "finite (S - {x. g x = 0})" using `finite S` by simp
1.89 -      show "S - {x. g x = 0} \<noteq> {}"
1.90 -      proof
1.91 -        assume "S - {x. g x = 0} = {}"
1.92 -        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
1.93 -        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
1.94 -      qed
1.95 -      fix x assume "x \<in> S - {x. g x = 0}"
1.96 -      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
1.97 -    qed
1.98 -    finally show "0 < ?sum" .
1.99 -    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
1.100 -      using `finite S` pos by (auto intro!: setsum_mono2)
1.101 -  qed
1.102 -  finally show ?thesis .
1.103 -qed
1.104 -
1.105  lemma split_pairs:
1.106    "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
1.107    "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
1.108 @@ -831,39 +733,25 @@
1.109    assumes X: "simple_distributed M X f"
1.110    shows "\<H>(X) \<le> log b (card (X ` space M \<inter> {x. f x \<noteq> 0}))"
1.111  proof -
1.112 -  have "\<H>(X) = (\<Sum>x\<in>X`space M. f x * log b (1 / f x))"
1.113 -    unfolding entropy_simple_distributed[OF X] setsum_negf[symmetric]
1.114 -    using X by (auto dest: simple_distributed_nonneg intro!: setsum_cong simp: log_simps less_le)
1.115 -  also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. f x * (1 / f x))"
1.116 -    using not_empty b_gt_1 `simple_distributed M X f`
1.117 -    by (intro log_setsum') (auto simp: simple_distributed_nonneg simple_distributed_setsum_space)
1.118 -  also have "\<dots> = log b (\<Sum>x\<in>X`space M. if f x \<noteq> 0 then 1 else 0)"
1.119 -    by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) auto
1.120 -  finally show ?thesis
1.121 -    using `simple_distributed M X f` by (auto simp: setsum_cases real_eq_of_nat)
1.122 +  let ?X = "count_space (X`space M)"
1.123 +  have "\<H>(X) \<le> log b (measure ?X {x \<in> space ?X. f x \<noteq> 0})"
1.124 +    by (rule entropy_le[OF simple_distributed[OF X]])
1.125 +       (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space)
1.126 +  also have "measure ?X {x \<in> space ?X. f x \<noteq> 0} = card (X ` space M \<inter> {x. f x \<noteq> 0})"
1.127 +    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def Int_def)
1.128 +  finally show ?thesis .
1.129  qed
1.130
1.131  lemma (in information_space) entropy_le_card:
1.132 -  assumes "simple_distributed M X f"
1.133 +  assumes X: "simple_distributed M X f"
1.134    shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
1.135 -proof cases
1.136 -  assume "X ` space M \<inter> {x. f x \<noteq> 0} = {}"
1.137 -  then have "\<And>x. x\<in>X`space M \<Longrightarrow> f x = 0" by auto
1.138 -  moreover
1.139 -  have "0 < card (X`space M)"
1.140 -    using `simple_distributed M X f` not_empty by (auto simp: card_gt_0_iff)
1.141 -  then have "log b 1 \<le> log b (real (card (X`space M)))"
1.142 -    using b_gt_1 by (intro log_le) auto
1.143 -  ultimately show ?thesis using assms by (simp add: entropy_simple_distributed)
1.144 -next
1.145 -  assume False: "X ` space M \<inter> {x. f x \<noteq> 0} \<noteq> {}"
1.146 -  have "card (X ` space M \<inter> {x. f x \<noteq> 0}) \<le> card (X ` space M)"
1.147 -    (is "?A \<le> ?B") using assms not_empty
1.148 -    by (auto intro!: card_mono simp: simple_function_def simple_distributed_def)
1.149 -  note entropy_le_card_not_0[OF assms]
1.150 -  also have "log b (real ?A) \<le> log b (real ?B)"
1.151 -    using b_gt_1 False not_empty `?A \<le> ?B` assms
1.152 -    by (auto intro!: log_le simp: card_gt_0_iff simp: simple_distributed_def)
1.153 +proof -
1.154 +  let ?X = "count_space (X`space M)"
1.155 +  have "\<H>(X) \<le> log b (measure ?X (space ?X))"
1.156 +    by (rule entropy_le_space[OF simple_distributed[OF X]])
1.157 +       (simp_all add: simple_distributed_finite[OF X] subset_eq integrable_count_space emeasure_count_space finite_measure_count_space)
1.158 +  also have "measure ?X (space ?X) = card (X ` space M)"
1.159 +    by (simp_all add: simple_distributed_finite[OF X] subset_eq emeasure_count_space measure_def)
1.160    finally show ?thesis .
1.161  qed
1.162
1.163 @@ -879,7 +767,18 @@
1.164    "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
1.165      (count_space (X ` space M)) (count_space (Y ` space M)) (count_space (Z ` space M)) X Y Z"
1.166
1.167 -lemma (in information_space) conditional_mutual_information_generic_eq:
1.168 +lemma (in pair_sigma_finite) borel_measurable_positive_integral_fst:
1.169 +  "(\<lambda>(x, y). f x y) \<in> borel_measurable (M1 \<Otimes>\<^isub>M M2) \<Longrightarrow> (\<lambda>x. \<integral>\<^isup>+ y. f x y \<partial>M2) \<in> borel_measurable M1"
1.170 +  using positive_integral_fst_measurable(1)[of "\<lambda>(x, y). f x y"] by simp
1.171 +
1.172 +lemma (in pair_sigma_finite) borel_measurable_positive_integral_snd:
1.173 +  assumes "(\<lambda>(x, y). f x y) \<in> borel_measurable (M2 \<Otimes>\<^isub>M M1)" shows "(\<lambda>x. \<integral>\<^isup>+ y. f x y \<partial>M1) \<in> borel_measurable M2"
1.174 +proof -
1.175 +  interpret Q: pair_sigma_finite M2 M1 by default
1.176 +  from Q.borel_measurable_positive_integral_fst assms show ?thesis by simp
1.177 +qed
1.178 +
1.179 +lemma (in information_space)
1.180    assumes S: "sigma_finite_measure S" and T: "sigma_finite_measure T" and P: "sigma_finite_measure P"
1.181    assumes Px: "distributed M S X Px"
1.182    assumes Pz: "distributed M P Z Pz"
1.183 @@ -888,16 +787,19 @@
1.184    assumes Pxyz: "distributed M (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>x. (X x, Y x, Z x)) Pxyz"
1.185    assumes I1: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Px x * Pyz (y, z))))"
1.186    assumes I2: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxz (x, z) / (Px x * Pz z)))"
1.187 -  shows "conditional_mutual_information b S T P X Y Z
1.188 -    = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))"
1.189 +  shows conditional_mutual_information_generic_eq: "conditional_mutual_information b S T P X Y Z
1.190 +    = (\<integral>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" (is "?eq")
1.191 +    and conditional_mutual_information_generic_nonneg: "0 \<le> conditional_mutual_information b S T P X Y Z" (is "?nonneg")
1.192  proof -
1.193    interpret S: sigma_finite_measure S by fact
1.194    interpret T: sigma_finite_measure T by fact
1.195    interpret P: sigma_finite_measure P by fact
1.196    interpret TP: pair_sigma_finite T P ..
1.197    interpret SP: pair_sigma_finite S P ..
1.198 +  interpret ST: pair_sigma_finite S T ..
1.199    interpret SPT: pair_sigma_finite "S \<Otimes>\<^isub>M P" T ..
1.200    interpret STP: pair_sigma_finite S "T \<Otimes>\<^isub>M P" ..
1.201 +  interpret TPS: pair_sigma_finite "T \<Otimes>\<^isub>M P" S ..
1.202    have TP: "sigma_finite_measure (T \<Otimes>\<^isub>M P)" ..
1.203    have SP: "sigma_finite_measure (S \<Otimes>\<^isub>M P)" ..
1.204    have YZ: "random_variable (T \<Otimes>\<^isub>M P) (\<lambda>x. (Y x, Z x))"
1.205 @@ -933,27 +835,27 @@
1.206    finally have mi_eq:
1.207      "mutual_information b S P X Z = (\<integral>(x,y,z). Pxyz (x,y,z) * log b (Pxz (x,z) / (Px x * Pz z)) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))" .
1.208
1.209 -  have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
1.210 +  have ae1: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Px (fst x) = 0 \<longrightarrow> Pxyz x = 0"
1.211      by (intro subdensity_real[of fst, OF _ Pxyz Px]) auto
1.212 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.213 +  moreover have ae2: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.214      by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz Pz]) (auto intro: measurable_snd')
1.215 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.216 +  moreover have ae3: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.217      by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz Pxz]) (auto intro: measurable_Pair measurable_snd')
1.218 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
1.219 +  moreover have ae4: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
1.220      by (intro subdensity_real[of snd, OF _ Pxyz Pyz]) (auto intro: measurable_Pair)
1.221 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)"
1.222 +  moreover have ae5: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Px (fst x)"
1.223      using Px by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_fst'' dest: distributed_real_AE distributed_real_measurable)
1.224 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)"
1.225 +  moreover have ae6: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pyz (snd x)"
1.226      using Pyz by (intro STP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
1.227 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))"
1.228 +  moreover have ae7: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd (snd x))"
1.229      using Pz Pz[THEN distributed_real_measurable] by (auto intro!: measurable_snd'' TP.AE_pair_measure STP.AE_pair_measure AE_I2[of S] dest: distributed_real_AE)
1.230 -  moreover have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
1.231 +  moreover have ae8: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. 0 \<le> Pxz (fst x, snd (snd x))"
1.232      using Pxz[THEN distributed_real_AE, THEN SP.AE_pair]
1.233      using measurable_comp[OF measurable_Pair[OF measurable_fst measurable_comp[OF measurable_snd measurable_snd]] Pxz[THEN distributed_real_measurable], of T]
1.234      using measurable_comp[OF measurable_snd measurable_Pair2[OF Pxz[THEN distributed_real_measurable]], of _ T]
1.235      by (auto intro!: TP.AE_pair_measure STP.AE_pair_measure simp: comp_def)
1.236    moreover note Pxyz[THEN distributed_real_AE]
1.237 -  ultimately have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P.
1.238 +  ultimately have ae: "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P.
1.239      Pxyz x * log b (Pxyz x / (Px (fst x) * Pyz (snd x))) -
1.240      Pxyz x * log b (Pxz (fst x, snd (snd x)) / (Px (fst x) * Pz (snd (snd x)))) =
1.241      Pxyz x * log b (Pxyz x * Pz (snd (snd x)) / (Pxz (fst x, snd (snd x)) * Pyz (snd x))) "
1.242 @@ -968,13 +870,164 @@
1.243          using b_gt_1 by (simp add: log_simps mult_pos_pos less_imp_le field_simps)
1.244      qed simp
1.245    qed
1.246 -  with I1 I2 show ?thesis
1.247 +  with I1 I2 show ?eq
1.248      unfolding conditional_mutual_information_def
1.249      apply (subst mi_eq)
1.250      apply (subst mutual_information_distr[OF S TP Px Pyz Pxyz])
1.251      apply (subst integral_diff(2)[symmetric])
1.252      apply (auto intro!: integral_cong_AE simp: split_beta' simp del: integral_diff)
1.253      done
1.254 +
1.255 +  let ?P = "density (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) Pxyz"
1.256 +  interpret P: prob_space ?P
1.257 +    unfolding distributed_distr_eq_density[OF Pxyz, symmetric]
1.258 +    using distributed_measurable[OF Pxyz] by (rule prob_space_distr)
1.259 +
1.260 +  let ?Q = "density (T \<Otimes>\<^isub>M P) Pyz"
1.261 +  interpret Q: prob_space ?Q
1.262 +    unfolding distributed_distr_eq_density[OF Pyz, symmetric]
1.263 +    using distributed_measurable[OF Pyz] by (rule prob_space_distr)
1.264 +
1.265 +  let ?f = "\<lambda>(x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) / Pxyz (x, y, z)"
1.266 +
1.267 +  from subdensity_real[of snd, OF _ Pyz Pz]
1.268 +  have aeX1: "AE x in T \<Otimes>\<^isub>M P. Pz (snd x) = 0 \<longrightarrow> Pyz x = 0" by (auto simp: comp_def)
1.269 +  have aeX2: "AE x in T \<Otimes>\<^isub>M P. 0 \<le> Pz (snd x)"
1.270 +    using Pz by (intro TP.AE_pair_measure) (auto simp: comp_def intro!: measurable_snd'' dest: distributed_real_AE distributed_real_measurable)
1.271 +
1.272 +  have aeX3: "AE y in T \<Otimes>\<^isub>M P. (\<integral>\<^isup>+ x. ereal (Pxz (x, snd y)) \<partial>S) = ereal (Pz (snd y))"
1.273 +    using Pz distributed_marginal_eq_joint[OF P S Px Pz Pxz]
1.274 +    apply (intro TP.AE_pair_measure)
1.275 +    apply (auto simp: comp_def measurable_split_conv
1.276 +                intro!: measurable_snd'' borel_measurable_ereal_eq borel_measurable_ereal
1.277 +                        SP.borel_measurable_positive_integral_snd measurable_compose[OF _ Pxz[THEN distributed_real_measurable]]
1.278 +                        measurable_Pair
1.279 +                dest: distributed_real_AE distributed_real_measurable)
1.280 +    done
1.281 +
1.282 +  note M = borel_measurable_divide borel_measurable_diff borel_measurable_times borel_measurable_ereal
1.283 +           measurable_compose[OF _ measurable_snd]
1.284 +           measurable_Pair
1.285 +           measurable_compose[OF _ Pxyz[THEN distributed_real_measurable]]
1.286 +           measurable_compose[OF _ Pxz[THEN distributed_real_measurable]]
1.287 +           measurable_compose[OF _ Pyz[THEN distributed_real_measurable]]
1.288 +           measurable_compose[OF _ Pz[THEN distributed_real_measurable]]
1.289 +           measurable_compose[OF _ Px[THEN distributed_real_measurable]]
1.290 +           STP.borel_measurable_positive_integral_snd
1.291 +  have "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<le> (\<integral>\<^isup>+ (x, y, z). Pxz (x, z) * (Pyz (y, z) / Pz z) \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P))"
1.292 +    apply (subst positive_integral_density)
1.293 +    apply (rule distributed_borel_measurable[OF Pxyz])
1.294 +    apply (rule distributed_AE[OF Pxyz])
1.295 +    apply (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M) []
1.296 +    apply (rule positive_integral_mono_AE)
1.297 +    using ae5 ae6 ae7 ae8
1.298 +    apply eventually_elim
1.299 +    apply (auto intro!: divide_nonneg_nonneg mult_nonneg_nonneg)
1.300 +    done
1.301 +  also have "\<dots> = (\<integral>\<^isup>+(y, z). \<integral>\<^isup>+ x. ereal (Pxz (x, z)) * ereal (Pyz (y, z) / Pz z) \<partial>S \<partial>T \<Otimes>\<^isub>M P)"
1.302 +    by (subst STP.positive_integral_snd_measurable[symmetric])
1.303 +       (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M)
1.304 +  also have "\<dots> = (\<integral>\<^isup>+x. ereal (Pyz x) * 1 \<partial>T \<Otimes>\<^isub>M P)"
1.305 +    apply (rule positive_integral_cong_AE)
1.306 +    using aeX1 aeX2 aeX3 distributed_AE[OF Pyz] AE_space
1.307 +    apply eventually_elim
1.308 +  proof (case_tac x, simp del: times_ereal.simps add: space_pair_measure)
1.309 +    fix a b assume "Pz b = 0 \<longrightarrow> Pyz (a, b) = 0" "0 \<le> Pz b" "a \<in> space T \<and> b \<in> space P"
1.310 +      "(\<integral>\<^isup>+ x. ereal (Pxz (x, b)) \<partial>S) = ereal (Pz b)" "0 \<le> Pyz (a, b)"
1.311 +    then show "(\<integral>\<^isup>+ x. ereal (Pxz (x, b)) * ereal (Pyz (a, b) / Pz b) \<partial>S) = ereal (Pyz (a, b))"
1.312 +      apply (subst positive_integral_multc)
1.313 +      apply (auto intro!: borel_measurable_ereal divide_nonneg_nonneg
1.314 +                          measurable_compose[OF _ Pxz[THEN distributed_real_measurable]] measurable_Pair
1.315 +                  split: prod.split)
1.316 +      done
1.317 +  qed
1.318 +  also have "\<dots> = 1"
1.319 +    using Q.emeasure_space_1 distributed_AE[OF Pyz] distributed_distr_eq_density[OF Pyz]
1.320 +    by (subst positive_integral_density[symmetric]) (auto intro!: M)
1.321 +  finally have le1: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<le> 1" .
1.322 +  also have "\<dots> < \<infinity>" by simp
1.323 +  finally have fin: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<noteq> \<infinity>" by simp
1.324 +
1.325 +  have pos: "(\<integral>\<^isup>+ x. ?f x \<partial>?P) \<noteq> 0"
1.326 +    apply (subst positive_integral_density)
1.327 +    apply (rule distributed_borel_measurable[OF Pxyz])
1.328 +    apply (rule distributed_AE[OF Pxyz])
1.329 +    apply (auto simp add: borel_measurable_ereal_iff split_beta' intro!: M) []
1.330 +    apply (simp add: split_beta')
1.331 +  proof
1.332 +    let ?g = "\<lambda>x. ereal (if Pxyz x = 0 then 0 else Pxz (fst x, snd (snd x)) * Pyz (snd x) / Pz (snd (snd x)))"
1.333 +    assume "(\<integral>\<^isup>+ x. ?g x \<partial>(S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P)) = 0"
1.334 +    then have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. ?g x \<le> 0"
1.335 +      by (intro positive_integral_0_iff_AE[THEN iffD1]) (auto intro!: M borel_measurable_ereal measurable_If)
1.336 +    then have "AE x in S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P. Pxyz x = 0"
1.337 +      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
1.338 +      by eventually_elim (auto split: split_if_asm simp: mult_le_0_iff divide_le_0_iff)
1.339 +    then have "(\<integral>\<^isup>+ x. ereal (Pxyz x) \<partial>S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) = 0"
1.340 +      by (subst positive_integral_cong_AE[of _ "\<lambda>x. 0"]) auto
1.341 +    with P.emeasure_space_1 show False
1.342 +      by (subst (asm) emeasure_density) (auto intro!: M cong: positive_integral_cong)
1.343 +  qed
1.344 +
1.345 +  have neg: "(\<integral>\<^isup>+ x. - ?f x \<partial>?P) = 0"
1.346 +    apply (rule positive_integral_0_iff_AE[THEN iffD2])
1.347 +    apply (auto intro!: M simp: split_beta') []
1.348 +    apply (subst AE_density)
1.349 +    apply (auto intro!: M simp: split_beta') []
1.350 +    using ae5 ae6 ae7 ae8
1.351 +    apply eventually_elim
1.352 +    apply (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg)
1.353 +    done
1.354 +
1.355 +  have I3: "integrable (S \<Otimes>\<^isub>M T \<Otimes>\<^isub>M P) (\<lambda>(x, y, z). Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
1.356 +    apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ integral_diff(1)[OF I1 I2]])
1.357 +    using ae
1.358 +    apply (auto intro!: M simp: split_beta')
1.359 +    done
1.360 +
1.361 +  have "- log b 1 \<le> - log b (integral\<^isup>L ?P ?f)"
1.362 +  proof (intro le_imp_neg_le log_le[OF b_gt_1])
1.363 +    show "0 < integral\<^isup>L ?P ?f"
1.364 +      using neg pos fin positive_integral_positive[of ?P ?f]
1.365 +      by (cases "(\<integral>\<^isup>+ x. ?f x \<partial>?P)") (auto simp add: lebesgue_integral_def less_le split_beta')
1.366 +    show "integral\<^isup>L ?P ?f \<le> 1"
1.367 +      using neg le1 fin positive_integral_positive[of ?P ?f]
1.368 +      by (cases "(\<integral>\<^isup>+ x. ?f x \<partial>?P)") (auto simp add: lebesgue_integral_def split_beta' one_ereal_def)
1.369 +  qed
1.370 +  also have "- log b (integral\<^isup>L ?P ?f) \<le> (\<integral> x. - log b (?f x) \<partial>?P)"
1.371 +  proof (rule P.jensens_inequality[where a=0 and b=1 and I="{0<..}"])
1.372 +    show "AE x in ?P. ?f x \<in> {0<..}"
1.373 +      unfolding AE_density[OF distributed_borel_measurable[OF Pxyz]]
1.374 +      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
1.375 +      by eventually_elim (auto simp: divide_pos_pos mult_pos_pos)
1.376 +    show "integrable ?P ?f"
1.377 +      unfolding integrable_def
1.378 +      using fin neg by (auto intro!: M simp: split_beta')
1.379 +    show "integrable ?P (\<lambda>x. - log b (?f x))"
1.380 +      apply (subst integral_density)
1.381 +      apply (auto intro!: M) []
1.382 +      apply (auto intro!: M distributed_real_AE[OF Pxyz]) []
1.383 +      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
1.384 +      apply (rule integrable_cong_AE[THEN iffD1, OF _ _ _ I3])
1.385 +      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
1.386 +      apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
1.387 +      using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
1.388 +      apply eventually_elim
1.389 +      apply (auto simp: log_divide_eq log_mult_eq zero_le_mult_iff zero_less_mult_iff zero_less_divide_iff field_simps)
1.390 +      done
1.391 +  qed (auto simp: b_gt_1 minus_log_convex)
1.392 +  also have "\<dots> = conditional_mutual_information b S T P X Y Z"
1.393 +    unfolding `?eq`
1.394 +    apply (subst integral_density)
1.395 +    apply (auto intro!: M) []
1.396 +    apply (auto intro!: M distributed_real_AE[OF Pxyz]) []
1.397 +    apply (auto intro!: M borel_measurable_uminus borel_measurable_log simp: split_beta') []
1.398 +    apply (intro integral_cong_AE)
1.399 +    using ae1 ae2 ae3 ae4 ae5 ae6 ae7 ae8 Pxyz[THEN distributed_real_AE]
1.400 +    apply eventually_elim
1.401 +    apply (auto simp: log_divide_eq zero_less_mult_iff zero_less_divide_iff field_simps)
1.402 +    done
1.403 +  finally show ?nonneg
1.404 +    by simp
1.405  qed
1.406
1.407  lemma (in information_space) conditional_mutual_information_eq:
1.408 @@ -1031,86 +1084,21 @@
1.409    assumes X: "simple_function M X" and Y: "simple_function M Y" and Z: "simple_function M Z"
1.410    shows "0 \<le> \<I>(X ; Y | Z)"
1.411  proof -
1.412 -  def Pz \<equiv> "\<lambda>x. if x \<in> Z`space M then measure M (Z -` {x} \<inter> space M) else 0"
1.413 -  def Pxz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Z x))`space M then measure M ((\<lambda>x. (X x, Z x)) -` {x} \<inter> space M) else 0"
1.414 -  def Pyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (Y x, Z x))`space M then measure M ((\<lambda>x. (Y x, Z x)) -` {x} \<inter> space M) else 0"
1.415 -  def Pxyz \<equiv> "\<lambda>x. if x \<in> (\<lambda>x. (X x, Y x, Z x))`space M then measure M ((\<lambda>x. (X x, Y x, Z x)) -` {x} \<inter> space M) else 0"
1.416 -  let ?M = "X`space M \<times> Y`space M \<times> Z`space M"
1.417 -
1.418 -  note XZ = simple_function_Pair[OF X Z]
1.419 -  note YZ = simple_function_Pair[OF Y Z]
1.420 -  note XYZ = simple_function_Pair[OF X simple_function_Pair[OF Y Z]]
1.421 -  have Pz: "simple_distributed M Z Pz"
1.422 -    using Z by (rule simple_distributedI) (auto simp: Pz_def)
1.423 -  have Pxz: "simple_distributed M (\<lambda>x. (X x, Z x)) Pxz"
1.424 -    using XZ by (rule simple_distributedI) (auto simp: Pxz_def)
1.425 -  have Pyz: "simple_distributed M (\<lambda>x. (Y x, Z x)) Pyz"
1.426 -    using YZ by (rule simple_distributedI) (auto simp: Pyz_def)
1.427 -  have Pxyz: "simple_distributed M (\<lambda>x. (X x, Y x, Z x)) Pxyz"
1.428 -    using XYZ by (rule simple_distributedI) (auto simp: Pxyz_def)
1.429 -
1.430 -  { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>x\<in>X ` space M. Pxz (x, z)) = Pz z"
1.431 -      using distributed_marginal_eq_joint_simple[OF X Pz Pxz z]
1.432 -      by (auto intro!: setsum_cong simp: Pxz_def) }
1.433 -  note marginal1 = this
1.434 -
1.435 -  { fix z assume z: "z \<in> Z ` space M" then have "(\<Sum>y\<in>Y ` space M. Pyz (y, z)) = Pz z"
1.436 -      using distributed_marginal_eq_joint_simple[OF Y Pz Pyz z]
1.437 -      by (auto intro!: setsum_cong simp: Pyz_def) }
1.438 -  note marginal2 = this
1.439 -
1.440 -  have "- \<I>(X ; Y | Z) = - (\<Sum>(x, y, z) \<in> ?M. Pxyz (x, y, z) * log b (Pxyz (x, y, z) / (Pxz (x, z) * (Pyz (y,z) / Pz z))))"
1.441 -    unfolding conditional_mutual_information_eq[OF Pz Pyz Pxz Pxyz]
1.442 -    using X Y Z by (auto intro!: setsum_mono_zero_left simp: Pxyz_def simple_functionD)
1.443 -  also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z))"
1.444 -    unfolding split_beta'
1.445 -  proof (rule log_setsum_divide)
1.446 -    show "?M \<noteq> {}" using not_empty by simp
1.447 -    show "1 < b" using b_gt_1 .
1.448 -
1.449 -    show "finite ?M" using X Y Z by (auto simp: simple_functionD)
1.450 -
1.451 -    then show "(\<Sum>x\<in>?M. Pxyz (fst x, fst (snd x), snd (snd x))) = 1"
1.452 -      apply (subst Pxyz[THEN simple_distributed_setsum_space, symmetric])
1.453 -      apply simp
1.454 -      apply (intro setsum_mono_zero_right)
1.455 -      apply (auto simp: Pxyz_def)
1.456 -      done
1.457 -    let ?N = "(\<lambda>x. (X x, Y x, Z x)) ` space M"
1.458 -    fix x assume x: "x \<in> ?M"
1.459 -    let ?Q = "Pxyz (fst x, fst (snd x), snd (snd x))"
1.460 -    let ?P = "Pxz (fst x, snd (snd x)) * (Pyz (fst (snd x), snd (snd x)) / Pz (snd (snd x)))"
1.461 -    from x show "0 \<le> ?Q" "0 \<le> ?P"
1.462 -      using Pxyz[THEN simple_distributed, THEN distributed_real_AE]
1.463 -      using Pxz[THEN simple_distributed, THEN distributed_real_AE]
1.464 -      using Pyz[THEN simple_distributed, THEN distributed_real_AE]
1.465 -      using Pz[THEN simple_distributed, THEN distributed_real_AE]
1.466 -      by (auto intro!: mult_nonneg_nonneg divide_nonneg_nonneg simp: AE_count_space Pxyz_def Pxz_def Pyz_def Pz_def)
1.467 -    moreover assume "0 < ?Q"
1.468 -    moreover have "AE x in count_space ?N. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.469 -      by (intro subdensity_real[of "\<lambda>x. snd (snd x)", OF _ Pxyz[THEN simple_distributed] Pz[THEN simple_distributed]]) (auto intro: measurable_snd')
1.470 -    then have "\<And>x. Pz (snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.471 -      by (auto simp: Pz_def Pxyz_def AE_count_space)
1.472 -    moreover have "AE x in count_space ?N. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.473 -      by (intro subdensity_real[of "\<lambda>x. (fst x, snd (snd x))", OF _ Pxyz[THEN simple_distributed] Pxz[THEN simple_distributed]]) (auto intro: measurable_Pair measurable_snd')
1.474 -    then have "\<And>x. Pxz (fst x, snd (snd x)) = 0 \<longrightarrow> Pxyz x = 0"
1.475 -      by (auto simp: Pz_def Pxyz_def AE_count_space)
1.476 -    moreover have "AE x in count_space ?N. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
1.477 -      by (intro subdensity_real[of snd, OF _ Pxyz[THEN simple_distributed] Pyz[THEN simple_distributed]]) (auto intro: measurable_Pair)
1.478 -    then have "\<And>x. Pyz (snd x) = 0 \<longrightarrow> Pxyz x = 0"
1.479 -      by (auto simp: Pz_def Pxyz_def AE_count_space)
1.480 -    ultimately show "0 < ?P" using x by (auto intro!: divide_pos_pos mult_pos_pos simp: less_le)
1.481 -  qed
1.482 -  also have "(\<Sum>(x, y, z) \<in> ?M. Pxz (x, z) * (Pyz (y,z) / Pz z)) = (\<Sum>z\<in>Z`space M. Pz z)"
1.483 -    apply (simp add: setsum_cartesian_product')
1.484 -    apply (subst setsum_commute)
1.485 -    apply (subst (2) setsum_commute)
1.486 -    apply (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] marginal1 marginal2
1.487 -          intro!: setsum_cong)
1.488 -    done
1.489 -  also have "log b (\<Sum>z\<in>Z`space M. Pz z) = 0"
1.490 -    using Pz[THEN simple_distributed_setsum_space] by simp
1.491 -  finally show ?thesis by simp
1.492 +  have [simp]: "count_space (X ` space M) \<Otimes>\<^isub>M count_space (Y ` space M) \<Otimes>\<^isub>M count_space (Z ` space M) =
1.493 +      count_space (X`space M \<times> Y`space M \<times> Z`space M)"
1.494 +    by (simp add: pair_measure_count_space X Y Z simple_functionD)
1.495 +  note sf = sigma_finite_measure_count_space_finite[OF simple_functionD(1)]
1.496 +  note sd = simple_distributedI[OF _ refl]
1.497 +  note sp = simple_function_Pair
1.498 +  show ?thesis
1.499 +   apply (rule conditional_mutual_information_generic_nonneg[OF sf[OF X] sf[OF Y] sf[OF Z]])
1.500 +   apply (rule simple_distributed[OF sd[OF X]])
1.501 +   apply (rule simple_distributed[OF sd[OF Z]])
1.502 +   apply (rule simple_distributed_joint[OF sd[OF sp[OF Y Z]]])
1.503 +   apply (rule simple_distributed_joint[OF sd[OF sp[OF X Z]]])
1.504 +   apply (rule simple_distributed_joint2[OF sd[OF sp[OF X sp[OF Y Z]]]])
1.505 +   apply (auto intro!: integrable_count_space simp: X Y Z simple_functionD)
1.506 +   done
1.507  qed
1.508
1.509  subsection {* Conditional Entropy *}
```