Cleanup information theory
authorhoelzl
Mon May 03 14:35:10 2010 +0200 (2010-05-03)
changeset 3662425153c08655e
parent 36623 d26348b667f2
child 36632 f96aa31b739d
child 36641 83d4e01ebda5
Cleanup information theory
src/HOL/Probability/Information.thy
src/HOL/Probability/Lebesgue.thy
src/HOL/Probability/Measure.thy
src/HOL/Probability/Probability_Space.thy
src/HOL/Probability/ex/Dining_Cryptographers.thy
     1.1 --- a/src/HOL/Probability/Information.thy	Mon May 03 14:35:10 2010 +0200
     1.2 +++ b/src/HOL/Probability/Information.thy	Mon May 03 14:35:10 2010 +0200
     1.3 @@ -2,168 +2,263 @@
     1.4  imports Probability_Space Product_Measure "../Multivariate_Analysis/Convex"
     1.5  begin
     1.6  
     1.7 -lemma pos_neg_part_abs:
     1.8 -  fixes f :: "'a \<Rightarrow> real"
     1.9 -  shows "pos_part f x + neg_part f x = \<bar>f x\<bar>"
    1.10 -unfolding real_abs_def pos_part_def neg_part_def by auto
    1.11 +section "Convex theory"
    1.12  
    1.13 -lemma pos_part_abs:
    1.14 -  fixes f :: "'a \<Rightarrow> real"
    1.15 -  shows "pos_part (\<lambda> x. \<bar>f x\<bar>) y = \<bar>f y\<bar>"
    1.16 -unfolding pos_part_def real_abs_def by auto
    1.17 -
    1.18 -lemma neg_part_abs:
    1.19 -  fixes f :: "'a \<Rightarrow> real"
    1.20 -  shows "neg_part (\<lambda> x. \<bar>f x\<bar>) y = 0"
    1.21 -unfolding neg_part_def real_abs_def by auto
    1.22 +lemma log_setsum:
    1.23 +  assumes "finite s" "s \<noteq> {}"
    1.24 +  assumes "b > 1"
    1.25 +  assumes "(\<Sum> i \<in> s. a i) = 1"
    1.26 +  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
    1.27 +  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
    1.28 +  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    1.29 +proof -
    1.30 +  have "convex_on {0 <..} (\<lambda> x. - log b x)"
    1.31 +    by (rule minus_log_convex[OF `b > 1`])
    1.32 +  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    1.33 +    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
    1.34 +  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
    1.35 +qed
    1.36  
    1.37 -lemma (in measure_space) int_abs:
    1.38 -  assumes "integrable f"
    1.39 -  shows "integrable (\<lambda> x. \<bar>f x\<bar>)"
    1.40 -using assms
    1.41 +lemma log_setsum':
    1.42 +  assumes "finite s" "s \<noteq> {}"
    1.43 +  assumes "b > 1"
    1.44 +  assumes "(\<Sum> i \<in> s. a i) = 1"
    1.45 +  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
    1.46 +          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
    1.47 +  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    1.48  proof -
    1.49 -  from assms obtain p q where pq: "p \<in> nnfis (pos_part f)" "q \<in> nnfis (neg_part f)"
    1.50 -    unfolding integrable_def by auto
    1.51 -  hence "p + q \<in> nnfis (\<lambda> x. pos_part f x + neg_part f x)"
    1.52 -    using nnfis_add by auto
    1.53 -  hence "p + q \<in> nnfis (\<lambda> x. \<bar>f x\<bar>)" using pos_neg_part_abs[of f] by simp
    1.54 -  thus ?thesis unfolding integrable_def
    1.55 -    using ext[OF pos_part_abs[of f], of "\<lambda> y. y"]
    1.56 -      ext[OF neg_part_abs[of f], of "\<lambda> y. y"]
    1.57 -    using nnfis_0 by auto
    1.58 +  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    1.59 +    using assms by (auto intro!: setsum_mono_zero_cong_left)
    1.60 +  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
    1.61 +  proof (rule log_setsum)
    1.62 +    have "setsum a (s - {i. a i = 0}) = setsum a s"
    1.63 +      using assms(1) by (rule setsum_mono_zero_cong_left) auto
    1.64 +    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
    1.65 +      "finite (s - {i. a i = 0})" using assms by simp_all
    1.66 +
    1.67 +    show "s - {i. a i = 0} \<noteq> {}"
    1.68 +    proof
    1.69 +      assume *: "s - {i. a i = 0} = {}"
    1.70 +      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
    1.71 +      with sum_1 show False by simp
    1.72 +qed
    1.73 +
    1.74 +    fix i assume "i \<in> s - {i. a i = 0}"
    1.75 +    hence "i \<in> s" "a i \<noteq> 0" by simp_all
    1.76 +    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
    1.77 +  qed fact+
    1.78 +  ultimately show ?thesis by simp
    1.79  qed
    1.80  
    1.81 -lemma (in measure_space) measure_mono:
    1.82 -  assumes "a \<subseteq> b" "a \<in> sets M" "b \<in> sets M"
    1.83 -  shows "measure M a \<le> measure M b"
    1.84 +section "Information theory"
    1.85 +
    1.86 +lemma (in finite_prob_space) sum_over_space_distrib:
    1.87 +  "(\<Sum>x\<in>X`space M. distribution X {x}) = 1"
    1.88 +  unfolding distribution_def prob_space[symmetric] using finite_space
    1.89 +  by (subst measure_finitely_additive'')
    1.90 +     (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=prob])
    1.91 +
    1.92 +locale finite_information_space = finite_prob_space +
    1.93 +  fixes b :: real assumes b_gt_1: "1 < b"
    1.94 +
    1.95 +definition
    1.96 +  "KL_divergence b M X Y =
    1.97 +    measure_space.integral (M\<lparr>measure := X\<rparr>)
    1.98 +                           (\<lambda>x. log b ((measure_space.RN_deriv (M \<lparr>measure := Y\<rparr> ) X) x))"
    1.99 +
   1.100 +lemma (in finite_prob_space) distribution_mono:
   1.101 +  assumes "\<And>t. \<lbrakk> t \<in> space M ; X t \<in> x \<rbrakk> \<Longrightarrow> Y t \<in> y"
   1.102 +  shows "distribution X x \<le> distribution Y y"
   1.103 +  unfolding distribution_def
   1.104 +  using assms by (auto simp: sets_eq_Pow intro!: measure_mono)
   1.105 +
   1.106 +lemma (in prob_space) distribution_remove_const:
   1.107 +  shows "joint_distribution X (\<lambda>x. ()) {(x, ())} = distribution X {x}"
   1.108 +  and "joint_distribution (\<lambda>x. ()) X {((), x)} = distribution X {x}"
   1.109 +  and "joint_distribution X (\<lambda>x. (Y x, ())) {(x, y, ())} = joint_distribution X Y {(x, y)}"
   1.110 +  and "joint_distribution X (\<lambda>x. ((), Y x)) {(x, (), y)} = joint_distribution X Y {(x, y)}"
   1.111 +  and "distribution (\<lambda>x. ()) {()} = 1"
   1.112 +  unfolding prob_space[symmetric]
   1.113 +  by (auto intro!: arg_cong[where f=prob] simp: distribution_def)
   1.114 +
   1.115 +
   1.116 +context finite_information_space
   1.117 +begin
   1.118 +
   1.119 +lemma distribution_mono_gt_0:
   1.120 +  assumes gt_0: "0 < distribution X x"
   1.121 +  assumes *: "\<And>t. \<lbrakk> t \<in> space M ; X t \<in> x \<rbrakk> \<Longrightarrow> Y t \<in> y"
   1.122 +  shows "0 < distribution Y y"
   1.123 +  by (rule less_le_trans[OF gt_0 distribution_mono]) (rule *)
   1.124 +
   1.125 +lemma
   1.126 +  assumes "0 \<le> A" and pos: "0 < A \<Longrightarrow> 0 < B" "0 < A \<Longrightarrow> 0 < C"
   1.127 +  shows mult_log_mult: "A * log b (B * C) = A * log b B + A * log b C" (is "?mult")
   1.128 +  and mult_log_divide: "A * log b (B / C) = A * log b B - A * log b C" (is "?div")
   1.129  proof -
   1.130 -  have "b = a \<union> (b - a)" using assms by auto
   1.131 -  moreover have "{} = a \<inter> (b - a)" by auto
   1.132 -  ultimately have "measure M b = measure M a + measure M (b - a)"
   1.133 -    using measure_additive[of a "b - a"] local.Diff[of b a] assms by auto
   1.134 -  moreover have "measure M (b - a) \<ge> 0" using positive assms by auto
   1.135 -  ultimately show "measure M a \<le> measure M b" by auto
   1.136 +  have "?mult \<and> ?div"
   1.137 +proof (cases "A = 0")
   1.138 +  case False
   1.139 +  hence "0 < A" using `0 \<le> A` by auto
   1.140 +    with pos[OF this] show "?mult \<and> ?div" using b_gt_1
   1.141 +      by (auto simp: log_divide log_mult field_simps)
   1.142 +qed simp
   1.143 +  thus ?mult and ?div by auto
   1.144  qed
   1.145  
   1.146 -lemma (in measure_space) integral_0:
   1.147 -  fixes f :: "'a \<Rightarrow> real"
   1.148 -  assumes "integrable f" "integral f = 0" "nonneg f" and borel: "f \<in> borel_measurable M"
   1.149 -  shows "measure M ({x. f x \<noteq> 0} \<inter> space M) = 0"
   1.150 -proof -
   1.151 -  have "{x. f x \<noteq> 0} = {x. \<bar>f x\<bar> > 0}" by auto
   1.152 -  moreover
   1.153 -  { fix y assume "y \<in> {x. \<bar> f x \<bar> > 0}"
   1.154 -    hence "\<bar> f y \<bar> > 0" by auto
   1.155 -    hence "\<exists> n. \<bar>f y\<bar> \<ge> inverse (real (Suc n))"
   1.156 -      using ex_inverse_of_nat_Suc_less[of "\<bar>f y\<bar>"] less_imp_le unfolding real_of_nat_def by auto
   1.157 -    hence "y \<in> (\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))})"
   1.158 -      by auto }
   1.159 -  moreover
   1.160 -  { fix y assume "y \<in> (\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))})"
   1.161 -    then obtain n where n: "y \<in> {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))}" by auto
   1.162 -    hence "\<bar>f y\<bar> \<ge> inverse (real (Suc n))" by auto
   1.163 -    hence "\<bar>f y\<bar> > 0"
   1.164 -      using real_of_nat_Suc_gt_zero
   1.165 -        positive_imp_inverse_positive[of "real_of_nat (Suc n)"] by fastsimp
   1.166 -    hence "y \<in> {x. \<bar>f x\<bar> > 0}" by auto }
   1.167 -  ultimately have fneq0_UN: "{x. f x \<noteq> 0} = (\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))})"
   1.168 -    by blast
   1.169 -  { fix n
   1.170 -    have int_one: "integrable (\<lambda> x. \<bar>f x\<bar> ^ 1)" using int_abs assms by auto
   1.171 -    have "measure M (f -` {inverse (real (Suc n))..} \<inter> space M)
   1.172 -           \<le> integral (\<lambda> x. \<bar>f x\<bar> ^ 1) / (inverse (real (Suc n)) ^ 1)"
   1.173 -      using markov_ineq[OF `integrable f` _ int_one] real_of_nat_Suc_gt_zero by auto
   1.174 -    hence le0: "measure M (f -` {inverse (real (Suc n))..} \<inter> space M) \<le> 0"
   1.175 -      using assms unfolding nonneg_def by auto
   1.176 -    have "{x. f x \<ge> inverse (real (Suc n))} \<inter> space M \<in> sets M"
   1.177 -      apply (subst Int_commute) unfolding Int_def
   1.178 -      using borel[unfolded borel_measurable_ge_iff] by simp
   1.179 -    hence m0: "measure M ({x. f x \<ge> inverse (real (Suc n))} \<inter> space M) = 0 \<and>
   1.180 -      {x. f x \<ge> inverse (real (Suc n))} \<inter> space M \<in> sets M"
   1.181 -      using positive le0 unfolding atLeast_def by fastsimp }
   1.182 -  moreover hence "range (\<lambda> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M) \<subseteq> sets M"
   1.183 -    by auto
   1.184 -  moreover
   1.185 -  { fix n
   1.186 -    have "inverse (real (Suc n)) \<ge> inverse (real (Suc (Suc n)))"
   1.187 -      using less_imp_inverse_less real_of_nat_Suc_gt_zero[of n] by fastsimp
   1.188 -    hence "\<And> x. f x \<ge> inverse (real (Suc n)) \<Longrightarrow> f x \<ge> inverse (real (Suc (Suc n)))" by (rule order_trans)
   1.189 -    hence "{x. f x \<ge> inverse (real (Suc n))} \<inter> space M
   1.190 -         \<subseteq> {x. f x \<ge> inverse (real (Suc (Suc n)))} \<inter> space M" by auto }
   1.191 -  ultimately have "(\<lambda> x. 0) ----> measure M (\<Union> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M)"
   1.192 -    using monotone_convergence[of "\<lambda> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M"]
   1.193 -    unfolding o_def by (simp del: of_nat_Suc)
   1.194 -  hence "measure M (\<Union> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M) = 0"
   1.195 -    using LIMSEQ_const[of 0] LIMSEQ_unique by simp
   1.196 -  hence "measure M ((\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))}) \<inter> space M) = 0"
   1.197 -    using assms unfolding nonneg_def by auto
   1.198 -  thus "measure M ({x. f x \<noteq> 0} \<inter> space M) = 0" using fneq0_UN by simp
   1.199 +lemma split_pairs:
   1.200 +  shows
   1.201 +    "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   1.202 +    "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   1.203 +
   1.204 +ML {*
   1.205 +
   1.206 +  (* tactic to solve equations of the form @{term "W * log b (X / (Y * Z)) = W * log b X - W * log b (Y * Z)"}
   1.207 +     where @{term W} is a joint distribution of @{term X}, @{term Y}, and @{term Z}. *)
   1.208 +
   1.209 +  val mult_log_intros = [@{thm mult_log_divide}, @{thm mult_log_mult}]
   1.210 +  val intros = [@{thm divide_pos_pos}, @{thm mult_pos_pos}, @{thm positive_distribution}]
   1.211 +
   1.212 +  val distribution_gt_0_tac = (rtac @{thm distribution_mono_gt_0}
   1.213 +    THEN' assume_tac
   1.214 +    THEN' clarsimp_tac (clasimpset_of @{context} addsimps2 @{thms split_pairs}))
   1.215 +
   1.216 +  val distr_mult_log_eq_tac = REPEAT_ALL_NEW (CHANGED o TRY o
   1.217 +    (resolve_tac (mult_log_intros @ intros)
   1.218 +      ORELSE' distribution_gt_0_tac
   1.219 +      ORELSE' clarsimp_tac (clasimpset_of @{context})))
   1.220 +
   1.221 +  fun instanciate_term thy redex intro =
   1.222 +    let
   1.223 +      val intro_concl = Thm.concl_of intro
   1.224 +
   1.225 +      val lhs = intro_concl |> HOLogic.dest_Trueprop |> HOLogic.dest_eq |> fst
   1.226 +
   1.227 +      val m = SOME (Pattern.match thy (lhs, redex) (Vartab.empty, Vartab.empty))
   1.228 +        handle Pattern.MATCH => NONE
   1.229 +
   1.230 +    in
   1.231 +      Option.map (fn m => Envir.subst_term m intro_concl) m
   1.232 +    end
   1.233 +
   1.234 +  fun mult_log_simproc simpset redex =
   1.235 +  let
   1.236 +    val ctxt = Simplifier.the_context simpset
   1.237 +    val thy = ProofContext.theory_of ctxt
   1.238 +    fun prove (SOME thm) = (SOME
   1.239 +          (Goal.prove ctxt [] [] thm (K (distr_mult_log_eq_tac 1))
   1.240 +           |> mk_meta_eq)
   1.241 +            handle THM _ => NONE)
   1.242 +      | prove NONE = NONE
   1.243 +  in
   1.244 +    get_first (instanciate_term thy (term_of redex) #> prove) mult_log_intros
   1.245 +  end
   1.246 +*}
   1.247 +
   1.248 +simproc_setup mult_log ("distribution X x * log b (A * B)" |
   1.249 +                        "distribution X x * log b (A / B)") = {* K mult_log_simproc *}
   1.250 +
   1.251 +end
   1.252 +
   1.253 +lemma KL_divergence_eq_finite:
   1.254 +  assumes u: "finite_measure_space (M\<lparr>measure := u\<rparr>)"
   1.255 +  assumes v: "finite_measure_space (M\<lparr>measure := v\<rparr>)"
   1.256 +  assumes u_0: "\<And>x. \<lbrakk> x \<in> space M ; v {x} = 0 \<rbrakk> \<Longrightarrow> u {x} = 0"
   1.257 +  shows "KL_divergence b M u v = (\<Sum>x\<in>space M. u {x} * log b (u {x} / v {x}))" (is "_ = ?sum")
   1.258 +proof (simp add: KL_divergence_def, subst finite_measure_space.integral_finite_singleton, simp_all add: u)
   1.259 +  have ms_u: "measure_space (M\<lparr>measure := u\<rparr>)"
   1.260 +    using u unfolding finite_measure_space_def by simp
   1.261 +
   1.262 +  show "(\<Sum>x \<in> space M. log b (measure_space.RN_deriv (M\<lparr>measure := v\<rparr>) u x) * u {x}) = ?sum"
   1.263 +    apply (rule setsum_cong[OF refl])
   1.264 +    apply simp
   1.265 +    apply (safe intro!: arg_cong[where f="log b"] )
   1.266 +    apply (subst finite_measure_space.RN_deriv_finite_singleton)
   1.267 +    using assms ms_u by auto
   1.268  qed
   1.269  
   1.270 -definition
   1.271 -  "KL_divergence b M u v =
   1.272 -    measure_space.integral (M\<lparr>measure := u\<rparr>)
   1.273 -                           (\<lambda>x. log b ((measure_space.RN_deriv (M \<lparr>measure := v\<rparr> ) u) x))"
   1.274 -
   1.275 -lemma (in finite_prob_space) finite_measure_space:
   1.276 -  shows "finite_measure_space \<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   1.277 -    (is "finite_measure_space ?S")
   1.278 -proof (rule finite_Pow_additivity_sufficient, simp_all)
   1.279 -  show "finite (X ` space M)" using finite_space by simp
   1.280 -
   1.281 -  show "positive ?S (distribution X)" unfolding distribution_def
   1.282 -    unfolding positive_def using positive'[unfolded positive_def] sets_eq_Pow by auto
   1.283 +lemma log_setsum_divide:
   1.284 +  assumes "finite S" and "S \<noteq> {}" and "1 < b"
   1.285 +  assumes "(\<Sum>x\<in>S. g x) = 1"
   1.286 +  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
   1.287 +  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
   1.288 +  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
   1.289 +proof -
   1.290 +  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
   1.291 +    using `1 < b` by (subst log_le_cancel_iff) auto
   1.292  
   1.293 -  show "additive ?S (distribution X)" unfolding additive_def distribution_def
   1.294 -  proof (simp, safe)
   1.295 -    fix x y
   1.296 -    have x: "(X -` x) \<inter> space M \<in> sets M"
   1.297 -      and y: "(X -` y) \<inter> space M \<in> sets M" using sets_eq_Pow by auto
   1.298 -    assume "x \<inter> y = {}"
   1.299 -    from additive[unfolded additive_def, rule_format, OF x y] this
   1.300 -    have "prob (((X -` x) \<union> (X -` y)) \<inter> space M) =
   1.301 -      prob ((X -` x) \<inter> space M) + prob ((X -` y) \<inter> space M)"
   1.302 -      apply (subst Int_Un_distrib2)
   1.303 -      by auto
   1.304 -    thus "prob ((X -` x \<union> X -` y) \<inter> space M) = prob (X -` x \<inter> space M) + prob (X -` y \<inter> space M)"
   1.305 -      by auto
   1.306 +  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
   1.307 +  proof (unfold setsum_negf[symmetric], rule setsum_cong)
   1.308 +    fix x assume x: "x \<in> S"
   1.309 +    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
   1.310 +    proof (cases "g x = 0")
   1.311 +      case False
   1.312 +      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
   1.313 +      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
   1.314 +    qed simp
   1.315 +  qed rule
   1.316 +  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
   1.317 +  proof (rule log_setsum')
   1.318 +    fix x assume x: "x \<in> S" "0 < g x"
   1.319 +    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
   1.320 +  qed fact+
   1.321 +  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
   1.322 +    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
   1.323 +        split: split_if_asm)
   1.324 +  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
   1.325 +  proof (rule log_mono)
   1.326 +    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
   1.327 +    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
   1.328 +    proof (rule setsum_strict_mono)
   1.329 +      show "finite (S - {x. g x = 0})" using `finite S` by simp
   1.330 +      show "S - {x. g x = 0} \<noteq> {}"
   1.331 +      proof
   1.332 +        assume "S - {x. g x = 0} = {}"
   1.333 +        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
   1.334 +        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
   1.335 +      qed
   1.336 +      fix x assume "x \<in> S - {x. g x = 0}"
   1.337 +      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
   1.338 +    qed
   1.339 +    finally show "0 < ?sum" .
   1.340 +    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
   1.341 +      using `finite S` pos by (auto intro!: setsum_mono2)
   1.342    qed
   1.343 +  finally show ?thesis .
   1.344  qed
   1.345  
   1.346 -lemma (in finite_prob_space) finite_prob_space:
   1.347 -  "finite_prob_space \<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   1.348 -  (is "finite_prob_space ?S")
   1.349 -  unfolding finite_prob_space_def prob_space_def prob_space_axioms_def
   1.350 -proof safe
   1.351 -  show "finite_measure_space ?S" by (rule finite_measure_space)
   1.352 -  thus "measure_space ?S" by (simp add: finite_measure_space_def)
   1.353 +lemma KL_divergence_positive_finite:
   1.354 +  assumes u: "finite_prob_space (M\<lparr>measure := u\<rparr>)"
   1.355 +  assumes v: "finite_prob_space (M\<lparr>measure := v\<rparr>)"
   1.356 +  assumes u_0: "\<And>x. \<lbrakk> x \<in> space M ; v {x} = 0 \<rbrakk> \<Longrightarrow> u {x} = 0"
   1.357 +  and "1 < b"
   1.358 +  shows "0 \<le> KL_divergence b M u v"
   1.359 +proof -
   1.360 +  interpret u: finite_prob_space "M\<lparr>measure := u\<rparr>" using u .
   1.361 +  interpret v: finite_prob_space "M\<lparr>measure := v\<rparr>" using v .
   1.362  
   1.363 -  have "X -` X ` space M \<inter> space M = space M" by auto
   1.364 -  thus "measure ?S (space ?S) = 1"
   1.365 -    by (simp add: distribution_def prob_space)
   1.366 -qed
   1.367 +  have *: "space M \<noteq> {}" using u.not_empty by simp
   1.368  
   1.369 -lemma (in finite_prob_space) finite_measure_space_image_prod:
   1.370 -  "finite_measure_space \<lparr>space = X ` space M \<times> Y ` space M,
   1.371 -    sets = Pow (X ` space M \<times> Y ` space M), measure_space.measure = distribution (\<lambda>x. (X x, Y x))\<rparr>"
   1.372 -  (is "finite_measure_space ?Z")
   1.373 -proof (rule finite_Pow_additivity_sufficient, simp_all)
   1.374 -  show "finite (X ` space M \<times> Y ` space M)" using finite_space by simp
   1.375 +  have "- (KL_divergence b M u v) \<le> log b (\<Sum>x\<in>space M. v {x})"
   1.376 +  proof (subst KL_divergence_eq_finite, safe intro!: log_setsum_divide *)
   1.377 +    show "finite_measure_space (M\<lparr>measure := u\<rparr>)"
   1.378 +      "finite_measure_space (M\<lparr>measure := v\<rparr>)"
   1.379 +       using u v unfolding finite_prob_space_eq by simp_all
   1.380  
   1.381 -  let ?d = "distribution (\<lambda>x. (X x, Y x))"
   1.382 +     show "finite (space M)" using u.finite_space by simp
   1.383 +     show "1 < b" by fact
   1.384 +     show "(\<Sum>x\<in>space M. u {x}) = 1" using u.sum_over_space_eq_1 by simp
   1.385  
   1.386 -  show "positive ?Z ?d"
   1.387 -    using sets_eq_Pow by (auto simp: positive_def distribution_def intro!: positive)
   1.388 +     fix x assume x: "x \<in> space M"
   1.389 +     thus pos: "0 \<le> u {x}" "0 \<le> v {x}"
   1.390 +       using u.positive u.sets_eq_Pow v.positive v.sets_eq_Pow by simp_all
   1.391  
   1.392 -  show "additive ?Z ?d" unfolding additive_def
   1.393 -  proof safe
   1.394 -    fix x y assume "x \<in> sets ?Z" and "y \<in> sets ?Z"
   1.395 -    assume "x \<inter> y = {}"
   1.396 -    thus "?d (x \<union> y) = ?d x + ?d y"
   1.397 -      apply (simp add: distribution_def)
   1.398 -      apply (subst measure_additive[unfolded sets_eq_Pow, simplified])
   1.399 -      by (auto simp: Un_Int_distrib Un_Int_distrib2 intro!: arg_cong[where f=prob])
   1.400 +     { assume "v {x} = 0" from u_0[OF x this] show "u {x} = 0" . }
   1.401 +     { assume "0 < u {x}"
   1.402 +       hence "v {x} \<noteq> 0" using u_0[OF x] by auto
   1.403 +       with pos show "0 < v {x}" by simp }
   1.404    qed
   1.405 +  thus "0 \<le> KL_divergence b M u v" using v.sum_over_space_eq_1 by simp
   1.406  qed
   1.407  
   1.408  definition (in prob_space)
   1.409 @@ -174,346 +269,142 @@
   1.410      in
   1.411        KL_divergence b prod_space (joint_distribution X Y) (measure prod_space)"
   1.412  
   1.413 -abbreviation (in finite_prob_space)
   1.414 -  finite_mutual_information ("\<I>\<^bsub>_\<^esub>'(_ ; _')") where
   1.415 -  "\<I>\<^bsub>b\<^esub>(X ; Y) \<equiv> mutual_information b
   1.416 +abbreviation (in finite_information_space)
   1.417 +  finite_mutual_information ("\<I>'(_ ; _')") where
   1.418 +  "\<I>(X ; Y) \<equiv> mutual_information b
   1.419      \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   1.420      \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   1.421  
   1.422 -abbreviation (in finite_prob_space)
   1.423 -  finite_mutual_information_2 :: "('a \<Rightarrow> 'c) \<Rightarrow> ('a \<Rightarrow> 'd) \<Rightarrow> real" ("\<I>'(_ ; _')") where
   1.424 -  "\<I>(X ; Y) \<equiv> \<I>\<^bsub>2\<^esub>(X ; Y)"
   1.425 +lemma (in finite_measure_space) measure_spaceI: "measure_space M"
   1.426 +  by unfold_locales
   1.427  
   1.428 -lemma (in prob_space) mutual_information_cong:
   1.429 -  assumes [simp]: "space S1 = space S3" "sets S1 = sets S3"
   1.430 -    "space S2 = space S4" "sets S2 = sets S4"
   1.431 -  shows "mutual_information b S1 S2 X Y = mutual_information b S3 S4 X Y"
   1.432 -  unfolding mutual_information_def by simp
   1.433 +lemma prod_measure_times_finite:
   1.434 +  assumes fms: "finite_measure_space M" "finite_measure_space M'" and a: "a \<in> space M \<times> space M'"
   1.435 +  shows "prod_measure M M' {a} = measure M {fst a} * measure M' {snd a}"
   1.436 +proof (cases a)
   1.437 +  case (Pair b c)
   1.438 +  hence a_eq: "{a} = {b} \<times> {c}" by simp
   1.439  
   1.440 -lemma (in prob_space) joint_distribution:
   1.441 -  "joint_distribution X Y = distribution (\<lambda>x. (X x, Y x))"
   1.442 -  unfolding joint_distribution_def_raw distribution_def_raw ..
   1.443 +  with fms[THEN finite_measure_space.measure_spaceI]
   1.444 +    fms[THEN finite_measure_space.sets_eq_Pow] a Pair
   1.445 +  show ?thesis unfolding a_eq
   1.446 +    by (subst prod_measure_times) simp_all
   1.447 +qed
   1.448  
   1.449 -lemma (in finite_prob_space) finite_mutual_information_reduce:
   1.450 -  "\<I>\<^bsub>b\<^esub>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   1.451 -    distribution (\<lambda>x. (X x, Y x)) {(x,y)} * log b (distribution (\<lambda>x. (X x, Y x)) {(x,y)} /
   1.452 -                                                   (distribution X {x} * distribution Y {y})))"
   1.453 -  (is "_ = setsum ?log ?prod")
   1.454 -  unfolding Let_def mutual_information_def KL_divergence_def
   1.455 -proof (subst finite_measure_space.integral_finite_singleton, simp_all add: joint_distribution)
   1.456 -  let ?X = "\<lparr>space = X ` space M, sets = Pow (X ` space M), measure_space.measure = distribution X\<rparr>"
   1.457 -  let ?Y = "\<lparr>space = Y ` space M, sets = Pow (Y ` space M), measure_space.measure = distribution Y\<rparr>"
   1.458 -  let ?P = "prod_measure_space ?X ?Y"
   1.459 +lemma setsum_cartesian_product':
   1.460 +  "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
   1.461 +  unfolding setsum_cartesian_product by simp
   1.462  
   1.463 -  interpret X: finite_measure_space "?X" by (rule finite_measure_space)
   1.464 -  moreover interpret Y: finite_measure_space "?Y" by (rule finite_measure_space)
   1.465 -  ultimately have ms_X: "measure_space ?X" and ms_Y: "measure_space ?Y" by unfold_locales
   1.466 -
   1.467 -  interpret P: finite_measure_space "?P" by (rule finite_measure_space_finite_prod_measure) (fact+)
   1.468 -
   1.469 -  let ?P' = "measure_update (\<lambda>_. distribution (\<lambda>x. (X x, Y x))) ?P"
   1.470 -  from finite_measure_space_image_prod[of X Y]
   1.471 -    sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
   1.472 -  show "finite_measure_space ?P'"
   1.473 -    by (simp add: X.sets_eq_Pow Y.sets_eq_Pow joint_distribution finite_measure_space_def prod_measure_space_def)
   1.474 +lemma (in finite_information_space)
   1.475 +  assumes MX: "finite_prob_space \<lparr> space = space MX, sets = sets MX, measure = distribution X\<rparr>"
   1.476 +    (is "finite_prob_space ?MX")
   1.477 +  assumes MY: "finite_prob_space \<lparr> space = space MY, sets = sets MY, measure = distribution Y\<rparr>"
   1.478 +    (is "finite_prob_space ?MY")
   1.479 +  and X_space: "X ` space M \<subseteq> space MX" and Y_space: "Y ` space M \<subseteq> space MY"
   1.480 +  shows mutual_information_eq_generic:
   1.481 +    "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   1.482 +      joint_distribution X Y {(x,y)} *
   1.483 +      log b (joint_distribution X Y {(x,y)} /
   1.484 +      (distribution X {x} * distribution Y {y})))"
   1.485 +    (is "?equality")
   1.486 +  and mutual_information_positive_generic:
   1.487 +    "0 \<le> mutual_information b MX MY X Y" (is "?positive")
   1.488 +proof -
   1.489 +  let ?P = "prod_measure_space ?MX ?MY"
   1.490 +  let ?measure = "joint_distribution X Y"
   1.491 +  let ?P' = "measure_update (\<lambda>_. ?measure) ?P"
   1.492  
   1.493 -  show "(\<Sum>x \<in> space ?P. log b (measure_space.RN_deriv ?P (distribution (\<lambda>x. (X x, Y x))) x) * distribution (\<lambda>x. (X x, Y x)) {x})
   1.494 -    = setsum ?log ?prod"
   1.495 -  proof (rule setsum_cong)
   1.496 -    show "space ?P = ?prod" unfolding prod_measure_space_def by simp
   1.497 -  next
   1.498 -    fix x assume x: "x \<in> X ` space M \<times> Y ` space M"
   1.499 -    then obtain d e where x_Pair: "x = (d, e)"
   1.500 -      and d: "d \<in> X ` space M"
   1.501 -      and e: "e \<in> Y ` space M" by auto
   1.502 -
   1.503 -    { fix x assume m_0: "measure ?P {x} = 0"
   1.504 -      have "distribution (\<lambda>x. (X x, Y x)) {x} = 0"
   1.505 -      proof (cases x)
   1.506 -        case (Pair a b)
   1.507 -        hence "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = (X -` {a} \<inter> space M) \<inter> (Y -` {b} \<inter> space M)"
   1.508 -          and x_prod: "{x} = {a} \<times> {b}" by auto
   1.509 +  interpret X: finite_prob_space "?MX" using MX .
   1.510 +  moreover interpret Y: finite_prob_space "?MY" using MY .
   1.511 +  ultimately have ms_X: "measure_space ?MX"
   1.512 +    and ms_Y: "measure_space ?MY" by unfold_locales
   1.513  
   1.514 -        let ?PROD = "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M"
   1.515 +  have fms_P: "finite_measure_space ?P"
   1.516 +      by (rule finite_measure_space_finite_prod_measure) fact+
   1.517 +
   1.518 +  have fms_P': "finite_measure_space ?P'"
   1.519 +      using finite_product_measure_space[of "space MX" "space MY"]
   1.520 +        X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
   1.521 +        X.sets_eq_Pow Y.sets_eq_Pow
   1.522 +      by (simp add: prod_measure_space_def)
   1.523  
   1.524 -        show ?thesis
   1.525 -        proof (cases "{a} \<subseteq> X ` space M \<and> {b} \<subseteq> Y ` space M")
   1.526 -          case False
   1.527 -          hence "?PROD = {}"
   1.528 -            unfolding Pair by auto
   1.529 -          thus ?thesis by (auto simp: distribution_def)
   1.530 -        next
   1.531 -          have [intro]: "prob ?PROD \<le> 0 \<Longrightarrow> prob ?PROD = 0"
   1.532 -            using sets_eq_Pow by (auto intro!: positive real_le_antisym[of _ 0])
   1.533 +  { fix x assume "x \<in> space ?P"
   1.534 +    hence x_in_MX: "{fst x} \<in> sets MX" using X.sets_eq_Pow
   1.535 +      by (auto simp: prod_measure_space_def)
   1.536 +
   1.537 +    assume "measure ?P {x} = 0"
   1.538 +    with prod_measure_times[OF ms_X ms_Y, of "{fst x}" "{snd x}"] x_in_MX
   1.539 +    have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
   1.540 +      by (simp add: prod_measure_space_def)
   1.541 +
   1.542 +    hence "joint_distribution X Y {x} = 0"
   1.543 +      by (cases x) (auto simp: distribution_order) }
   1.544 +  note measure_0 = this
   1.545  
   1.546 -          case True
   1.547 -          with prod_measure_times[OF ms_X ms_Y, simplified, of "{a}" "{b}"]
   1.548 -          have "prob (X -` {a} \<inter> space M) = 0 \<or> prob (Y -` {b} \<inter> space M) = 0" (is "?X_0 \<or> ?Y_0") using m_0
   1.549 -            by (simp add: prod_measure_space_def distribution_def Pair)
   1.550 -          thus ?thesis
   1.551 -          proof (rule disjE)
   1.552 -            assume ?X_0
   1.553 -            have "prob ?PROD \<le> prob (X -` {a} \<inter> space M)"
   1.554 -              using sets_eq_Pow Pair by (auto intro!: measure_mono)
   1.555 -            thus ?thesis using `?X_0` by (auto simp: distribution_def)
   1.556 -          next
   1.557 -            assume ?Y_0
   1.558 -            have "prob ?PROD \<le> prob (Y -` {b} \<inter> space M)"
   1.559 -              using sets_eq_Pow Pair by (auto intro!: measure_mono)
   1.560 -            thus ?thesis using `?Y_0` by (auto simp: distribution_def)
   1.561 -          qed
   1.562 -        qed
   1.563 -      qed }
   1.564 -    note measure_zero_joint_distribution = this
   1.565 +  show ?equality
   1.566 +    unfolding Let_def mutual_information_def using fms_P fms_P' measure_0 MX MY
   1.567 +    by (subst KL_divergence_eq_finite)
   1.568 +       (simp_all add: prod_measure_space_def prod_measure_times_finite
   1.569 +         finite_prob_space_eq setsum_cartesian_product')
   1.570  
   1.571 -    show "log b (measure_space.RN_deriv ?P (distribution (\<lambda>x. (X x, Y x))) x) * distribution (\<lambda>x. (X x, Y x)) {x} = ?log x"
   1.572 -    apply (cases "distribution (\<lambda>x. (X x, Y x)) {x} \<noteq> 0")
   1.573 -    apply (subst P.RN_deriv_finite_singleton)
   1.574 -    proof (simp_all add: x_Pair)
   1.575 -      from `finite_measure_space ?P'` show "measure_space ?P'" by (simp add: finite_measure_space_def)
   1.576 -    next
   1.577 -      fix x assume m_0: "measure ?P {x} = 0" thus "distribution (\<lambda>x. (X x, Y x)) {x} = 0" by fact
   1.578 -    next
   1.579 -      show "(d,e) \<in> space ?P" unfolding prod_measure_space_def using x x_Pair by simp
   1.580 -    next
   1.581 -      assume jd_0: "distribution (\<lambda>x. (X x, Y x)) {(d, e)} \<noteq> 0"
   1.582 -      show "measure ?P {(d,e)} \<noteq> 0"
   1.583 -      proof
   1.584 -        assume "measure ?P {(d,e)} = 0"
   1.585 -        from measure_zero_joint_distribution[OF this] jd_0
   1.586 -        show False by simp
   1.587 -      qed
   1.588 -    next
   1.589 -      assume jd_0: "distribution (\<lambda>x. (X x, Y x)) {(d, e)} \<noteq> 0"
   1.590 -      with prod_measure_times[OF ms_X ms_Y, simplified, of "{d}" "{e}"] d
   1.591 -      show "log b (distribution (\<lambda>x. (X x, Y x)) {(d, e)} / measure ?P {(d, e)}) =
   1.592 -        log b (distribution (\<lambda>x. (X x, Y x)) {(d, e)} / (distribution X {d} * distribution Y {e}))"
   1.593 -        by (auto intro!: arg_cong[where f="log b"] simp: prod_measure_space_def)
   1.594 -    qed
   1.595 +  show ?positive
   1.596 +    unfolding Let_def mutual_information_def using measure_0 b_gt_1
   1.597 +  proof (safe intro!: KL_divergence_positive_finite, simp_all)
   1.598 +    from ms_X ms_Y X.top Y.top X.prob_space Y.prob_space
   1.599 +    have "measure ?P (space ?P) = 1"
   1.600 +      by (simp add: prod_measure_space_def, subst prod_measure_times, simp_all)
   1.601 +    with fms_P show "finite_prob_space ?P"
   1.602 +      by (simp add: finite_prob_space_eq)
   1.603 +
   1.604 +    from ms_X ms_Y X.top Y.top X.prob_space Y.prob_space Y.not_empty X_space Y_space
   1.605 +    have "measure ?P' (space ?P') = 1" unfolding prob_space[symmetric]
   1.606 +      by (auto simp add: prod_measure_space_def distribution_def vimage_Times comp_def
   1.607 +        intro!: arg_cong[where f=prob])
   1.608 +    with fms_P' show "finite_prob_space ?P'"
   1.609 +      by (simp add: finite_prob_space_eq)
   1.610    qed
   1.611  qed
   1.612  
   1.613 -lemma (in finite_prob_space) distribution_log_split:
   1.614 -  assumes "1 < b"
   1.615 -  shows
   1.616 -  "distribution (\<lambda>x. (X x, Z x)) {(X x, z)} * log b (distribution (\<lambda>x. (X x, Z x)) {(X x, z)} /
   1.617 -                                                     (distribution X {X x} * distribution Z {z})) =
   1.618 -   distribution (\<lambda>x. (X x, Z x)) {(X x, z)} * log b (distribution (\<lambda>x. (X x, Z x)) {(X x, z)} /
   1.619 -                                                     distribution Z {z}) -
   1.620 -   distribution (\<lambda>x. (X x, Z x)) {(X x, z)} * log b (distribution X {X x})"
   1.621 -  (is "?lhs = ?rhs")
   1.622 -proof (cases "distribution (\<lambda>x. (X x, Z x)) {(X x, z)} = 0")
   1.623 -  case True thus ?thesis by simp
   1.624 -next
   1.625 -  case False
   1.626 -
   1.627 -  let ?dZ = "distribution Z"
   1.628 -  let ?dX = "distribution X"
   1.629 -  let ?dXZ = "distribution (\<lambda>x. (X x, Z x))"
   1.630 -
   1.631 -  have dist_nneg: "\<And>x X. 0 \<le> distribution X x"
   1.632 -    unfolding distribution_def using sets_eq_Pow by (auto intro: positive)
   1.633 -
   1.634 -  have "?lhs = ?dXZ {(X x, z)} * (log b (?dXZ {(X x, z)} / ?dZ {z}) - log b (?dX {X x}))"
   1.635 -  proof -
   1.636 -    have pos_dXZ: "0 < ?dXZ {(X x, z)}"
   1.637 -      using False dist_nneg[of "\<lambda>x. (X x, Z x)" "{(X x, z)}"] by auto
   1.638 -    moreover
   1.639 -    have "((\<lambda>x. (X x, Z x)) -` {(X x, z)}) \<inter> space M \<subseteq> (X -` {X x}) \<inter> space M" by auto
   1.640 -    hence "?dXZ {(X x, z)} \<le> ?dX {X x}"
   1.641 -      unfolding distribution_def
   1.642 -      by (rule measure_mono) (simp_all add: sets_eq_Pow)
   1.643 -    with pos_dXZ have "0 < ?dX {X x}" by (rule less_le_trans)
   1.644 -    moreover
   1.645 -    have "((\<lambda>x. (X x, Z x)) -` {(X x, z)}) \<inter> space M \<subseteq> (Z -` {z}) \<inter> space M" by auto
   1.646 -    hence "?dXZ {(X x, z)} \<le> ?dZ {z}"
   1.647 -      unfolding distribution_def
   1.648 -      by (rule measure_mono) (simp_all add: sets_eq_Pow)
   1.649 -    with pos_dXZ have "0 < ?dZ {z}" by (rule less_le_trans)
   1.650 -    moreover have "0 < b" by (rule less_trans[OF _ `1 < b`]) simp
   1.651 -    moreover have "b \<noteq> 1" by (rule ccontr) (insert `1 < b`, simp)
   1.652 -    ultimately show ?thesis
   1.653 -      using pos_dXZ
   1.654 -      apply (subst (2) mult_commute)
   1.655 -      apply (subst divide_divide_eq_left[symmetric])
   1.656 -      apply (subst log_divide)
   1.657 -      by (auto intro: divide_pos_pos)
   1.658 -  qed
   1.659 -  also have "... = ?rhs"
   1.660 -    by (simp add: field_simps)
   1.661 -  finally show ?thesis .
   1.662 -qed
   1.663 -
   1.664 -lemma (in finite_prob_space) finite_mutual_information_reduce_prod:
   1.665 -  "mutual_information b
   1.666 -    \<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>
   1.667 -    \<lparr> space = Y ` space M \<times> Z ` space M, sets = Pow (Y ` space M \<times> Z ` space M) \<rparr>
   1.668 -    X (\<lambda>x. (Y x,Z x)) =
   1.669 -    (\<Sum> (x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
   1.670 -      distribution (\<lambda>x. (X x, Y x,Z x)) {(x, y, z)} *
   1.671 -      log b (distribution (\<lambda>x. (X x, Y x,Z x)) {(x, y, z)} /
   1.672 -              (distribution X {x} * distribution (\<lambda>x. (Y x,Z x)) {(y,z)})))" (is "_ = setsum ?log ?space")
   1.673 -  unfolding Let_def mutual_information_def KL_divergence_def using finite_space
   1.674 -proof (subst finite_measure_space.integral_finite_singleton,
   1.675 -       simp_all add: prod_measure_space_def sigma_prod_sets_finite joint_distribution)
   1.676 -  let ?sets = "Pow (X ` space M \<times> Y ` space M \<times> Z ` space M)"
   1.677 -    and ?measure = "distribution (\<lambda>x. (X x, Y x, Z x))"
   1.678 -  let ?P = "\<lparr> space = ?space, sets = ?sets, measure = ?measure\<rparr>"
   1.679 -
   1.680 -  show "finite_measure_space ?P"
   1.681 -  proof (rule finite_Pow_additivity_sufficient, simp_all)
   1.682 -    show "finite ?space" using finite_space by auto
   1.683 -
   1.684 -    show "positive ?P ?measure"
   1.685 -      using sets_eq_Pow by (auto simp: positive_def distribution_def intro!: positive)
   1.686 -
   1.687 -    show "additive ?P ?measure"
   1.688 -    proof (simp add: additive_def distribution_def, safe)
   1.689 -      fix x y assume "x \<subseteq> ?space" and "y \<subseteq> ?space"
   1.690 -      assume "x \<inter> y = {}"
   1.691 -      thus "prob (((\<lambda>x. (X x, Y x, Z x)) -` x \<union> (\<lambda>x. (X x, Y x, Z x)) -` y) \<inter> space M) =
   1.692 -            prob ((\<lambda>x. (X x, Y x, Z x)) -` x \<inter> space M) + prob ((\<lambda>x. (X x, Y x, Z x)) -` y \<inter> space M)"
   1.693 -        apply (subst measure_additive[unfolded sets_eq_Pow, simplified])
   1.694 -        by (auto simp: Un_Int_distrib Un_Int_distrib2 intro!: arg_cong[where f=prob])
   1.695 -    qed
   1.696 -  qed
   1.697 +lemma (in finite_information_space) mutual_information_eq:
   1.698 +  "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   1.699 +    distribution (\<lambda>x. (X x, Y x)) {(x,y)} * log b (distribution (\<lambda>x. (X x, Y x)) {(x,y)} /
   1.700 +                                                   (distribution X {x} * distribution Y {y})))"
   1.701 +  by (subst mutual_information_eq_generic) (simp_all add: finite_prob_space_of_images)
   1.702  
   1.703 -  let ?X = "\<lparr>space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   1.704 -  and ?YZ = "\<lparr>space = Y ` space M \<times> Z ` space M, sets = Pow (Y ` space M \<times> Z ` space M), measure = distribution (\<lambda>x. (Y x, Z x))\<rparr>"
   1.705 -  let ?u = "prod_measure ?X ?YZ"
   1.706 -
   1.707 -  from finite_measure_space[of X] finite_measure_space_image_prod[of Y Z]
   1.708 -  have ms_X: "measure_space ?X" and ms_YZ: "measure_space ?YZ"
   1.709 -    by (simp_all add: finite_measure_space_def)
   1.710 -
   1.711 -  show "(\<Sum>x \<in> ?space. log b (measure_space.RN_deriv \<lparr>space=?space, sets=?sets, measure=?u\<rparr>
   1.712 -    (distribution (\<lambda>x. (X x, Y x, Z x))) x) * distribution (\<lambda>x. (X x, Y x, Z x)) {x})
   1.713 -    = setsum ?log ?space"
   1.714 -  proof (rule setsum_cong)
   1.715 -    fix x assume x: "x \<in> ?space"
   1.716 -    then obtain d e f where x_Pair: "x = (d, e, f)"
   1.717 -      and d: "d \<in> X ` space M"
   1.718 -      and e: "e \<in> Y ` space M"
   1.719 -      and f: "f \<in> Z ` space M" by auto
   1.720 -
   1.721 -    { fix x assume m_0: "?u {x} = 0"
   1.722 -
   1.723 -      let ?PROD = "(\<lambda>x. (X x, Y x, Z x)) -` {x} \<inter> space M"
   1.724 -      obtain a b c where Pair: "x = (a, b, c)" by (cases x)
   1.725 -      hence "?PROD = (X -` {a} \<inter> space M) \<inter> ((\<lambda>x. (Y x, Z x)) -` {(b, c)} \<inter> space M)"
   1.726 -        and x_prod: "{x} = {a} \<times> {(b, c)}" by auto
   1.727 -
   1.728 -      have "distribution (\<lambda>x. (X x, Y x, Z x)) {x} = 0"
   1.729 -      proof (cases "{a} \<subseteq> X ` space M")
   1.730 -        case False
   1.731 -        hence "?PROD = {}"
   1.732 -          unfolding Pair by auto
   1.733 -        thus ?thesis by (auto simp: distribution_def)
   1.734 -      next
   1.735 -        have [intro]: "prob ?PROD \<le> 0 \<Longrightarrow> prob ?PROD = 0"
   1.736 -          using sets_eq_Pow by (auto intro!: positive real_le_antisym[of _ 0])
   1.737 -
   1.738 -        case True
   1.739 -        with prod_measure_times[OF ms_X ms_YZ, simplified, of "{a}" "{(b,c)}"]
   1.740 -        have "prob (X -` {a} \<inter> space M) = 0 \<or> prob ((\<lambda>x. (Y x, Z x)) -` {(b, c)} \<inter> space M) = 0"
   1.741 -          (is "prob ?X = 0 \<or> prob ?Y = 0") using m_0
   1.742 -          by (simp add: prod_measure_space_def distribution_def Pair)
   1.743 -        thus ?thesis
   1.744 -        proof (rule disjE)
   1.745 -          assume "prob ?X = 0"
   1.746 -          have "prob ?PROD \<le> prob ?X"
   1.747 -            using sets_eq_Pow Pair by (auto intro!: measure_mono)
   1.748 -          thus ?thesis using `prob ?X = 0` by (auto simp: distribution_def)
   1.749 -        next
   1.750 -          assume "prob ?Y = 0"
   1.751 -          have "prob ?PROD \<le> prob ?Y"
   1.752 -            using sets_eq_Pow Pair by (auto intro!: measure_mono)
   1.753 -          thus ?thesis using `prob ?Y = 0` by (auto simp: distribution_def)
   1.754 -        qed
   1.755 -      qed }
   1.756 -    note measure_zero_joint_distribution = this
   1.757 -
   1.758 -    from x_Pair d e f finite_space
   1.759 -    show "log b (measure_space.RN_deriv \<lparr>space=?space, sets=?sets, measure=?u\<rparr>
   1.760 -      (distribution (\<lambda>x. (X x, Y x, Z x))) x) * distribution (\<lambda>x. (X x, Y x, Z x)) {x} = ?log x"
   1.761 -    apply (cases "distribution (\<lambda>x. (X x, Y x, Z x)) {x} \<noteq> 0")
   1.762 -    apply (subst finite_measure_space.RN_deriv_finite_singleton)
   1.763 -    proof simp_all
   1.764 -      show "measure_space ?P" using `finite_measure_space ?P` by (simp add: finite_measure_space_def)
   1.765 -
   1.766 -      from finite_measure_space_finite_prod_measure[OF finite_measure_space[of X]
   1.767 -        finite_measure_space_image_prod[of Y Z]] finite_space
   1.768 -      show "finite_measure_space \<lparr>space=?space, sets=?sets, measure=?u\<rparr>"
   1.769 -        by (simp add: prod_measure_space_def sigma_prod_sets_finite)
   1.770 -    next
   1.771 -      fix x assume "?u {x} = 0" thus "distribution (\<lambda>x. (X x, Y x, Z x)) {x} = 0" by fact
   1.772 -    next
   1.773 -      assume jd_0: "distribution (\<lambda>x. (X x, Y x, Z x)) {(d, e, f)} \<noteq> 0"
   1.774 -      show "?u {(d,e,f)} \<noteq> 0"
   1.775 -      proof
   1.776 -        assume "?u {(d, e, f)} = 0"
   1.777 -        from measure_zero_joint_distribution[OF this] jd_0
   1.778 -        show False by simp
   1.779 -      qed
   1.780 -    next
   1.781 -      assume jd_0: "distribution (\<lambda>x. (X x, Y x, Z x)) {(d, e, f)} \<noteq> 0"
   1.782 -      with prod_measure_times[OF ms_X ms_YZ, simplified, of "{d}" "{(e,f)}"] d
   1.783 -      show "log b (distribution (\<lambda>x. (X x, Y x, Z x)) {(d, e, f)} / ?u {(d, e, f)}) =
   1.784 -        log b (distribution (\<lambda>x. (X x, Y x, Z x)) {(d, e, f)} / (distribution X {d} * distribution (\<lambda>x. (Y x, Z x)) {(e,f)}))"
   1.785 -        by (auto intro!: arg_cong[where f="log b"] simp: prod_measure_space_def)
   1.786 -    qed
   1.787 -  qed simp
   1.788 -qed
   1.789 +lemma (in finite_information_space) mutual_information_positive: "0 \<le> \<I>(X;Y)"
   1.790 +  by (subst mutual_information_positive_generic) (simp_all add: finite_prob_space_of_images)
   1.791  
   1.792  definition (in prob_space)
   1.793    "entropy b s X = mutual_information b s s X X"
   1.794  
   1.795 -abbreviation (in finite_prob_space)
   1.796 -  finite_entropy ("\<H>\<^bsub>_\<^esub>'(_')") where
   1.797 -  "\<H>\<^bsub>b\<^esub>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
   1.798 -
   1.799 -abbreviation (in finite_prob_space)
   1.800 -  finite_entropy_2 ("\<H>'(_')") where
   1.801 -  "\<H>(X) \<equiv> \<H>\<^bsub>2\<^esub>(X)"
   1.802 +abbreviation (in finite_information_space)
   1.803 +  finite_entropy ("\<H>'(_')") where
   1.804 +  "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
   1.805  
   1.806 -lemma (in finite_prob_space) finite_entropy_reduce:
   1.807 -  assumes "1 < b"
   1.808 -  shows "\<H>\<^bsub>b\<^esub>(X) = -(\<Sum> x \<in> X ` space M. distribution X {x} * log b (distribution X {x}))"
   1.809 +lemma (in finite_information_space) joint_distribution_remove[simp]:
   1.810 +    "joint_distribution X X {(x, x)} = distribution X {x}"
   1.811 +  unfolding distribution_def by (auto intro!: arg_cong[where f=prob])
   1.812 +
   1.813 +lemma (in finite_information_space) entropy_eq:
   1.814 +  "\<H>(X) = -(\<Sum> x \<in> X ` space M. distribution X {x} * log b (distribution X {x}))"
   1.815  proof -
   1.816 -  have fin: "finite (X ` space M)" using finite_space by simp
   1.817 -
   1.818 -  have If_mult_distr: "\<And>A B C D. If A B C * D = If A (B * D) (C * D)" by auto
   1.819 -
   1.820 +  { fix f
   1.821    { fix x y
   1.822      have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   1.823 -    hence "distribution (\<lambda>x. (X x, X x))  {(x,y)} = (if x = y then distribution X {x} else 0)"
   1.824 +      hence "distribution (\<lambda>x. (X x, X x))  {(x,y)} * f x y = (if x = y then distribution X {x} * f x y else 0)"
   1.825        unfolding distribution_def by auto }
   1.826 -  moreover
   1.827 -  have "\<And>x. 0 \<le> distribution X x"
   1.828 -    unfolding distribution_def using finite_space sets_eq_Pow by (auto intro: positive)
   1.829 -  hence "\<And>x. distribution X x \<noteq> 0 \<Longrightarrow> 0 < distribution X x" by (auto simp: le_less)
   1.830 -  ultimately
   1.831 -  show ?thesis using `1 < b`
   1.832 -    by (auto intro!: setsum_cong
   1.833 -      simp: log_inverse If_mult_distr setsum_cases[OF fin] inverse_eq_divide[symmetric]
   1.834 -        entropy_def setsum_negf[symmetric] joint_distribution finite_mutual_information_reduce
   1.835 -        setsum_cartesian_product[symmetric])
   1.836 +    hence "(\<Sum>(x, y) \<in> X ` space M \<times> X ` space M. joint_distribution X X {(x, y)} * f x y) =
   1.837 +      (\<Sum>x \<in> X ` space M. distribution X {x} * f x x)"
   1.838 +      unfolding setsum_cartesian_product' by (simp add: setsum_cases finite_space) }
   1.839 +  note remove_cartesian_product = this
   1.840 +
   1.841 +  show ?thesis
   1.842 +    unfolding entropy_def mutual_information_eq setsum_negf[symmetric] remove_cartesian_product
   1.843 +    by (auto intro!: setsum_cong)
   1.844  qed
   1.845  
   1.846 -lemma log_inj: assumes "1 < b" shows "inj_on (log b) {0 <..}"
   1.847 -proof (rule inj_onI, simp)
   1.848 -  fix x y assume pos: "0 < x" "0 < y" and *: "log b x = log b y"
   1.849 -  show "x = y"
   1.850 -  proof (cases rule: linorder_cases)
   1.851 -    assume "x < y" hence "log b x < log b y"
   1.852 -      using log_less_cancel_iff[OF `1 < b`] pos by simp
   1.853 -    thus ?thesis using * by simp
   1.854 -  next
   1.855 -    assume "y < x" hence "log b y < log b x"
   1.856 -      using log_less_cancel_iff[OF `1 < b`] pos by simp
   1.857 -    thus ?thesis using * by simp
   1.858 -  qed simp
   1.859 -qed
   1.860 +lemma (in finite_information_space) entropy_positive: "0 \<le> \<H>(X)"
   1.861 +  unfolding entropy_def using mutual_information_positive .
   1.862  
   1.863  definition (in prob_space)
   1.864    "conditional_mutual_information b s1 s2 s3 X Y Z \<equiv>
   1.865 @@ -524,160 +415,181 @@
   1.866        mutual_information b s1 prod_space X (\<lambda>x. (Y x, Z x)) -
   1.867        mutual_information b s1 s3 X Z"
   1.868  
   1.869 -abbreviation (in finite_prob_space)
   1.870 -  finite_conditional_mutual_information ("\<I>\<^bsub>_\<^esub>'( _ ; _ | _ ')") where
   1.871 -  "\<I>\<^bsub>b\<^esub>(X ; Y | Z) \<equiv> conditional_mutual_information b
   1.872 +abbreviation (in finite_information_space)
   1.873 +  finite_conditional_mutual_information ("\<I>'( _ ; _ | _ ')") where
   1.874 +  "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   1.875      \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   1.876      \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
   1.877      \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
   1.878      X Y Z"
   1.879  
   1.880 -abbreviation (in finite_prob_space)
   1.881 -  finite_conditional_mutual_information_2 ("\<I>'( _ ; _ | _ ')") where
   1.882 -  "\<I>(X ; Y | Z) \<equiv> \<I>\<^bsub>2\<^esub>(X ; Y | Z)"
   1.883 +lemma (in finite_information_space) setsum_distribution_gen:
   1.884 +  assumes "Z -` {c} \<inter> space M = (\<Union>x \<in> X`space M. Y -` {f x}) \<inter> space M"
   1.885 +  and "inj_on f (X`space M)"
   1.886 +  shows "(\<Sum>x \<in> X`space M. distribution Y {f x}) = distribution Z {c}"
   1.887 +  unfolding distribution_def assms
   1.888 +  using finite_space assms
   1.889 +  by (subst measure_finitely_additive'')
   1.890 +     (auto simp add: disjoint_family_on_def sets_eq_Pow inj_on_def
   1.891 +      intro!: arg_cong[where f=prob])
   1.892 +
   1.893 +lemma (in finite_information_space) setsum_distribution:
   1.894 +  "(\<Sum>x \<in> X`space M. joint_distribution X Y {(x, y)}) = distribution Y {y}"
   1.895 +  "(\<Sum>y \<in> Y`space M. joint_distribution X Y {(x, y)}) = distribution X {x}"
   1.896 +  "(\<Sum>x \<in> X`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution Y Z {(y, z)}"
   1.897 +  "(\<Sum>y \<in> Y`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Z {(x, z)}"
   1.898 +  "(\<Sum>z \<in> Z`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Y {(x, y)}"
   1.899 +  by (auto intro!: inj_onI setsum_distribution_gen)
   1.900  
   1.901 -lemma image_pair_eq_Sigma:
   1.902 -  "(\<lambda>x. (f x, g x)) ` A = Sigma (f ` A) (\<lambda>x. g ` (f -` {x} \<inter> A))"
   1.903 -proof (safe intro!: imageI vimageI, simp_all)
   1.904 -  fix a b assume *: "a \<in> A" "b \<in> A" and eq: "f a = f b"
   1.905 -  show "(f b, g a) \<in> (\<lambda>x. (f x, g x)) ` A" unfolding eq[symmetric]
   1.906 -    using * by auto
   1.907 +lemma (in finite_information_space) conditional_mutual_information_eq_sum:
   1.908 +   "\<I>(X ; Y | Z) =
   1.909 +     (\<Sum>(x, y, z)\<in>X ` space M \<times> (\<lambda>x. (Y x, Z x)) ` space M.
   1.910 +             distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)} *
   1.911 +             log b (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}/
   1.912 +        distribution (\<lambda>x. (Y x, Z x)) {(y, z)})) -
   1.913 +     (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   1.914 +        distribution (\<lambda>x. (X x, Z x)) {(x,z)} * log b (distribution (\<lambda>x. (X x, Z x)) {(x,z)} / distribution Z {z}))"
   1.915 +  (is "_ = ?rhs")
   1.916 +proof -
   1.917 +  have setsum_product:
   1.918 +    "\<And>f x. (\<Sum>v\<in>(\<lambda>x. (Y x, Z x)) ` space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x,v)} * f v)
   1.919 +      = (\<Sum>v\<in>Y ` space M \<times> Z ` space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x,v)} * f v)"
   1.920 +  proof (safe intro!: setsum_mono_zero_cong_left imageI)
   1.921 +    fix x y z f
   1.922 +    assume *: "(Y y, Z z) \<notin> (\<lambda>x. (Y x, Z x)) ` space M" and "y \<in> space M" "z \<in> space M"
   1.923 +    hence "(\<lambda>x. (X x, Y x, Z x)) -` {(x, Y y, Z z)} \<inter> space M = {}"
   1.924 +    proof safe
   1.925 +      fix x' assume x': "x' \<in> space M" and eq: "Y x' = Y y" "Z x' = Z z"
   1.926 +      have "(Y y, Z z) \<in> (\<lambda>x. (Y x, Z x)) ` space M" using eq[symmetric] x' by auto
   1.927 +      thus "x' \<in> {}" using * by auto
   1.928 +    qed
   1.929 +    thus "joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, Y y, Z z)} * f (Y y) (Z z) = 0"
   1.930 +      unfolding distribution_def by simp
   1.931 +  qed (simp add: finite_space)
   1.932 +
   1.933 +  thus ?thesis
   1.934 +    unfolding conditional_mutual_information_def Let_def mutual_information_eq
   1.935 +    apply (subst mutual_information_eq_generic)
   1.936 +    by (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space
   1.937 +        finite_prob_space_of_images finite_product_prob_space_of_images
   1.938 +        setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
   1.939 +        setsum_left_distrib[symmetric] setsum_distribution
   1.940 +      cong: setsum_cong)
   1.941  qed
   1.942  
   1.943 -lemma inj_on_swap: "inj_on (\<lambda>(x,y). (y,x)) A" by (auto intro!: inj_onI)
   1.944 -
   1.945 -lemma (in finite_prob_space) finite_conditional_mutual_information_reduce:
   1.946 -  assumes "1 < b"
   1.947 -  shows "\<I>\<^bsub>b\<^esub>(X ; Y | Z) =
   1.948 -	- (\<Sum> (x, z) \<in> (X ` space M \<times> Z ` space M).
   1.949 -             distribution (\<lambda>x. (X x, Z x)) {(x,z)} * log b (distribution (\<lambda>x. (X x, Z x)) {(x,z)} / distribution Z {z}))
   1.950 -	+ (\<Sum> (x, y, z) \<in> (X ` space M \<times> (\<lambda>x. (Y x, Z x)) ` space M).
   1.951 +lemma (in finite_information_space) conditional_mutual_information_eq:
   1.952 +  "\<I>(X ; Y | Z) = (\<Sum>(x, y, z) \<in> X ` space M \<times> Y ` space M \<times> Z ` space M.
   1.953               distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)} *
   1.954               log b (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}/
   1.955 -             distribution (\<lambda>x. (Y x, Z x)) {(y, z)}))" (is "_ = ?rhs")
   1.956 -unfolding conditional_mutual_information_def Let_def using finite_space
   1.957 -apply (simp add: prod_measure_space_def sigma_prod_sets_finite)
   1.958 -apply (subst mutual_information_cong[of _ "\<lparr>space = X ` space M, sets = Pow (X ` space M)\<rparr>"
   1.959 -  _ "\<lparr>space = Y ` space M \<times> Z ` space M, sets = Pow (Y ` space M \<times> Z ` space M)\<rparr>"], simp_all)
   1.960 -apply (subst finite_mutual_information_reduce_prod, simp_all)
   1.961 -apply (subst finite_mutual_information_reduce, simp_all)
   1.962 +    (joint_distribution X Z {(x, z)} * joint_distribution Y Z {(y,z)} / distribution Z {z})))"
   1.963 +  unfolding conditional_mutual_information_def Let_def mutual_information_eq
   1.964 +    apply (subst mutual_information_eq_generic)
   1.965 +  by (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space
   1.966 +      finite_prob_space_of_images finite_product_prob_space_of_images
   1.967 +      setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
   1.968 +      setsum_left_distrib[symmetric] setsum_distribution setsum_commute[where A="Y`space M"]
   1.969 +    cong: setsum_cong)
   1.970 +
   1.971 +lemma (in finite_information_space) conditional_mutual_information_eq_mutual_information:
   1.972 +  "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   1.973 +proof -
   1.974 +  have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   1.975 +
   1.976 +  show ?thesis
   1.977 +    unfolding conditional_mutual_information_eq mutual_information_eq
   1.978 +    by (simp add: setsum_cartesian_product' distribution_remove_const)
   1.979 +qed
   1.980 +
   1.981 +lemma (in finite_information_space) conditional_mutual_information_positive:
   1.982 +  "0 \<le> \<I>(X ; Y | Z)"
   1.983  proof -
   1.984    let ?dXYZ = "distribution (\<lambda>x. (X x, Y x, Z x))"
   1.985 -  let ?dXZ = "distribution (\<lambda>x. (X x, Z x))"
   1.986 -  let ?dYZ = "distribution (\<lambda>x. (Y x, Z x))"
   1.987 +  let ?dXZ = "joint_distribution X Z"
   1.988 +  let ?dYZ = "joint_distribution Y Z"
   1.989    let ?dX = "distribution X"
   1.990 -  let ?dY = "distribution Y"
   1.991    let ?dZ = "distribution Z"
   1.992 +  let ?M = "X ` space M \<times> Y ` space M \<times> Z ` space M"
   1.993 +
   1.994 +  have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: expand_fun_eq)
   1.995  
   1.996 -  have If_mult_distr: "\<And>A B C D. If A B C * D = If A (B * D) (C * D)" by auto
   1.997 -  { fix x y
   1.998 -    have "(\<lambda>x. (X x, Y x, Z x)) -` {(X x, y)} \<inter> space M =
   1.999 -      (if y \<in> (\<lambda>x. (Y x, Z x)) ` space M then (\<lambda>x. (X x, Y x, Z x)) -` {(X x, y)} \<inter> space M else {})" by auto
  1.1000 -    hence "?dXYZ {(X x, y)} = (if y \<in> (\<lambda>x. (Y x, Z x)) ` space M then ?dXYZ {(X x, y)} else 0)"
  1.1001 -      unfolding distribution_def by auto }
  1.1002 -  note split_measure = this
  1.1003 -
  1.1004 -  have sets: "Y ` space M \<times> Z ` space M \<inter> (\<lambda>x. (Y x, Z x)) ` space M = (\<lambda>x. (Y x, Z x)) ` space M" by auto
  1.1005 -
  1.1006 -  have cong: "\<And>A B C D. \<lbrakk> A = C ; B = D \<rbrakk> \<Longrightarrow> A + B = C + D" by auto
  1.1007 -
  1.1008 -  { fix A f have "setsum f A = setsum (\<lambda>(x, y). f (y, x)) ((\<lambda>(x, y). (y, x)) ` A)"
  1.1009 -    using setsum_reindex[OF inj_on_swap, of "\<lambda>(x, y). f (y, x)" A] by (simp add: split_twice) }
  1.1010 -  note setsum_reindex_swap = this
  1.1011 -
  1.1012 -  { fix A B f assume *: "finite A" "\<forall>x\<in>A. finite (B x)"
  1.1013 -    have "(\<Sum>x\<in>Sigma A B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) (B x))"
  1.1014 -      unfolding setsum_Sigma[OF *] by simp }
  1.1015 -  note setsum_Sigma = this
  1.1016 +  have "- (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
  1.1017 +    log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))
  1.1018 +    \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
  1.1019 +    unfolding split_beta
  1.1020 +  proof (rule log_setsum_divide)
  1.1021 +    show "?M \<noteq> {}" using not_empty by simp
  1.1022 +    show "1 < b" using b_gt_1 .
  1.1023  
  1.1024 -  { fix x
  1.1025 -    have "(\<Sum>z\<in>Z ` space M. ?dXZ {(X x, z)}) = (\<Sum>yz\<in>(\<lambda>x. (Y x, Z x)) ` space M. ?dXYZ {(X x, yz)})"
  1.1026 -      apply (subst setsum_reindex_swap)
  1.1027 -      apply (simp add: image_image distribution_def)
  1.1028 -      unfolding image_pair_eq_Sigma
  1.1029 -      apply (subst setsum_Sigma)
  1.1030 -      using finite_space apply simp_all
  1.1031 -      apply (rule setsum_cong[OF refl])
  1.1032 -      apply (subst measure_finitely_additive'')
  1.1033 -      by (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=prob]) }
  1.1034 +    fix x assume "x \<in> ?M"
  1.1035 +    show "0 \<le> ?dXYZ {(fst x, fst (snd x), snd (snd x))}" using positive_distribution .
  1.1036 +    show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
  1.1037 +      by (auto intro!: mult_nonneg_nonneg positive_distribution simp: zero_le_divide_iff)
  1.1038  
  1.1039 -  thus "(\<Sum>(x, y, z)\<in>X ` space M \<times> Y ` space M \<times> Z ` space M.
  1.1040 -      ?dXYZ {(x, y, z)} * log b (?dXYZ {(x, y, z)} / (?dX {x} * ?dYZ {(y, z)}))) -
  1.1041 -    (\<Sum>(x, y)\<in>X ` space M \<times> Z ` space M.
  1.1042 -      ?dXZ {(x, y)} * log b (?dXZ {(x, y)} / (?dX {x} * ?dZ {y}))) =
  1.1043 -  - (\<Sum> (x, z) \<in> (X ` space M \<times> Z ` space M).
  1.1044 -      ?dXZ {(x,z)} * log b (?dXZ {(x,z)} / ?dZ {z})) +
  1.1045 -    (\<Sum> (x, y, z) \<in> (X ` space M \<times> (\<lambda>x. (Y x, Z x)) ` space M).
  1.1046 -      ?dXYZ {(x, y, z)} * log b (?dXYZ {(x, y, z)} / ?dYZ {(y, z)}))"
  1.1047 -    using finite_space
  1.1048 -    apply (auto simp: setsum_cartesian_product[symmetric] setsum_negf[symmetric]
  1.1049 -                      setsum_addf[symmetric] diff_minus
  1.1050 -      intro!: setsum_cong[OF refl])
  1.1051 -    apply (subst split_measure)
  1.1052 -    apply (simp add: If_mult_distr setsum_cases sets distribution_log_split[OF assms, of X])
  1.1053 -    apply (subst add_commute)
  1.1054 -    by (simp add: setsum_subtractf setsum_negf field_simps setsum_right_distrib[symmetric] sets_eq_Pow)
  1.1055 +    assume *: "0 < ?dXYZ {(fst x, fst (snd x), snd (snd x))}"
  1.1056 +    thus "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
  1.1057 +      by (auto intro!: divide_pos_pos mult_pos_pos
  1.1058 +           intro: distribution_order(6) distribution_mono_gt_0)
  1.1059 +  qed (simp_all add: setsum_cartesian_product' sum_over_space_distrib setsum_distribution finite_space)
  1.1060 +  also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>Z`space M. ?dZ {z})"
  1.1061 +    apply (simp add: setsum_cartesian_product')
  1.1062 +    apply (subst setsum_commute)
  1.1063 +    apply (subst (2) setsum_commute)
  1.1064 +    by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] setsum_distribution
  1.1065 +          intro!: setsum_cong)
  1.1066 +  finally show ?thesis
  1.1067 +    unfolding conditional_mutual_information_eq sum_over_space_distrib by simp
  1.1068  qed
  1.1069  
  1.1070 +
  1.1071  definition (in prob_space)
  1.1072    "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
  1.1073  
  1.1074 -abbreviation (in finite_prob_space)
  1.1075 -  finite_conditional_entropy ("\<H>\<^bsub>_\<^esub>'(_ | _')") where
  1.1076 -  "\<H>\<^bsub>b\<^esub>(X | Y) \<equiv> conditional_entropy b
  1.1077 +abbreviation (in finite_information_space)
  1.1078 +  finite_conditional_entropy ("\<H>'(_ | _')") where
  1.1079 +  "\<H>(X | Y) \<equiv> conditional_entropy b
  1.1080      \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
  1.1081      \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
  1.1082  
  1.1083 -abbreviation (in finite_prob_space)
  1.1084 -  finite_conditional_entropy_2 ("\<H>'(_ | _')") where
  1.1085 -  "\<H>(X | Y) \<equiv> \<H>\<^bsub>2\<^esub>(X | Y)"
  1.1086 +lemma (in finite_information_space) conditional_entropy_positive:
  1.1087 +  "0 \<le> \<H>(X | Y)" unfolding conditional_entropy_def using conditional_mutual_information_positive .
  1.1088  
  1.1089 -lemma (in finite_prob_space) finite_conditional_entropy_reduce:
  1.1090 -  assumes "1 < b"
  1.1091 -  shows "\<H>\<^bsub>b\<^esub>(X | Z) =
  1.1092 +lemma (in finite_information_space) conditional_entropy_eq:
  1.1093 +  "\<H>(X | Z) =
  1.1094       - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
  1.1095           joint_distribution X Z {(x, z)} *
  1.1096           log b (joint_distribution X Z {(x, z)} / distribution Z {z}))"
  1.1097  proof -
  1.1098    have *: "\<And>x y z. (\<lambda>x. (X x, X x, Z x)) -` {(x, y, z)} = (if x = y then (\<lambda>x. (X x, Z x)) -` {(x, z)} else {})" by auto
  1.1099    show ?thesis
  1.1100 -    unfolding finite_conditional_mutual_information_reduce[OF assms]
  1.1101 -      conditional_entropy_def joint_distribution_def distribution_def *
  1.1102 +    unfolding conditional_mutual_information_eq_sum
  1.1103 +      conditional_entropy_def distribution_def *
  1.1104      by (auto intro!: setsum_0')
  1.1105  qed
  1.1106  
  1.1107 -lemma (in finite_prob_space) finite_mutual_information_eq_entropy_conditional_entropy:
  1.1108 -  assumes "1 < b" shows "\<I>\<^bsub>b\<^esub>(X ; Z) = \<H>\<^bsub>b\<^esub>(X) - \<H>\<^bsub>b\<^esub>(X | Z)" (is "mutual_information b ?X ?Z X Z = _")
  1.1109 -  unfolding finite_mutual_information_reduce
  1.1110 -    finite_entropy_reduce[OF assms]
  1.1111 -    finite_conditional_entropy_reduce[OF assms]
  1.1112 -    joint_distribution diff_minus_eq_add
  1.1113 +lemma (in finite_information_space) mutual_information_eq_entropy_conditional_entropy:
  1.1114 +  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
  1.1115 +  unfolding mutual_information_eq entropy_eq conditional_entropy_eq
  1.1116    using finite_space
  1.1117 -  apply (auto simp add: setsum_addf[symmetric] setsum_subtractf
  1.1118 -      setsum_Sigma[symmetric] distribution_log_split[OF assms] setsum_negf[symmetric]
  1.1119 -    intro!: setsum_cong[OF refl])
  1.1120 -  apply (simp add: setsum_negf setsum_left_distrib[symmetric])
  1.1121 -proof (rule disjI2)
  1.1122 -  let ?dX = "distribution X"
  1.1123 -  and ?dXZ = "distribution (\<lambda>x. (X x, Z x))"
  1.1124 +  by (auto simp add: setsum_addf setsum_subtractf setsum_cartesian_product'
  1.1125 +      setsum_left_distrib[symmetric] setsum_addf setsum_distribution)
  1.1126  
  1.1127 -  fix x assume "x \<in> space M"
  1.1128 -  have "\<And>z. (\<lambda>x. (X x, Z x)) -` {(X x, z)} \<inter> space M = (X -` {X x} \<inter> space M) \<inter> (Z -` {z} \<inter> space M)" by auto
  1.1129 -  thus "(\<Sum>z\<in>Z ` space M. distribution (\<lambda>x. (X x, Z x)) {(X x, z)}) = distribution X {X x}"
  1.1130 -    unfolding distribution_def
  1.1131 -    apply (subst prob_real_sum_image_fn[where e="X -` {X x} \<inter> space M" and s = "Z`space M" and f="\<lambda>z. Z -` {z} \<inter> space M"])
  1.1132 -    using finite_space sets_eq_Pow by auto
  1.1133 +lemma (in finite_information_space) conditional_entropy_less_eq_entropy:
  1.1134 +  "\<H>(X | Z) \<le> \<H>(X)"
  1.1135 +proof -
  1.1136 +  have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy .
  1.1137 +  with mutual_information_positive[of X Z] entropy_positive[of X]
  1.1138 +  show ?thesis by auto
  1.1139  qed
  1.1140  
  1.1141  (* -------------Entropy of a RV with a certain event is zero---------------- *)
  1.1142  
  1.1143 -lemma (in finite_prob_space) finite_entropy_certainty_eq_0:
  1.1144 -  assumes "x \<in> X ` space M" and "distribution X {x} = 1" and "b > 1"
  1.1145 -  shows "\<H>\<^bsub>b\<^esub>(X) = 0"
  1.1146 +lemma (in finite_information_space) finite_entropy_certainty_eq_0:
  1.1147 +  assumes "x \<in> X ` space M" and "distribution X {x} = 1"
  1.1148 +  shows "\<H>(X) = 0"
  1.1149  proof -
  1.1150    interpret X: finite_prob_space "\<lparr> space = X ` space M,
  1.1151      sets = Pow (X ` space M),
  1.1152 -    measure = distribution X\<rparr>" by (rule finite_prob_space)
  1.1153 +    measure = distribution X\<rparr>" by (rule finite_prob_space_of_images)
  1.1154  
  1.1155    have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
  1.1156      using X.measure_compl[of "{x}"] assms by auto
  1.1157 @@ -694,34 +606,18 @@
  1.1158  
  1.1159    have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
  1.1160  
  1.1161 -  show ?thesis
  1.1162 -    unfolding finite_entropy_reduce[OF `b > 1`] by (auto simp: y fi)
  1.1163 +  show ?thesis unfolding entropy_eq by (auto simp: y fi)
  1.1164  qed
  1.1165  (* --------------- upper bound on entropy for a rv ------------------------- *)
  1.1166  
  1.1167 -lemma log_setsum:
  1.1168 -  assumes "finite s" "s \<noteq> {}"
  1.1169 -  assumes "b > 1"
  1.1170 -  assumes "(\<Sum> i \<in> s. a i) = 1"
  1.1171 -  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
  1.1172 -  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
  1.1173 -  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
  1.1174 -proof -
  1.1175 -  have "convex_on {0 <..} (\<lambda> x. - log b x)"
  1.1176 -    by (rule minus_log_convex[OF `b > 1`])
  1.1177 -  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
  1.1178 -    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
  1.1179 -  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
  1.1180 -qed
  1.1181 -
  1.1182 -lemma (in finite_prob_space) finite_entropy_le_card:
  1.1183 -  assumes "1 < b"
  1.1184 -  shows "\<H>\<^bsub>b\<^esub>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
  1.1185 +lemma (in finite_information_space) finite_entropy_le_card:
  1.1186 +  "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
  1.1187  proof -
  1.1188    interpret X: finite_prob_space "\<lparr>space = X ` space M,
  1.1189                                      sets = Pow (X ` space M),
  1.1190                                   measure = distribution X\<rparr>"
  1.1191 -    using finite_prob_space by auto
  1.1192 +    using finite_prob_space_of_images by auto
  1.1193 +
  1.1194    have triv: "\<And> x. (if distribution X {x} \<noteq> 0 then distribution X {x} else 0) = distribution X {x}"
  1.1195      by auto
  1.1196    hence sum1: "(\<Sum> x \<in> X ` space M \<inter> {y. distribution X {y} \<noteq> 0}. distribution X {x}) = 1"
  1.1197 @@ -753,7 +649,7 @@
  1.1198      also have "\<dots> = (if distribution X {x} \<noteq> 0
  1.1199                      then distribution X {x} * log b (inverse (distribution X {x}))
  1.1200                      else 0)"
  1.1201 -      using log_inverse `1 < b` X.positive[of "{x}"] asm by auto
  1.1202 +      using log_inverse b_gt_1 X.positive[of "{x}"] asm by auto
  1.1203      finally have "- distribution X {x} * log b (distribution X {x})
  1.1204                   = (if distribution X {x} \<noteq> 0 
  1.1205                      then distribution X {x} * log b (inverse (distribution X {x}))
  1.1206 @@ -769,7 +665,7 @@
  1.1207      unfolding setsum_restrict_set[OF finite_imageI[OF finite_space, of X]] by auto
  1.1208    also have "\<dots> \<le> log b (\<Sum> x \<in> X ` space M \<inter> {y. distribution X {y} \<noteq> 0}.
  1.1209                            distribution X {x} * (inverse (distribution X {x})))"
  1.1210 -    apply (subst log_setsum[OF _ _ `b > 1` sum1, 
  1.1211 +    apply (subst log_setsum[OF _ _ b_gt_1 sum1, 
  1.1212       unfolded greaterThan_iff, OF _ _ _]) using pos sets_eq_Pow
  1.1213        X.finite_space assms X.positive not_empty by auto
  1.1214    also have "\<dots> = log b (\<Sum> x \<in> X ` space M \<inter> {y. distribution X {y} \<noteq> 0}. 1)"
  1.1215 @@ -778,7 +674,7 @@
  1.1216      by auto
  1.1217    finally have "- (\<Sum>x\<in>X ` space M. distribution X {x} * log b (distribution X {x}))
  1.1218                 \<le> log b (real_of_nat (card (X ` space M \<inter> {y. distribution X {y} \<noteq> 0})))" by simp
  1.1219 -  thus ?thesis unfolding finite_entropy_reduce[OF assms] real_eq_of_nat by auto
  1.1220 +  thus ?thesis unfolding entropy_eq real_eq_of_nat by auto
  1.1221  qed
  1.1222  
  1.1223  (* --------------- entropy is maximal for a uniform rv --------------------- *)
  1.1224 @@ -808,34 +704,31 @@
  1.1225      by (auto simp:field_simps)
  1.1226  qed
  1.1227  
  1.1228 -lemma (in finite_prob_space) finite_entropy_uniform_max:
  1.1229 -  assumes "b > 1"
  1.1230 +lemma (in finite_information_space) finite_entropy_uniform_max:
  1.1231    assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
  1.1232 -  shows "\<H>\<^bsub>b\<^esub>(X) = log b (real (card (X ` space M)))"
  1.1233 +  shows "\<H>(X) = log b (real (card (X ` space M)))"
  1.1234  proof -
  1.1235    interpret X: finite_prob_space "\<lparr>space = X ` space M,
  1.1236                                      sets = Pow (X ` space M),
  1.1237                                   measure = distribution X\<rparr>"
  1.1238 -    using finite_prob_space by auto
  1.1239 +    using finite_prob_space_of_images by auto
  1.1240 +
  1.1241    { fix x assume xasm: "x \<in> X ` space M"
  1.1242      hence card_gt0: "real (card (X ` space M)) > 0"
  1.1243        using card_gt_0_iff X.finite_space by auto
  1.1244      from xasm have "\<And> y. y \<in> X ` space M \<Longrightarrow> distribution X {y} = distribution X {x}"
  1.1245        using assms by blast
  1.1246      hence "- (\<Sum>x\<in>X ` space M. distribution X {x} * log b (distribution X {x}))
  1.1247 -         = - (\<Sum> y \<in> X ` space M. distribution X {x} * log b (distribution X {x}))"
  1.1248 -      by auto
  1.1249 -    also have "\<dots> = - real_of_nat (card (X ` space M)) * distribution X {x} * log b (distribution X {x})"
  1.1250 -      by auto
  1.1251 +         = - real (card (X ` space M)) * distribution X {x} * log b (distribution X {x})"
  1.1252 +      unfolding real_eq_of_nat by auto
  1.1253      also have "\<dots> = - real (card (X ` space M)) * (1 / real (card (X ` space M))) * log b (1 / real (card (X ` space M)))"
  1.1254 -      unfolding real_eq_of_nat[symmetric]
  1.1255 -      by (auto simp: X.uniform_prob[simplified, OF xasm assms(2)])
  1.1256 +      by (auto simp: X.uniform_prob[simplified, OF xasm assms])
  1.1257      also have "\<dots> = log b (real (card (X ` space M)))"
  1.1258        unfolding inverse_eq_divide[symmetric]
  1.1259 -      using card_gt0 log_inverse `b > 1` 
  1.1260 +      using card_gt0 log_inverse b_gt_1
  1.1261        by (auto simp add:field_simps card_gt0)
  1.1262      finally have ?thesis
  1.1263 -      unfolding finite_entropy_reduce[OF `b > 1`] by auto }
  1.1264 +      unfolding entropy_eq by auto }
  1.1265    moreover
  1.1266    { assume "X ` space M = {}"
  1.1267      hence "distribution X (X ` space M) = 0"
  1.1268 @@ -844,4 +737,199 @@
  1.1269    ultimately show ?thesis by auto
  1.1270  qed
  1.1271  
  1.1272 +definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
  1.1273 +
  1.1274 +lemma subvimageI:
  1.1275 +  assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  1.1276 +  shows "subvimage A f g"
  1.1277 +  using assms unfolding subvimage_def by blast
  1.1278 +
  1.1279 +lemma subvimageE[consumes 1]:
  1.1280 +  assumes "subvimage A f g"
  1.1281 +  obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  1.1282 +  using assms unfolding subvimage_def by blast
  1.1283 +
  1.1284 +lemma subvimageD:
  1.1285 +  "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  1.1286 +  using assms unfolding subvimage_def by blast
  1.1287 +
  1.1288 +lemma subvimage_subset:
  1.1289 +  "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
  1.1290 +  unfolding subvimage_def by auto
  1.1291 +
  1.1292 +lemma subvimage_idem[intro]: "subvimage A g g"
  1.1293 +  by (safe intro!: subvimageI)
  1.1294 +
  1.1295 +lemma subvimage_comp_finer[intro]:
  1.1296 +  assumes svi: "subvimage A g h"
  1.1297 +  shows "subvimage A g (f \<circ> h)"
  1.1298 +proof (rule subvimageI, simp)
  1.1299 +  fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
  1.1300 +  from svi[THEN subvimageD, OF this]
  1.1301 +  show "f (h x) = f (h y)" by simp
  1.1302 +qed
  1.1303 +
  1.1304 +lemma subvimage_comp_gran:
  1.1305 +  assumes svi: "subvimage A g h"
  1.1306 +  assumes inj: "inj_on f (g ` A)"
  1.1307 +  shows "subvimage A (f \<circ> g) h"
  1.1308 +  by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
  1.1309 +
  1.1310 +lemma subvimage_comp:
  1.1311 +  assumes svi: "subvimage (f ` A) g h"
  1.1312 +  shows "subvimage A (g \<circ> f) (h \<circ> f)"
  1.1313 +  by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
  1.1314 +
  1.1315 +lemma subvimage_trans:
  1.1316 +  assumes fg: "subvimage A f g"
  1.1317 +  assumes gh: "subvimage A g h"
  1.1318 +  shows "subvimage A f h"
  1.1319 +  by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
  1.1320 +
  1.1321 +lemma subvimage_translator:
  1.1322 +  assumes svi: "subvimage A f g"
  1.1323 +  shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
  1.1324 +proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
  1.1325 +  fix x assume "x \<in> A"
  1.1326 +  show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
  1.1327 +    by (rule theI2[of _ "g x"])
  1.1328 +      (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
  1.1329 +qed
  1.1330 +
  1.1331 +lemma subvimage_translator_image:
  1.1332 +  assumes svi: "subvimage A f g"
  1.1333 +  shows "\<exists>h. h ` f ` A = g ` A"
  1.1334 +proof -
  1.1335 +  from subvimage_translator[OF svi]
  1.1336 +  obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
  1.1337 +  thus ?thesis
  1.1338 +    by (auto intro!: exI[of _ h]
  1.1339 +      simp: image_compose[symmetric] comp_def cong: image_cong)
  1.1340 +qed
  1.1341 +
  1.1342 +lemma subvimage_finite:
  1.1343 +  assumes svi: "subvimage A f g" and fin: "finite (f`A)"
  1.1344 +  shows "finite (g`A)"
  1.1345 +proof -
  1.1346 +  from subvimage_translator_image[OF svi]
  1.1347 +  obtain h where "g`A = h`f`A" by fastsimp
  1.1348 +  with fin show "finite (g`A)" by simp
  1.1349 +qed
  1.1350 +
  1.1351 +lemma subvimage_disj:
  1.1352 +  assumes svi: "subvimage A f g"
  1.1353 +  shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
  1.1354 +      f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
  1.1355 +proof (rule disjCI)
  1.1356 +  assume "\<not> ?dist"
  1.1357 +  then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
  1.1358 +  thus "?sub" using svi unfolding subvimage_def by auto
  1.1359 +qed
  1.1360 +
  1.1361 +lemma setsum_image_split:
  1.1362 +  assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
  1.1363 +  shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
  1.1364 +    (is "?lhs = ?rhs")
  1.1365 +proof -
  1.1366 +  have "f ` A =
  1.1367 +      snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
  1.1368 +      (is "_ = snd ` ?SIGMA")
  1.1369 +    unfolding image_split_eq_Sigma[symmetric]
  1.1370 +    by (simp add: image_compose[symmetric] comp_def)
  1.1371 +  moreover
  1.1372 +  have snd_inj: "inj_on snd ?SIGMA"
  1.1373 +    unfolding image_split_eq_Sigma[symmetric]
  1.1374 +    by (auto intro!: inj_onI subvimageD[OF svi])
  1.1375 +  ultimately
  1.1376 +  have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
  1.1377 +    by (auto simp: setsum_reindex intro: setsum_cong)
  1.1378 +  also have "... = ?rhs"
  1.1379 +    using subvimage_finite[OF svi fin] fin
  1.1380 +    apply (subst setsum_Sigma[symmetric])
  1.1381 +    by (auto intro!: finite_subset[of _ "f`A"])
  1.1382 +  finally show ?thesis .
  1.1383 +qed
  1.1384 +
  1.1385 +lemma (in finite_information_space) entropy_partition:
  1.1386 +  assumes svi: "subvimage (space M) X P"
  1.1387 +  shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
  1.1388 +proof -
  1.1389 +  have "(\<Sum>x\<in>X ` space M. distribution X {x} * log b (distribution X {x})) =
  1.1390 +    (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
  1.1391 +    joint_distribution X P {(x, y)} * log b (joint_distribution X P {(x, y)}))"
  1.1392 +  proof (subst setsum_image_split[OF svi],
  1.1393 +      safe intro!: finite_imageI finite_space setsum_mono_zero_cong_left imageI)
  1.1394 +    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1.1395 +    assume "joint_distribution X P {(X x, P p)} * log b (joint_distribution X P {(X x, P p)}) \<noteq> 0"
  1.1396 +    hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
  1.1397 +    with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1.1398 +    show "x \<in> P -` {P p}" by auto
  1.1399 +  next
  1.1400 +    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1.1401 +    assume "P x = P p"
  1.1402 +    from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1.1403 +    have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
  1.1404 +      by auto
  1.1405 +    hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
  1.1406 +      by auto
  1.1407 +    thus "distribution X {X x} * log b (distribution X {X x}) =
  1.1408 +          joint_distribution X P {(X x, P p)} *
  1.1409 +          log b (joint_distribution X P {(X x, P p)})"
  1.1410 +      by (auto simp: distribution_def)
  1.1411 +  qed
  1.1412 +  thus ?thesis
  1.1413 +  unfolding entropy_eq conditional_entropy_eq
  1.1414 +    by (simp add: setsum_cartesian_product' setsum_subtractf setsum_distribution
  1.1415 +      setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
  1.1416 +qed
  1.1417 +
  1.1418 +corollary (in finite_information_space) entropy_data_processing:
  1.1419 +  "\<H>(f \<circ> X) \<le> \<H>(X)"
  1.1420 +  by (subst (2) entropy_partition[of _ "f \<circ> X"]) (auto intro: conditional_entropy_positive)
  1.1421 +
  1.1422 +lemma (in prob_space) distribution_cong:
  1.1423 +  assumes "\<And>x. x \<in> space M \<Longrightarrow> X x = Y x"
  1.1424 +  shows "distribution X = distribution Y"
  1.1425 +  unfolding distribution_def expand_fun_eq
  1.1426 +  using assms by (auto intro!: arg_cong[where f=prob])
  1.1427 +
  1.1428 +lemma (in prob_space) joint_distribution_cong:
  1.1429 +  assumes "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
  1.1430 +  assumes "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
  1.1431 +  shows "joint_distribution X Y = joint_distribution X' Y'"
  1.1432 +  unfolding distribution_def expand_fun_eq
  1.1433 +  using assms by (auto intro!: arg_cong[where f=prob])
  1.1434 +
  1.1435 +lemma image_cong:
  1.1436 +  "\<lbrakk> \<And>x. x \<in> S \<Longrightarrow> X x = X' x \<rbrakk> \<Longrightarrow> X ` S = X' ` S"
  1.1437 +  by (auto intro!: image_eqI)
  1.1438 +
  1.1439 +lemma (in finite_information_space) mutual_information_cong:
  1.1440 +  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
  1.1441 +  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
  1.1442 +  shows "\<I>(X ; Y) = \<I>(X' ; Y')"
  1.1443 +proof -
  1.1444 +  have "X ` space M = X' ` space M" using X by (rule image_cong)
  1.1445 +  moreover have "Y ` space M = Y' ` space M" using Y by (rule image_cong)
  1.1446 +  ultimately show ?thesis
  1.1447 +  unfolding mutual_information_eq
  1.1448 +    using
  1.1449 +      assms[THEN distribution_cong]
  1.1450 +      joint_distribution_cong[OF assms]
  1.1451 +    by (auto intro!: setsum_cong)
  1.1452 +qed
  1.1453 +
  1.1454 +corollary (in finite_information_space) entropy_of_inj:
  1.1455 +  assumes "inj_on f (X`space M)"
  1.1456 +  shows "\<H>(f \<circ> X) = \<H>(X)"
  1.1457 +proof (rule antisym)
  1.1458 +  show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing .
  1.1459 +next
  1.1460 +  have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1.1461 +    by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF assms])
  1.1462 +  also have "... \<le> \<H>(f \<circ> X)"
  1.1463 +    using entropy_data_processing .
  1.1464 +  finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1.1465 +qed
  1.1466 +
  1.1467  end
     2.1 --- a/src/HOL/Probability/Lebesgue.thy	Mon May 03 14:35:10 2010 +0200
     2.2 +++ b/src/HOL/Probability/Lebesgue.thy	Mon May 03 14:35:10 2010 +0200
     2.3 @@ -25,6 +25,21 @@
     2.4    shows "nonneg (neg_part f)"
     2.5    unfolding nonneg_def neg_part_def min_def by auto
     2.6  
     2.7 +lemma pos_neg_part_abs:
     2.8 +  fixes f :: "'a \<Rightarrow> real"
     2.9 +  shows "pos_part f x + neg_part f x = \<bar>f x\<bar>"
    2.10 +unfolding real_abs_def pos_part_def neg_part_def by auto
    2.11 +
    2.12 +lemma pos_part_abs:
    2.13 +  fixes f :: "'a \<Rightarrow> real"
    2.14 +  shows "pos_part (\<lambda> x. \<bar>f x\<bar>) y = \<bar>f y\<bar>"
    2.15 +unfolding pos_part_def real_abs_def by auto
    2.16 +
    2.17 +lemma neg_part_abs:
    2.18 +  fixes f :: "'a \<Rightarrow> real"
    2.19 +  shows "neg_part (\<lambda> x. \<bar>f x\<bar>) y = 0"
    2.20 +unfolding neg_part_def real_abs_def by auto
    2.21 +
    2.22  lemma (in measure_space)
    2.23    assumes "f \<in> borel_measurable M"
    2.24    shows pos_part_borel_measurable: "pos_part f \<in> borel_measurable M"
    2.25 @@ -1273,6 +1288,22 @@
    2.26    thus "?int S" and "?I S" by auto
    2.27  qed
    2.28  
    2.29 +lemma (in measure_space) integrable_abs:
    2.30 +  assumes "integrable f"
    2.31 +  shows "integrable (\<lambda> x. \<bar>f x\<bar>)"
    2.32 +using assms
    2.33 +proof -
    2.34 +  from assms obtain p q where pq: "p \<in> nnfis (pos_part f)" "q \<in> nnfis (neg_part f)"
    2.35 +    unfolding integrable_def by auto
    2.36 +  hence "p + q \<in> nnfis (\<lambda> x. pos_part f x + neg_part f x)"
    2.37 +    using nnfis_add by auto
    2.38 +  hence "p + q \<in> nnfis (\<lambda> x. \<bar>f x\<bar>)" using pos_neg_part_abs[of f] by simp
    2.39 +  thus ?thesis unfolding integrable_def
    2.40 +    using ext[OF pos_part_abs[of f], of "\<lambda> y. y"]
    2.41 +      ext[OF neg_part_abs[of f], of "\<lambda> y. y"]
    2.42 +    using nnfis_0 by auto
    2.43 +qed
    2.44 +
    2.45  lemma markov_ineq:
    2.46    assumes "integrable f" "0 < a" "integrable (\<lambda>x. \<bar>f x\<bar>^n)"
    2.47    shows "measure M (f -` {a ..} \<inter> space M) \<le> integral (\<lambda>x. \<bar>f x\<bar>^n) / a^n"
    2.48 @@ -1310,6 +1341,61 @@
    2.49      by (auto intro!: mult_imp_le_div_pos[OF `0 < a ^ n`], simp add: real_mult_commute)
    2.50  qed
    2.51  
    2.52 +lemma (in measure_space) integral_0:
    2.53 +  fixes f :: "'a \<Rightarrow> real"
    2.54 +  assumes "integrable f" "integral f = 0" "nonneg f" and borel: "f \<in> borel_measurable M"
    2.55 +  shows "measure M ({x. f x \<noteq> 0} \<inter> space M) = 0"
    2.56 +proof -
    2.57 +  have "{x. f x \<noteq> 0} = {x. \<bar>f x\<bar> > 0}" by auto
    2.58 +  moreover
    2.59 +  { fix y assume "y \<in> {x. \<bar> f x \<bar> > 0}"
    2.60 +    hence "\<bar> f y \<bar> > 0" by auto
    2.61 +    hence "\<exists> n. \<bar>f y\<bar> \<ge> inverse (real (Suc n))"
    2.62 +      using ex_inverse_of_nat_Suc_less[of "\<bar>f y\<bar>"] less_imp_le unfolding real_of_nat_def by auto
    2.63 +    hence "y \<in> (\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))})"
    2.64 +      by auto }
    2.65 +  moreover
    2.66 +  { fix y assume "y \<in> (\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))})"
    2.67 +    then obtain n where n: "y \<in> {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))}" by auto
    2.68 +    hence "\<bar>f y\<bar> \<ge> inverse (real (Suc n))" by auto
    2.69 +    hence "\<bar>f y\<bar> > 0"
    2.70 +      using real_of_nat_Suc_gt_zero
    2.71 +        positive_imp_inverse_positive[of "real_of_nat (Suc n)"] by fastsimp
    2.72 +    hence "y \<in> {x. \<bar>f x\<bar> > 0}" by auto }
    2.73 +  ultimately have fneq0_UN: "{x. f x \<noteq> 0} = (\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))})"
    2.74 +    by blast
    2.75 +  { fix n
    2.76 +    have int_one: "integrable (\<lambda> x. \<bar>f x\<bar> ^ 1)" using integrable_abs assms by auto
    2.77 +    have "measure M (f -` {inverse (real (Suc n))..} \<inter> space M)
    2.78 +           \<le> integral (\<lambda> x. \<bar>f x\<bar> ^ 1) / (inverse (real (Suc n)) ^ 1)"
    2.79 +      using markov_ineq[OF `integrable f` _ int_one] real_of_nat_Suc_gt_zero by auto
    2.80 +    hence le0: "measure M (f -` {inverse (real (Suc n))..} \<inter> space M) \<le> 0"
    2.81 +      using assms unfolding nonneg_def by auto
    2.82 +    have "{x. f x \<ge> inverse (real (Suc n))} \<inter> space M \<in> sets M"
    2.83 +      apply (subst Int_commute) unfolding Int_def
    2.84 +      using borel[unfolded borel_measurable_ge_iff] by simp
    2.85 +    hence m0: "measure M ({x. f x \<ge> inverse (real (Suc n))} \<inter> space M) = 0 \<and>
    2.86 +      {x. f x \<ge> inverse (real (Suc n))} \<inter> space M \<in> sets M"
    2.87 +      using positive le0 unfolding atLeast_def by fastsimp }
    2.88 +  moreover hence "range (\<lambda> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M) \<subseteq> sets M"
    2.89 +    by auto
    2.90 +  moreover
    2.91 +  { fix n
    2.92 +    have "inverse (real (Suc n)) \<ge> inverse (real (Suc (Suc n)))"
    2.93 +      using less_imp_inverse_less real_of_nat_Suc_gt_zero[of n] by fastsimp
    2.94 +    hence "\<And> x. f x \<ge> inverse (real (Suc n)) \<Longrightarrow> f x \<ge> inverse (real (Suc (Suc n)))" by (rule order_trans)
    2.95 +    hence "{x. f x \<ge> inverse (real (Suc n))} \<inter> space M
    2.96 +         \<subseteq> {x. f x \<ge> inverse (real (Suc (Suc n)))} \<inter> space M" by auto }
    2.97 +  ultimately have "(\<lambda> x. 0) ----> measure M (\<Union> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M)"
    2.98 +    using monotone_convergence[of "\<lambda> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M"]
    2.99 +    unfolding o_def by (simp del: of_nat_Suc)
   2.100 +  hence "measure M (\<Union> n. {x. f x \<ge> inverse (real (Suc n))} \<inter> space M) = 0"
   2.101 +    using LIMSEQ_const[of 0] LIMSEQ_unique by simp
   2.102 +  hence "measure M ((\<Union> n. {x. \<bar>f x\<bar> \<ge> inverse (real (Suc n))}) \<inter> space M) = 0"
   2.103 +    using assms unfolding nonneg_def by auto
   2.104 +  thus "measure M ({x. f x \<noteq> 0} \<inter> space M) = 0" using fneq0_UN by simp
   2.105 +qed
   2.106 +
   2.107  section "Lebesgue integration on countable spaces"
   2.108  
   2.109  lemma nnfis_on_countable:
   2.110 @@ -1551,10 +1637,6 @@
   2.111  
   2.112  end
   2.113  
   2.114 -locale finite_measure_space = measure_space +
   2.115 -  assumes finite_space: "finite (space M)"
   2.116 -  and sets_eq_Pow: "sets M = Pow (space M)"
   2.117 -
   2.118  lemma sigma_algebra_cong:
   2.119    fixes M :: "('a, 'b) algebra_scheme" and M' :: "('a, 'c) algebra_scheme"
   2.120    assumes *: "sigma_algebra M"
   2.121 @@ -1610,7 +1692,7 @@
   2.122  lemma (in finite_measure_space) RN_deriv_finite_singleton:
   2.123    fixes v :: "'a set \<Rightarrow> real"
   2.124    assumes ms_v: "measure_space (M\<lparr>measure := v\<rparr>)"
   2.125 -  and eq_0: "\<And>x. measure M {x} = 0 \<Longrightarrow> v {x} = 0"
   2.126 +  and eq_0: "\<And>x. \<lbrakk> x \<in> space M ; measure M {x} = 0 \<rbrakk> \<Longrightarrow> v {x} = 0"
   2.127    and "x \<in> space M" and "measure M {x} \<noteq> 0"
   2.128    shows "RN_deriv v x = v {x} / (measure M {x})" (is "_ = ?v x")
   2.129    unfolding RN_deriv_def
   2.130 @@ -1621,7 +1703,7 @@
   2.131    fix a assume "a \<in> sets M"
   2.132    hence "a \<subseteq> space M" and "finite a"
   2.133      using sets_into_space finite_space by (auto intro: finite_subset)
   2.134 -  have *: "\<And>x a. (if measure M {x} = 0 then 0 else v {x} * indicator_fn a x) =
   2.135 +  have *: "\<And>x a. x \<in> space M \<Longrightarrow> (if measure M {x} = 0 then 0 else v {x} * indicator_fn a x) =
   2.136      v {x} * indicator_fn a x" using eq_0 by auto
   2.137  
   2.138    from measure_space.measure_real_sum_image[OF ms_v, of a]
     3.1 --- a/src/HOL/Probability/Measure.thy	Mon May 03 14:35:10 2010 +0200
     3.2 +++ b/src/HOL/Probability/Measure.thy	Mon May 03 14:35:10 2010 +0200
     3.3 @@ -365,6 +365,18 @@
     3.4      by arith
     3.5  qed
     3.6  
     3.7 +lemma (in measure_space) measure_mono:
     3.8 +  assumes "a \<subseteq> b" "a \<in> sets M" "b \<in> sets M"
     3.9 +  shows "measure M a \<le> measure M b"
    3.10 +proof -
    3.11 +  have "b = a \<union> (b - a)" using assms by auto
    3.12 +  moreover have "{} = a \<inter> (b - a)" by auto
    3.13 +  ultimately have "measure M b = measure M a + measure M (b - a)"
    3.14 +    using measure_additive[of a "b - a"] local.Diff[of b a] assms by auto
    3.15 +  moreover have "measure M (b - a) \<ge> 0" using positive assms by auto
    3.16 +  ultimately show "measure M a \<le> measure M b" by auto
    3.17 +qed
    3.18 +
    3.19  lemma disjoint_family_Suc:
    3.20    assumes Suc: "!!n. A n \<subseteq> A (Suc n)"
    3.21    shows "disjoint_family (\<lambda>i. A (Suc i) - A i)"
    3.22 @@ -1045,4 +1057,12 @@
    3.23    qed
    3.24  qed
    3.25  
    3.26 +locale finite_measure_space = measure_space +
    3.27 +  assumes finite_space: "finite (space M)"
    3.28 +  and sets_eq_Pow: "sets M = Pow (space M)"
    3.29 +
    3.30 +lemma (in finite_measure_space) sum_over_space: "(\<Sum>x\<in>space M. measure M {x}) = measure M (space M)"
    3.31 +  using measure_finitely_additive''[of "space M" "\<lambda>i. {i}"]
    3.32 +  by (simp add: sets_eq_Pow disjoint_family_on_def finite_space)
    3.33 +
    3.34  end
    3.35 \ No newline at end of file
     4.1 --- a/src/HOL/Probability/Probability_Space.thy	Mon May 03 14:35:10 2010 +0200
     4.2 +++ b/src/HOL/Probability/Probability_Space.thy	Mon May 03 14:35:10 2010 +0200
     4.3 @@ -21,22 +21,23 @@
     4.4  definition
     4.5    "distribution X = (\<lambda>s. prob ((X -` s) \<inter> (space M)))"
     4.6  
     4.7 -definition
     4.8 -  "probably e \<longleftrightarrow> e \<in> events \<and> prob e = 1"
     4.9 +abbreviation
    4.10 +  "joint_distribution X Y \<equiv> distribution (\<lambda>x. (X x, Y x))"
    4.11  
    4.12 -definition
    4.13 -  "possibly e \<longleftrightarrow> e \<in> events \<and> prob e \<noteq> 0"
    4.14 +(*
    4.15 +definition probably :: "('a \<Rightarrow> bool) \<Rightarrow> bool" (binder "\<forall>\<^sup>*" 10) where
    4.16 +  "probably P \<longleftrightarrow> { x. P x } \<in> events \<and> prob { x. P x } = 1"
    4.17 +definition possibly :: "('a \<Rightarrow> bool) \<Rightarrow> bool" (binder "\<exists>\<^sup>*" 10) where
    4.18 +  "possibly P \<longleftrightarrow> { x. P x } \<in> events \<and> prob { x. P x } \<noteq> 0"
    4.19 +*)
    4.20  
    4.21  definition
    4.22 -  "joint_distribution X Y \<equiv> (\<lambda>a. prob ((\<lambda>x. (X x, Y x)) -` a \<inter> space M))"
    4.23 +  "conditional_expectation X M' \<equiv> SOME f. f \<in> measurable M' borel_space \<and>
    4.24 +    (\<forall> g \<in> sets M'. measure_space.integral M' (\<lambda>x. f x * indicator_fn g x) =
    4.25 +                    measure_space.integral M' (\<lambda>x. X x * indicator_fn g x))"
    4.26  
    4.27  definition
    4.28 -  "conditional_expectation X s \<equiv> THE f. random_variable borel_space f \<and>
    4.29 -    (\<forall> g \<in> s. integral (\<lambda>x. f x * indicator_fn g x) =
    4.30 -              integral (\<lambda>x. X x * indicator_fn g x))"
    4.31 -
    4.32 -definition
    4.33 -  "conditional_prob e1 e2 \<equiv> conditional_expectation (indicator_fn e1) e2"
    4.34 +  "conditional_prob E M' \<equiv> conditional_expectation (indicator_fn E) M'"
    4.35  
    4.36  lemma positive': "positive M prob"
    4.37    unfolding positive_def using positive empty_measure by blast
    4.38 @@ -389,14 +390,61 @@
    4.39  
    4.40  locale finite_prob_space = prob_space + finite_measure_space
    4.41  
    4.42 -lemma (in finite_prob_space) finite_marginal_product_space_POW2:
    4.43 +lemma finite_prob_space_eq:
    4.44 +  "finite_prob_space M \<longleftrightarrow> finite_measure_space M \<and> measure M (space M) = 1"
    4.45 +  unfolding finite_prob_space_def finite_measure_space_def prob_space_def prob_space_axioms_def
    4.46 +  by auto
    4.47 +
    4.48 +lemma (in prob_space) not_empty: "space M \<noteq> {}"
    4.49 +  using prob_space empty_measure by auto
    4.50 +
    4.51 +lemma (in finite_prob_space) sum_over_space_eq_1: "(\<Sum>x\<in>space M. measure M {x}) = 1"
    4.52 +  using prob_space sum_over_space by simp
    4.53 +
    4.54 +lemma (in finite_prob_space) positive_distribution: "0 \<le> distribution X x"
    4.55 +  unfolding distribution_def using positive sets_eq_Pow by simp
    4.56 +
    4.57 +lemma (in finite_prob_space) joint_distribution_restriction_fst:
    4.58 +  "joint_distribution X Y A \<le> distribution X (fst ` A)"
    4.59 +  unfolding distribution_def
    4.60 +proof (safe intro!: measure_mono)
    4.61 +  fix x assume "x \<in> space M" and *: "(X x, Y x) \<in> A"
    4.62 +  show "x \<in> X -` fst ` A"
    4.63 +    by (auto intro!: image_eqI[OF _ *])
    4.64 +qed (simp_all add: sets_eq_Pow)
    4.65 +
    4.66 +lemma (in finite_prob_space) joint_distribution_restriction_snd:
    4.67 +  "joint_distribution X Y A \<le> distribution Y (snd ` A)"
    4.68 +  unfolding distribution_def
    4.69 +proof (safe intro!: measure_mono)
    4.70 +  fix x assume "x \<in> space M" and *: "(X x, Y x) \<in> A"
    4.71 +  show "x \<in> Y -` snd ` A"
    4.72 +    by (auto intro!: image_eqI[OF _ *])
    4.73 +qed (simp_all add: sets_eq_Pow)
    4.74 +
    4.75 +lemma (in finite_prob_space) distribution_order:
    4.76 +  shows "0 \<le> distribution X x'"
    4.77 +  and "(distribution X x' \<noteq> 0) \<longleftrightarrow> (0 < distribution X x')"
    4.78 +  and "r \<le> joint_distribution X Y {(x, y)} \<Longrightarrow> r \<le> distribution X {x}"
    4.79 +  and "r \<le> joint_distribution X Y {(x, y)} \<Longrightarrow> r \<le> distribution Y {y}"
    4.80 +  and "r < joint_distribution X Y {(x, y)} \<Longrightarrow> r < distribution X {x}"
    4.81 +  and "r < joint_distribution X Y {(x, y)} \<Longrightarrow> r < distribution Y {y}"
    4.82 +  and "distribution X {x} = 0 \<Longrightarrow> joint_distribution X Y {(x, y)} = 0"
    4.83 +  and "distribution Y {y} = 0 \<Longrightarrow> joint_distribution X Y {(x, y)} = 0"
    4.84 +  using positive_distribution[of X x']
    4.85 +    positive_distribution[of "\<lambda>x. (X x, Y x)" "{(x, y)}"]
    4.86 +    joint_distribution_restriction_fst[of X Y "{(x, y)}"]
    4.87 +    joint_distribution_restriction_snd[of X Y "{(x, y)}"]
    4.88 +  by auto
    4.89 +
    4.90 +lemma (in finite_prob_space) finite_product_measure_space:
    4.91    assumes "finite s1" "finite s2"
    4.92    shows "finite_measure_space \<lparr> space = s1 \<times> s2, sets = Pow (s1 \<times> s2), measure = joint_distribution X Y\<rparr>"
    4.93      (is "finite_measure_space ?M")
    4.94  proof (rule finite_Pow_additivity_sufficient)
    4.95    show "positive ?M (measure ?M)"
    4.96      unfolding positive_def using positive'[unfolded positive_def] assms sets_eq_Pow
    4.97 -    by (simp add: joint_distribution_def)
    4.98 +    by (simp add: distribution_def)
    4.99  
   4.100    show "additive ?M (measure ?M)" unfolding additive_def
   4.101    proof safe
   4.102 @@ -406,7 +454,7 @@
   4.103      assume "x \<inter> y = {}"
   4.104      from additive[unfolded additive_def, rule_format, OF A B] this
   4.105      show "measure ?M (x \<union> y) = measure ?M x + measure ?M y"
   4.106 -      apply (simp add: joint_distribution_def)
   4.107 +      apply (simp add: distribution_def)
   4.108        apply (subst Int_Un_distrib2)
   4.109        by auto
   4.110    qed
   4.111 @@ -418,11 +466,58 @@
   4.112      by simp
   4.113  qed
   4.114  
   4.115 -lemma (in finite_prob_space) finite_marginal_product_space_POW:
   4.116 +lemma (in finite_prob_space) finite_product_measure_space_of_images:
   4.117    shows "finite_measure_space \<lparr> space = X ` space M \<times> Y ` space M,
   4.118                                  sets = Pow (X ` space M \<times> Y ` space M),
   4.119                                  measure = joint_distribution X Y\<rparr>"
   4.120      (is "finite_measure_space ?M")
   4.121 -  using finite_space by (auto intro!: finite_marginal_product_space_POW2)
   4.122 +  using finite_space by (auto intro!: finite_product_measure_space)
   4.123 +
   4.124 +lemma (in finite_prob_space) finite_measure_space:
   4.125 +  shows "finite_measure_space \<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   4.126 +    (is "finite_measure_space ?S")
   4.127 +proof (rule finite_Pow_additivity_sufficient, simp_all)
   4.128 +  show "finite (X ` space M)" using finite_space by simp
   4.129 +
   4.130 +  show "positive ?S (distribution X)" unfolding distribution_def
   4.131 +    unfolding positive_def using positive'[unfolded positive_def] sets_eq_Pow by auto
   4.132 +
   4.133 +  show "additive ?S (distribution X)" unfolding additive_def distribution_def
   4.134 +  proof (simp, safe)
   4.135 +    fix x y
   4.136 +    have x: "(X -` x) \<inter> space M \<in> sets M"
   4.137 +      and y: "(X -` y) \<inter> space M \<in> sets M" using sets_eq_Pow by auto
   4.138 +    assume "x \<inter> y = {}"
   4.139 +    from additive[unfolded additive_def, rule_format, OF x y] this
   4.140 +    have "prob (((X -` x) \<union> (X -` y)) \<inter> space M) =
   4.141 +      prob ((X -` x) \<inter> space M) + prob ((X -` y) \<inter> space M)"
   4.142 +      apply (subst Int_Un_distrib2)
   4.143 +      by auto
   4.144 +    thus "prob ((X -` x \<union> X -` y) \<inter> space M) = prob (X -` x \<inter> space M) + prob (X -` y \<inter> space M)"
   4.145 +      by auto
   4.146 +  qed
   4.147 +qed
   4.148 +
   4.149 +lemma (in finite_prob_space) finite_prob_space_of_images:
   4.150 +  "finite_prob_space \<lparr> space = X ` space M, sets = Pow (X ` space M), measure = distribution X\<rparr>"
   4.151 +  (is "finite_prob_space ?S")
   4.152 +proof (simp add: finite_prob_space_eq, safe)
   4.153 +  show "finite_measure_space ?S" by (rule finite_measure_space)
   4.154 +  have "X -` X ` space M \<inter> space M = space M" by auto
   4.155 +  thus "distribution X (X`space M) = 1"
   4.156 +    by (simp add: distribution_def prob_space)
   4.157 +qed
   4.158 +
   4.159 +lemma (in finite_prob_space) finite_product_prob_space_of_images:
   4.160 +  "finite_prob_space \<lparr> space = X ` space M \<times> Y ` space M, sets = Pow (X ` space M \<times> Y ` space M), 
   4.161 +    measure = joint_distribution X Y\<rparr>"
   4.162 +  (is "finite_prob_space ?S")
   4.163 +proof (simp add: finite_prob_space_eq, safe)
   4.164 +  show "finite_measure_space ?S" by (rule finite_product_measure_space_of_images)
   4.165 +
   4.166 +  have "X -` X ` space M \<inter> Y -` Y ` space M \<inter> space M = space M" by auto
   4.167 +  thus "joint_distribution X Y (X ` space M \<times> Y ` space M) = 1"
   4.168 +    by (simp add: distribution_def prob_space vimage_Times comp_def)
   4.169 +qed
   4.170  
   4.171  end
     5.1 --- a/src/HOL/Probability/ex/Dining_Cryptographers.thy	Mon May 03 14:35:10 2010 +0200
     5.2 +++ b/src/HOL/Probability/ex/Dining_Cryptographers.thy	Mon May 03 14:35:10 2010 +0200
     5.3 @@ -2,10 +2,10 @@
     5.4  imports Information
     5.5  begin
     5.6  
     5.7 -lemma finite_prob_spaceI:
     5.8 -  "\<lbrakk> finite_measure_space M ; measure M (space M) = 1 \<rbrakk> \<Longrightarrow> finite_prob_space M"
     5.9 -  unfolding finite_measure_space_def finite_measure_space_axioms_def
    5.10 -    finite_prob_space_def prob_space_def prob_space_axioms_def
    5.11 +lemma finite_information_spaceI:
    5.12 +  "\<lbrakk> finite_measure_space M ; measure M (space M) = 1 ; 1 < b \<rbrakk> \<Longrightarrow> finite_information_space M b"
    5.13 +  unfolding finite_information_space_def finite_measure_space_def finite_measure_space_axioms_def
    5.14 +    finite_prob_space_def prob_space_def prob_space_axioms_def finite_information_space_axioms_def
    5.15    by auto
    5.16  
    5.17  locale finite_space =
    5.18 @@ -21,8 +21,8 @@
    5.19    and measure_M[simp]: "measure M s = real (card s) / real (card S)"
    5.20    by (simp_all add: M_def)
    5.21  
    5.22 -sublocale finite_space \<subseteq> finite_prob_space M
    5.23 -proof (rule finite_prob_spaceI)
    5.24 +sublocale finite_space \<subseteq> finite_information_space M 2
    5.25 +proof (rule finite_information_spaceI)
    5.26    let ?measure = "\<lambda>s::'a set. real (card s) / real (card S)"
    5.27  
    5.28    show "finite_measure_space M"
    5.29 @@ -40,9 +40,7 @@
    5.30          by (cases "card S = 0") (simp_all add: field_simps)
    5.31      qed
    5.32    qed
    5.33 -
    5.34 -  show "measure M (space M) = 1" by simp
    5.35 -qed
    5.36 +qed simp_all
    5.37  
    5.38  lemma set_of_list_extend:
    5.39    "{xs. length xs = Suc n \<and> (\<forall>x\<in>set xs. x \<in> A)} =
    5.40 @@ -83,19 +81,6 @@
    5.41    and card_list_length: "card {xs. length xs = n \<and> (\<forall>x\<in>set xs. x \<in> A)} = (card A)^n"
    5.42    using card_finite_list_length[OF assms, of n] by auto
    5.43  
    5.44 -lemma product_not_empty:
    5.45 -  "A \<noteq> {} \<Longrightarrow> B \<noteq> {} \<Longrightarrow> A \<times> B \<noteq> {}"
    5.46 -  by auto
    5.47 -
    5.48 -lemma fst_product[simp]: "fst ` (A \<times> B) = (if B = {} then {} else A)"
    5.49 -  by (auto intro!: image_eqI)
    5.50 -
    5.51 -lemma snd_product[simp]: "snd ` (A \<times> B) = (if A = {} then {} else B)"
    5.52 -  by (auto intro!: image_eqI)
    5.53 -
    5.54 -lemma Ex_eq_length[simp]: "\<exists>xs. length xs = n"
    5.55 -  by (rule exI[of _ "replicate n undefined"]) simp
    5.56 -
    5.57  section "Define the state space"
    5.58  
    5.59  text {*
    5.60 @@ -197,10 +182,10 @@
    5.61    have *: "{xs. length xs = n} \<noteq> {}"
    5.62      by (auto intro!: exI[of _ "replicate n undefined"])
    5.63    show ?thesis
    5.64 -    unfolding payer_def_raw dc_crypto fst_product if_not_P[OF *] ..
    5.65 +    unfolding payer_def_raw dc_crypto fst_image_times if_not_P[OF *] ..
    5.66  qed
    5.67  
    5.68 -lemma image_ex1_eq: "inj_on f A \<Longrightarrow> (b \<in> f ` A) = (\<exists>!x \<in> A. b = f x)"
    5.69 +lemma image_ex1_eq: "inj_on f A \<Longrightarrow> (b \<in> f ` A) \<longleftrightarrow> (\<exists>!x \<in> A. b = f x)"
    5.70    by (unfold inj_on_def) blast
    5.71  
    5.72  lemma Ex1_eq: "\<exists>! x. P x \<Longrightarrow> P x \<Longrightarrow> P y \<Longrightarrow> x = y"
    5.73 @@ -495,26 +480,24 @@
    5.74    show "finite dc_crypto" using finite_dc_crypto .
    5.75    show "dc_crypto \<noteq> {}"
    5.76      unfolding dc_crypto
    5.77 -    apply (rule product_not_empty)
    5.78      using n_gt_3 by (auto intro: exI[of _ "replicate n True"])
    5.79  qed
    5.80  
    5.81  notation (in dining_cryptographers_space)
    5.82 -  finite_mutual_information_2 ("\<I>'( _ ; _ ')")
    5.83 +  finite_mutual_information ("\<I>'( _ ; _ ')")
    5.84  
    5.85  notation (in dining_cryptographers_space)
    5.86 -  finite_entropy_2 ("\<H>'( _ ')")
    5.87 +  finite_entropy ("\<H>'( _ ')")
    5.88  
    5.89  notation (in dining_cryptographers_space)
    5.90 -  finite_conditional_entropy_2 ("\<H>'( _ | _ ')")
    5.91 +  finite_conditional_entropy ("\<H>'( _ | _ ')")
    5.92  
    5.93  theorem (in dining_cryptographers_space)
    5.94    "\<I>( inversion ; payer ) = 0"
    5.95  proof -
    5.96 -  have b: "1 < (2 :: real)" by simp
    5.97    have n: "0 < n" using n_gt_3 by auto
    5.98  
    5.99 -  have lists: "{xs. length xs = n} \<noteq> {}" by auto
   5.100 +  have lists: "{xs. length xs = n} \<noteq> {}" using Ex_list_of_length by auto
   5.101  
   5.102    have card_image_inversion:
   5.103      "real (card (inversion ` dc_crypto)) = 2^n / 2"
   5.104 @@ -526,7 +509,7 @@
   5.105  
   5.106    { have "\<H>(inversion|payer) =
   5.107          - (\<Sum>x\<in>inversion`dc_crypto. (\<Sum>z\<in>Some ` {0..<n}. ?dIP {(x, z)} * log 2 (?dIP {(x, z)} / ?dP {z})))"
   5.108 -      unfolding finite_conditional_entropy_reduce[OF b] joint_distribution
   5.109 +      unfolding conditional_entropy_eq
   5.110        by (simp add: image_payer_dc_crypto setsum_Sigma)
   5.111      also have "... =
   5.112          - (\<Sum>x\<in>inversion`dc_crypto. (\<Sum>z\<in>Some ` {0..<n}. 2 / (real n * 2^n) * (1 - real n)))"
   5.113 @@ -560,7 +543,7 @@
   5.114      finally have "\<H>(inversion|payer) = real n - 1" . }
   5.115    moreover
   5.116    { have "\<H>(inversion) = - (\<Sum>x \<in> inversion`dc_crypto. ?dI {x} * log 2 (?dI {x}))"
   5.117 -      unfolding finite_entropy_reduce[OF b] by simp
   5.118 +      unfolding entropy_eq by simp
   5.119      also have "... = - (\<Sum>x \<in> inversion`dc_crypto. 2 * (1 - real n) / 2^n)"
   5.120        unfolding neg_equal_iff_equal
   5.121      proof (rule setsum_cong[OF refl])
   5.122 @@ -577,7 +560,7 @@
   5.123      finally have "\<H>(inversion) = real n - 1" .
   5.124    }
   5.125    ultimately show ?thesis
   5.126 -    unfolding finite_mutual_information_eq_entropy_conditional_entropy[OF b]
   5.127 +    unfolding mutual_information_eq_entropy_conditional_entropy
   5.128      by simp
   5.129  qed
   5.130