src/HOL/Probability/Information.thy
changeset 40859 de0b30e6c2d2
parent 39302 d7728f65b353
child 41023 9118eb4eb8dc
equal deleted inserted replaced
40854:f2c9ebbe04aa 40859:de0b30e6c2d2
     1 theory Information
     1 theory Information
     2 imports Probability_Space Product_Measure Convex Radon_Nikodym
     2 imports Probability_Space Convex Lebesgue_Measure
     3 begin
     3 begin
     4 
     4 
     5 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
     5 lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
     6   by (subst log_le_cancel_iff) auto
     6   by (subst log_le_cancel_iff) auto
     7 
     7 
     9   by (subst log_less_cancel_iff) auto
     9   by (subst log_less_cancel_iff) auto
    10 
    10 
    11 lemma setsum_cartesian_product':
    11 lemma setsum_cartesian_product':
    12   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
    12   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
    13   unfolding setsum_cartesian_product by simp
    13   unfolding setsum_cartesian_product by simp
    14 
       
    15 lemma real_of_pinfreal_inverse[simp]:
       
    16   fixes X :: pinfreal
       
    17   shows "real (inverse X) = 1 / real X"
       
    18   by (cases X) (auto simp: inverse_eq_divide)
       
    19 
       
    20 lemma (in finite_prob_space) finite_product_prob_space_of_images:
       
    21   "finite_prob_space \<lparr> space = X ` space M \<times> Y ` space M, sets = Pow (X ` space M \<times> Y ` space M)\<rparr>
       
    22                      (joint_distribution X Y)"
       
    23   (is "finite_prob_space ?S _")
       
    24 proof (simp add: finite_prob_space_eq finite_product_measure_space_of_images)
       
    25   have "X -` X ` space M \<inter> Y -` Y ` space M \<inter> space M = space M" by auto
       
    26   thus "joint_distribution X Y (X ` space M \<times> Y ` space M) = 1"
       
    27     by (simp add: distribution_def prob_space vimage_Times comp_def measure_space_1)
       
    28 qed
       
    29 
       
    30 lemma (in finite_prob_space) finite_measure_space_prod:
       
    31   assumes X: "finite_measure_space MX (distribution X)"
       
    32   assumes Y: "finite_measure_space MY (distribution Y)"
       
    33   shows "finite_measure_space (prod_measure_space MX MY) (joint_distribution X Y)"
       
    34     (is "finite_measure_space ?M ?D")
       
    35 proof (intro finite_measure_spaceI)
       
    36   interpret X: finite_measure_space MX "distribution X" by fact
       
    37   interpret Y: finite_measure_space MY "distribution Y" by fact
       
    38   note finite_measure_space.finite_prod_measure_space[OF X Y, simp]
       
    39   show "finite (space ?M)" using X.finite_space Y.finite_space by auto
       
    40   show "joint_distribution X Y {} = 0" by simp
       
    41   show "sets ?M = Pow (space ?M)" by simp
       
    42   { fix x show "?D (space ?M) \<noteq> \<omega>" by (rule distribution_finite) }
       
    43   { fix A B assume "A \<subseteq> space ?M" "B \<subseteq> space ?M" "A \<inter> B = {}"
       
    44     have *: "(\<lambda>t. (X t, Y t)) -` (A \<union> B) \<inter> space M =
       
    45              (\<lambda>t. (X t, Y t)) -` A \<inter> space M \<union> (\<lambda>t. (X t, Y t)) -` B \<inter> space M"
       
    46       by auto
       
    47     show "?D (A \<union> B) = ?D A + ?D B" unfolding distribution_def *
       
    48       apply (rule measure_additive[symmetric])
       
    49       using `A \<inter> B = {}` by (auto simp: sets_eq_Pow) }
       
    50 qed
       
    51 
    14 
    52 section "Convex theory"
    15 section "Convex theory"
    53 
    16 
    54 lemma log_setsum:
    17 lemma log_setsum:
    55   assumes "finite s" "s \<noteq> {}"
    18   assumes "finite s" "s \<noteq> {}"
   146   qed
   109   qed
   147   finally show ?thesis .
   110   finally show ?thesis .
   148 qed
   111 qed
   149 
   112 
   150 lemma split_pairs:
   113 lemma split_pairs:
   151   shows
   114   "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   152     "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   115   "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   153     "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
       
   154 
   116 
   155 section "Information theory"
   117 section "Information theory"
   156 
   118 
   157 locale finite_information_space = finite_prob_space +
   119 locale information_space = prob_space +
   158   fixes b :: real assumes b_gt_1: "1 < b"
   120   fixes b :: real assumes b_gt_1: "1 < b"
   159 
   121 
   160 context finite_information_space
   122 context information_space
   161 begin
   123 begin
   162 
   124 
   163 lemma
   125 text {* Introduce some simplification rules for logarithm of base @{term b}. *}
   164   assumes "0 \<le> A" and pos: "0 < A \<Longrightarrow> 0 < B" "0 < A \<Longrightarrow> 0 < C"
   126 
   165   shows mult_log_mult: "A * log b (B * C) = A * log b B + A * log b C" (is "?mult")
   127 lemma log_neg_const:
   166   and mult_log_divide: "A * log b (B / C) = A * log b B - A * log b C" (is "?div")
   128   assumes "x \<le> 0"
   167 proof -
   129   shows "log b x = log b 0"
   168   have "?mult \<and> ?div"
   130 proof -
   169   proof (cases "A = 0")
   131   { fix u :: real
   170     case False
   132     have "x \<le> 0" by fact
   171     hence "0 < A" using `0 \<le> A` by auto
   133     also have "0 < exp u"
   172       with pos[OF this] show "?mult \<and> ?div" using b_gt_1
   134       using exp_gt_zero .
   173         by (auto simp: log_divide log_mult field_simps)
   135     finally have "exp u \<noteq> x"
   174   qed simp
   136       by auto }
   175   thus ?mult and ?div by auto
   137   then show "log b x = log b 0"
   176 qed
   138     by (simp add: log_def ln_def)
   177 
   139 qed
   178 ML {*
   140 
   179 
   141 lemma log_mult_eq:
   180   (* tactic to solve equations of the form @{term "W * log b (X / (Y * Z)) = W * log b X - W * log b (Y * Z)"}
   142   "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
   181      where @{term W} is a joint distribution of @{term X}, @{term Y}, and @{term Z}. *)
   143   using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
   182 
   144   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   183   val mult_log_intros = [@{thm mult_log_divide}, @{thm mult_log_mult}]
   145 
   184   val intros = [@{thm divide_pos_pos}, @{thm mult_pos_pos}, @{thm real_pinfreal_nonneg},
   146 lemma log_inverse_eq:
   185     @{thm real_distribution_divide_pos_pos},
   147   "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
   186     @{thm real_distribution_mult_inverse_pos_pos},
   148   using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
   187     @{thm real_distribution_mult_pos_pos}]
   149 
   188 
   150 lemma log_divide_eq:
   189   val distribution_gt_0_tac = (rtac @{thm distribution_mono_gt_0}
   151   "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
   190     THEN' assume_tac
   152   unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
   191     THEN' clarsimp_tac (clasimpset_of @{context} addsimps2 @{thms split_pairs}))
   153   by (auto simp: zero_less_mult_iff mult_le_0_iff)
   192 
   154 
   193   val distr_mult_log_eq_tac = REPEAT_ALL_NEW (CHANGED o TRY o
   155 lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
   194     (resolve_tac (mult_log_intros @ intros)
       
   195       ORELSE' distribution_gt_0_tac
       
   196       ORELSE' clarsimp_tac (clasimpset_of @{context})))
       
   197 
       
   198   fun instanciate_term thy redex intro =
       
   199     let
       
   200       val intro_concl = Thm.concl_of intro
       
   201 
       
   202       val lhs = intro_concl |> HOLogic.dest_Trueprop |> HOLogic.dest_eq |> fst
       
   203 
       
   204       val m = SOME (Pattern.match thy (lhs, redex) (Vartab.empty, Vartab.empty))
       
   205         handle Pattern.MATCH => NONE
       
   206 
       
   207     in
       
   208       Option.map (fn m => Envir.subst_term m intro_concl) m
       
   209     end
       
   210 
       
   211   fun mult_log_simproc simpset redex =
       
   212   let
       
   213     val ctxt = Simplifier.the_context simpset
       
   214     val thy = ProofContext.theory_of ctxt
       
   215     fun prove (SOME thm) = (SOME
       
   216           (Goal.prove ctxt [] [] thm (K (distr_mult_log_eq_tac 1))
       
   217            |> mk_meta_eq)
       
   218             handle THM _ => NONE)
       
   219       | prove NONE = NONE
       
   220   in
       
   221     get_first (instanciate_term thy (term_of redex) #> prove) mult_log_intros
       
   222   end
       
   223 *}
       
   224 
       
   225 simproc_setup mult_log ("real (distribution X x) * log b (A * B)" |
       
   226                         "real (distribution X x) * log b (A / B)") = {* K mult_log_simproc *}
       
   227 
   156 
   228 end
   157 end
   229 
   158 
   230 subsection "Kullback$-$Leibler divergence"
   159 subsection "Kullback$-$Leibler divergence"
   231 
   160 
   234 
   163 
   235 definition
   164 definition
   236   "KL_divergence b M \<mu> \<nu> =
   165   "KL_divergence b M \<mu> \<nu> =
   237     measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
   166     measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
   238 
   167 
       
   168 lemma (in sigma_finite_measure) KL_divergence_cong:
       
   169   assumes "measure_space M \<nu>"
       
   170   and cong: "\<And>A. A \<in> sets M \<Longrightarrow> \<mu>' A = \<mu> A" "\<And>A. A \<in> sets M \<Longrightarrow> \<nu>' A = \<nu> A"
       
   171   shows "KL_divergence b M \<nu>' \<mu>' = KL_divergence b M \<nu> \<mu>"
       
   172 proof -
       
   173   interpret \<nu>: measure_space M \<nu> by fact
       
   174   show ?thesis
       
   175     unfolding KL_divergence_def
       
   176     using RN_deriv_cong[OF cong, of "\<lambda>A. A"]
       
   177     by (simp add: cong \<nu>.integral_cong_measure[OF cong(2)])
       
   178 qed
       
   179 
       
   180 lemma (in sigma_finite_measure) KL_divergence_vimage:
       
   181   assumes f: "bij_betw f S (space M)"
       
   182   assumes \<nu>: "measure_space M \<nu>" "absolutely_continuous \<nu>"
       
   183   shows "KL_divergence b (vimage_algebra S f) (\<lambda>A. \<nu> (f ` A)) (\<lambda>A. \<mu> (f ` A)) = KL_divergence b M \<nu> \<mu>"
       
   184     (is "KL_divergence b ?M ?\<nu> ?\<mu> = _")
       
   185 proof -
       
   186   interpret \<nu>: measure_space M \<nu> by fact
       
   187   interpret v: measure_space ?M ?\<nu>
       
   188     using f by (rule \<nu>.measure_space_isomorphic)
       
   189 
       
   190   let ?RN = "sigma_finite_measure.RN_deriv ?M ?\<mu> ?\<nu>"
       
   191   from RN_deriv_vimage[OF f \<nu>]
       
   192   have *: "\<nu>.almost_everywhere (\<lambda>x. ?RN (the_inv_into S f x) = RN_deriv \<nu> x)"
       
   193     by (rule absolutely_continuous_AE[OF \<nu>])
       
   194 
       
   195   show ?thesis
       
   196     unfolding KL_divergence_def \<nu>.integral_vimage_inv[OF f]
       
   197     apply (rule \<nu>.integral_cong_AE)
       
   198     apply (rule \<nu>.AE_mp[OF *])
       
   199     apply (rule \<nu>.AE_cong)
       
   200     apply simp
       
   201     done
       
   202 qed
       
   203 
   239 lemma (in finite_measure_space) KL_divergence_eq_finite:
   204 lemma (in finite_measure_space) KL_divergence_eq_finite:
   240   assumes v: "finite_measure_space M \<nu>"
   205   assumes v: "finite_measure_space M \<nu>"
   241   assumes ac: "\<forall>x\<in>space M. \<mu> {x} = 0 \<longrightarrow> \<nu> {x} = 0"
   206   assumes ac: "absolutely_continuous \<nu>"
   242   shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
   207   shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
   243 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
   208 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
   244   interpret v: finite_measure_space M \<nu> by fact
   209   interpret v: finite_measure_space M \<nu> by fact
   245   have ms: "measure_space M \<nu>" by fact
   210   have ms: "measure_space M \<nu>" by fact
   246   have ac: "absolutely_continuous \<nu>"
       
   247     using ac by (auto intro!: absolutely_continuousI[OF v])
       
   248   show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
   211   show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
   249     using RN_deriv_finite_measure[OF ms ac]
   212     using RN_deriv_finite_measure[OF ms ac]
   250     by (auto intro!: setsum_cong simp: field_simps real_of_pinfreal_mult[symmetric])
   213     by (auto intro!: setsum_cong simp: field_simps real_of_pinfreal_mult[symmetric])
   251 qed
   214 qed
   252 
   215 
   253 lemma (in finite_prob_space) KL_divergence_positive_finite:
   216 lemma (in finite_prob_space) KL_divergence_positive_finite:
   254   assumes v: "finite_prob_space M \<nu>"
   217   assumes v: "finite_prob_space M \<nu>"
   255   assumes ac: "\<And>x. \<lbrakk> x \<in> space M ; \<mu> {x} = 0 \<rbrakk> \<Longrightarrow> \<nu> {x} = 0"
   218   assumes ac: "absolutely_continuous \<nu>"
   256   and "1 < b"
   219   and "1 < b"
   257   shows "0 \<le> KL_divergence b M \<nu> \<mu>"
   220   shows "0 \<le> KL_divergence b M \<nu> \<mu>"
   258 proof -
   221 proof -
   259   interpret v: finite_prob_space M \<nu> using v .
   222   interpret v: finite_prob_space M \<nu> using v .
   260 
   223   have ms: "finite_measure_space M \<nu>" by default
   261   have *: "space M \<noteq> {}" using not_empty by simp
   224 
   262 
   225   have "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   263   hence "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   226   proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
   264   proof (subst KL_divergence_eq_finite)
   227     show "finite (space M)" using finite_space by simp
   265     show "finite_measure_space  M \<nu>" by fact
   228     show "1 < b" by fact
   266 
   229     show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
   267     show "\<forall>x\<in>space M. \<mu> {x} = 0 \<longrightarrow> \<nu> {x} = 0" using ac by auto
   230 
   268     show "- (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x}))) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   231     fix x assume "x \<in> space M"
   269     proof (safe intro!: log_setsum_divide *)
   232     then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
   270       show "finite (space M)" using finite_space by simp
   233     { assume "0 < real (\<nu> {x})"
   271       show "1 < b" by fact
   234       then have "\<nu> {x} \<noteq> 0" by auto
   272       show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
   235       then have "\<mu> {x} \<noteq> 0"
   273 
   236         using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
   274       fix x assume x: "x \<in> space M"
   237       thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
   275       { assume "0 < real (\<nu> {x})"
   238   qed auto
   276         hence "\<mu> {x} \<noteq> 0" using ac[OF x] by auto
       
   277         thus "0 < prob {x}" using finite_measure[of "{x}"] sets_eq_Pow x
       
   278           by (cases "\<mu> {x}") simp_all }
       
   279     qed auto
       
   280   qed
       
   281   thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
   239   thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
   282 qed
   240 qed
   283 
   241 
   284 subsection {* Mutual Information *}
   242 subsection {* Mutual Information *}
   285 
   243 
   286 definition (in prob_space)
   244 definition (in prob_space)
   287   "mutual_information b S T X Y =
   245   "mutual_information b S T X Y =
   288     KL_divergence b (prod_measure_space S T)
   246     KL_divergence b (sigma (pair_algebra S T))
   289       (joint_distribution X Y)
   247       (joint_distribution X Y)
   290       (prod_measure S (distribution X) T (distribution Y))"
   248       (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y))"
   291 
   249 
   292 abbreviation (in finite_information_space)
   250 definition (in prob_space)
   293   finite_mutual_information ("\<I>'(_ ; _')") where
   251   "entropy b s X = mutual_information b s s X X"
       
   252 
       
   253 abbreviation (in information_space)
       
   254   mutual_information_Pow ("\<I>'(_ ; _')") where
   294   "\<I>(X ; Y) \<equiv> mutual_information b
   255   "\<I>(X ; Y) \<equiv> mutual_information b
   295     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   256     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   296     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   257     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   297 
   258 
   298 lemma (in finite_information_space) mutual_information_generic_eq:
   259 lemma (in information_space) mutual_information_commute_generic:
   299   assumes MX: "finite_measure_space MX (distribution X)"
   260   assumes X: "random_variable S X" and Y: "random_variable T Y"
   300   assumes MY: "finite_measure_space MY (distribution Y)"
   261   assumes ac: "measure_space.absolutely_continuous (sigma (pair_algebra S T))
   301   shows "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   262    (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
   302       real (joint_distribution X Y {(x,y)}) *
   263   shows "mutual_information b S T X Y = mutual_information b T S Y X"
   303       log b (real (joint_distribution X Y {(x,y)}) /
   264 proof -
   304       (real (distribution X {x}) * real (distribution Y {y}))))"
   265   interpret P: prob_space "sigma (pair_algebra S T)" "joint_distribution X Y"
   305 proof -
   266     using random_variable_pairI[OF X Y] by (rule distribution_prob_space)
   306   let ?P = "prod_measure_space MX MY"
   267   interpret Q: prob_space "sigma (pair_algebra T S)" "joint_distribution Y X"
   307   let ?\<mu> = "prod_measure MX (distribution X) MY (distribution Y)"
   268     using random_variable_pairI[OF Y X] by (rule distribution_prob_space)
   308   let ?\<nu> = "joint_distribution X Y"
   269   interpret X: prob_space S "distribution X" using X by (rule distribution_prob_space)
   309   interpret X: finite_measure_space MX "distribution X" by fact
   270   interpret Y: prob_space T "distribution Y" using Y by (rule distribution_prob_space)
   310   moreover interpret Y: finite_measure_space MY "distribution Y" by fact
   271   interpret ST: pair_sigma_finite S "distribution X" T "distribution Y" by default
   311   have fms: "finite_measure_space MX (distribution X)"
   272   interpret TS: pair_sigma_finite T "distribution Y" S "distribution X" by default
   312             "finite_measure_space MY (distribution Y)" by fact+
   273 
   313   have fms_P: "finite_measure_space ?P ?\<mu>"
   274   have ST: "measure_space (sigma (pair_algebra S T)) (joint_distribution X Y)" by default
   314     by (rule X.finite_measure_space_finite_prod_measure) fact
   275   have TS: "measure_space (sigma (pair_algebra T S)) (joint_distribution Y X)" by default
   315   then interpret P: finite_measure_space ?P ?\<mu> .
   276 
   316   have fms_P': "finite_measure_space ?P ?\<nu>"
   277   have bij_ST: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra S T))) (space (sigma (pair_algebra T S)))"
   317       using finite_product_measure_space[of "space MX" "space MY"]
   278     by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
   318         X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
   279   have bij_TS: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra T S))) (space (sigma (pair_algebra S T)))"
   319         X.sets_eq_Pow Y.sets_eq_Pow
   280     by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
   320       by (simp add: prod_measure_space_def sigma_def)
   281 
   321   then interpret P': finite_measure_space ?P ?\<nu> .
   282   { fix A
   322   { fix x assume "x \<in> space ?P"
   283     have "joint_distribution X Y ((\<lambda>(x, y). (y, x)) ` A) = joint_distribution Y X A"
   323     hence in_MX: "{fst x} \<in> sets MX" "{snd x} \<in> sets MY" using X.sets_eq_Pow Y.sets_eq_Pow
   284       unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
   324       by (auto simp: prod_measure_space_def)
   285   note jd_commute = this
   325     assume "?\<mu> {x} = 0"
   286 
   326     with X.finite_prod_measure_times[OF fms(2), of "{fst x}" "{snd x}"] in_MX
   287   { fix A assume A: "A \<in> sets (sigma (pair_algebra T S))"
   327     have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
   288     have *: "\<And>x y. indicator ((\<lambda>(x, y). (y, x)) ` A) (x, y) = (indicator A (y, x) :: pinfreal)"
   328       by (simp add: prod_measure_space_def)
   289       unfolding indicator_def by auto
   329     hence "joint_distribution X Y {x} = 0"
   290     have "ST.pair_measure ((\<lambda>(x, y). (y, x)) ` A) = TS.pair_measure A"
   330       by (cases x) (auto simp: distribution_order) }
   291       unfolding ST.pair_measure_def TS.pair_measure_def
   331   note measure_0 = this
   292       using A by (auto simp add: TS.Fubini[symmetric] *) }
       
   293   note pair_measure_commute = this
       
   294 
   332   show ?thesis
   295   show ?thesis
   333     unfolding Let_def mutual_information_def
   296     unfolding mutual_information_def
   334     using measure_0 fms_P fms_P' MX MY P.absolutely_continuous_def
   297     unfolding ST.KL_divergence_vimage[OF bij_TS ST ac, symmetric]
   335     by (subst P.KL_divergence_eq_finite)
   298     unfolding space_sigma space_pair_algebra jd_commute
   336        (auto simp add: prod_measure_space_def prod_measure_times_finite
   299     unfolding ST.pair_sigma_algebra_swap[symmetric]
   337          finite_prob_space_eq setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
   300     by (simp cong: TS.KL_divergence_cong[OF TS] add: pair_measure_commute)
   338 qed
   301 qed
   339 
   302 
   340 lemma (in finite_information_space)
   303 lemma (in prob_space) finite_variables_absolutely_continuous:
   341   assumes MX: "finite_prob_space MX (distribution X)"
   304   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
   342   assumes MY: "finite_prob_space MY (distribution Y)"
   305   shows "measure_space.absolutely_continuous (sigma (pair_algebra S T))
   343   and X_space: "X ` space M \<subseteq> space MX" and Y_space: "Y ` space M \<subseteq> space MY"
   306    (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
   344   shows mutual_information_eq_generic:
   307 proof -
       
   308   interpret X: finite_prob_space S "distribution X" using X by (rule distribution_finite_prob_space)
       
   309   interpret Y: finite_prob_space T "distribution Y" using Y by (rule distribution_finite_prob_space)
       
   310   interpret XY: pair_finite_prob_space S "distribution X" T "distribution Y" by default
       
   311   interpret P: finite_prob_space XY.P "joint_distribution X Y"
       
   312     using assms by (intro joint_distribution_finite_prob_space)
       
   313   show "XY.absolutely_continuous (joint_distribution X Y)"
       
   314   proof (rule XY.absolutely_continuousI)
       
   315     show "finite_measure_space XY.P (joint_distribution X Y)" by default
       
   316     fix x assume "x \<in> space XY.P" and "XY.pair_measure {x} = 0"
       
   317     then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
       
   318       and distr: "distribution X {a} * distribution Y {b} = 0"
       
   319       by (cases x) (auto simp: pair_algebra_def)
       
   320     with assms[THEN finite_random_variableD]
       
   321       joint_distribution_Times_le_fst[of S X T Y "{a}" "{b}"]
       
   322       joint_distribution_Times_le_snd[of S X T Y "{a}" "{b}"]
       
   323     have "joint_distribution X Y {x} \<le> distribution Y {b}"
       
   324          "joint_distribution X Y {x} \<le> distribution X {a}"
       
   325       by auto
       
   326     with distr show "joint_distribution X Y {x} = 0" by auto
       
   327   qed
       
   328 qed
       
   329 
       
   330 lemma (in information_space) mutual_information_commute:
       
   331   assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
       
   332   shows "mutual_information b S T X Y = mutual_information b T S Y X"
       
   333   by (intro finite_random_variableD X Y mutual_information_commute_generic finite_variables_absolutely_continuous)
       
   334 
       
   335 lemma (in information_space) mutual_information_commute_simple:
       
   336   assumes X: "simple_function X" and Y: "simple_function Y"
       
   337   shows "\<I>(X;Y) = \<I>(Y;X)"
       
   338   by (intro X Y simple_function_imp_finite_random_variable mutual_information_commute)
       
   339 
       
   340 lemma (in information_space)
       
   341   assumes MX: "finite_random_variable MX X"
       
   342   assumes MY: "finite_random_variable MY Y"
       
   343   shows mutual_information_generic_eq:
   345     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   344     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   346       real (joint_distribution X Y {(x,y)}) *
   345       real (joint_distribution X Y {(x,y)}) *
   347       log b (real (joint_distribution X Y {(x,y)}) /
   346       log b (real (joint_distribution X Y {(x,y)}) /
   348       (real (distribution X {x}) * real (distribution Y {y}))))"
   347       (real (distribution X {x}) * real (distribution Y {y}))))"
   349     (is "?equality")
   348     (is ?sum)
   350   and mutual_information_positive_generic:
   349   and mutual_information_positive_generic:
   351     "0 \<le> mutual_information b MX MY X Y" (is "?positive")
   350      "0 \<le> mutual_information b MX MY X Y" (is ?positive)
   352 proof -
   351 proof -
   353   let ?P = "prod_measure_space MX MY"
   352   interpret X: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
   354   let ?\<mu> = "prod_measure MX (distribution X) MY (distribution Y)"
   353   interpret Y: finite_prob_space MY "distribution Y" using MY by (rule distribution_finite_prob_space)
   355   let ?\<nu> = "joint_distribution X Y"
   354   interpret XY: pair_finite_prob_space MX "distribution X" MY "distribution Y" by default
   356 
   355   interpret P: finite_prob_space XY.P "joint_distribution X Y"
   357   interpret X: finite_prob_space MX "distribution X" by fact
   356     using assms by (intro joint_distribution_finite_prob_space)
   358   moreover interpret Y: finite_prob_space MY "distribution Y" by fact
   357 
   359   have ms_X: "measure_space MX (distribution X)"
   358   have P_ms: "finite_measure_space XY.P (joint_distribution X Y)" by default
   360     and ms_Y: "measure_space MY (distribution Y)"
   359   have P_ps: "finite_prob_space XY.P (joint_distribution X Y)" by default
   361     and fms: "finite_measure_space MX (distribution X)" "finite_measure_space MY (distribution Y)" by fact+
   360 
   362   have fms_P: "finite_measure_space ?P ?\<mu>"
   361   show ?sum
   363     by (rule X.finite_measure_space_finite_prod_measure) fact
       
   364   then interpret P: finite_measure_space ?P ?\<mu> .
       
   365 
       
   366   have fms_P': "finite_measure_space ?P ?\<nu>"
       
   367       using finite_product_measure_space[of "space MX" "space MY"]
       
   368         X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
       
   369         X.sets_eq_Pow Y.sets_eq_Pow
       
   370       by (simp add: prod_measure_space_def sigma_def)
       
   371   then interpret P': finite_measure_space ?P ?\<nu> .
       
   372 
       
   373   { fix x assume "x \<in> space ?P"
       
   374     hence in_MX: "{fst x} \<in> sets MX" "{snd x} \<in> sets MY" using X.sets_eq_Pow Y.sets_eq_Pow
       
   375       by (auto simp: prod_measure_space_def)
       
   376 
       
   377     assume "?\<mu> {x} = 0"
       
   378     with X.finite_prod_measure_times[OF fms(2), of "{fst x}" "{snd x}"] in_MX
       
   379     have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
       
   380       by (simp add: prod_measure_space_def)
       
   381 
       
   382     hence "joint_distribution X Y {x} = 0"
       
   383       by (cases x) (auto simp: distribution_order) }
       
   384   note measure_0 = this
       
   385 
       
   386   show ?equality
       
   387     unfolding Let_def mutual_information_def
   362     unfolding Let_def mutual_information_def
   388     using measure_0 fms_P fms_P' MX MY P.absolutely_continuous_def
   363     by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
   389     by (subst P.KL_divergence_eq_finite)
   364        (auto simp add: pair_algebra_def setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
   390        (auto simp add: prod_measure_space_def prod_measure_times_finite
       
   391          finite_prob_space_eq setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
       
   392 
   365 
   393   show ?positive
   366   show ?positive
   394     unfolding Let_def mutual_information_def using measure_0 b_gt_1
   367     using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
   395   proof (safe intro!: finite_prob_space.KL_divergence_positive_finite, simp_all)
   368     unfolding mutual_information_def .
   396     have "?\<mu> (space ?P) = 1"
   369 qed
   397       using X.top Y.top X.measure_space_1 Y.measure_space_1 fms
   370 
   398       by (simp add: prod_measure_space_def X.finite_prod_measure_times)
   371 lemma (in information_space) mutual_information_eq:
   399     with fms_P show "finite_prob_space ?P ?\<mu>"
   372   assumes "simple_function X" "simple_function Y"
   400       by (simp add: finite_prob_space_eq)
   373   shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   401 
       
   402     from ms_X ms_Y X.top Y.top X.measure_space_1 Y.measure_space_1 Y.not_empty X_space Y_space
       
   403     have "?\<nu> (space ?P) = 1" unfolding measure_space_1[symmetric]
       
   404       by (auto intro!: arg_cong[where f="\<mu>"]
       
   405                simp add: prod_measure_space_def distribution_def vimage_Times comp_def)
       
   406     with fms_P' show "finite_prob_space ?P ?\<nu>"
       
   407       by (simp add: finite_prob_space_eq)
       
   408   qed
       
   409 qed
       
   410 
       
   411 lemma (in finite_information_space) mutual_information_eq:
       
   412   "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
       
   413     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
   374     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
   414                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
   375                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
   415   by (subst mutual_information_eq_generic) (simp_all add: finite_prob_space_of_images)
   376   using assms by (simp add: mutual_information_generic_eq)
   416 
   377 
   417 lemma (in finite_information_space) mutual_information_cong:
   378 lemma (in information_space) mutual_information_generic_cong:
   418   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   379   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   419   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   380   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   420   shows "\<I>(X ; Y) = \<I>(X' ; Y')"
   381   shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
   421 proof -
   382   unfolding mutual_information_def using X Y
   422   have "X ` space M = X' ` space M" using X by (auto intro!: image_eqI)
   383   by (simp cong: distribution_cong)
   423   moreover have "Y ` space M = Y' ` space M" using Y by (auto intro!: image_eqI)
   384 
   424   ultimately show ?thesis
   385 lemma (in information_space) mutual_information_cong:
   425   unfolding mutual_information_eq
   386   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   426     using
   387   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   427       assms[THEN distribution_cong]
   388   shows "\<I>(X; Y) = \<I>(X'; Y')"
   428       joint_distribution_cong[OF assms]
   389   unfolding mutual_information_def using X Y
   429     by (auto intro!: setsum_cong)
   390   by (simp cong: distribution_cong image_cong)
   430 qed
   391 
   431 
   392 lemma (in information_space) mutual_information_positive:
   432 lemma (in finite_information_space) mutual_information_positive: "0 \<le> \<I>(X;Y)"
   393   assumes "simple_function X" "simple_function Y"
   433   by (subst mutual_information_positive_generic) (simp_all add: finite_prob_space_of_images)
   394   shows "0 \<le> \<I>(X;Y)"
       
   395   using assms by (simp add: mutual_information_positive_generic)
   434 
   396 
   435 subsection {* Entropy *}
   397 subsection {* Entropy *}
   436 
   398 
   437 definition (in prob_space)
   399 abbreviation (in information_space)
   438   "entropy b s X = mutual_information b s s X X"
   400   entropy_Pow ("\<H>'(_')") where
   439 
       
   440 abbreviation (in finite_information_space)
       
   441   finite_entropy ("\<H>'(_')") where
       
   442   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
   401   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
   443 
   402 
   444 lemma (in finite_information_space) entropy_generic_eq:
   403 lemma (in information_space) entropy_generic_eq:
   445   assumes MX: "finite_measure_space MX (distribution X)"
   404   assumes MX: "finite_random_variable MX X"
   446   shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
   405   shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
   447 proof -
   406 proof -
       
   407   interpret MX: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
   448   let "?X x" = "real (distribution X {x})"
   408   let "?X x" = "real (distribution X {x})"
   449   let "?XX x y" = "real (joint_distribution X X {(x, y)})"
   409   let "?XX x y" = "real (joint_distribution X X {(x, y)})"
   450   interpret MX: finite_measure_space MX "distribution X" by fact
       
   451   { fix x y
   410   { fix x y
   452     have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   411     have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   453     then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
   412     then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
   454         (if x = y then - ?X y * log b (?X y) else 0)"
   413         (if x = y then - ?X y * log b (?X y) else 0)"
   455       unfolding distribution_def by (auto simp: mult_log_divide) }
   414       unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
   456   note remove_XX = this
   415   note remove_XX = this
   457   show ?thesis
   416   show ?thesis
   458     unfolding entropy_def mutual_information_generic_eq[OF MX MX]
   417     unfolding entropy_def mutual_information_generic_eq[OF MX MX]
   459     unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
   418     unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
   460     by (auto simp: setsum_cases MX.finite_space)
   419     by (auto simp: setsum_cases MX.finite_space)
   461 qed
   420 qed
   462 
   421 
   463 lemma (in finite_information_space) entropy_eq:
   422 lemma (in information_space) entropy_eq:
   464   "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
   423   assumes "simple_function X"
   465   by (simp add: finite_measure_space entropy_generic_eq)
   424   shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
   466 
   425   using assms by (simp add: entropy_generic_eq)
   467 lemma (in finite_information_space) entropy_positive: "0 \<le> \<H>(X)"
   426 
   468   unfolding entropy_def using mutual_information_positive .
   427 lemma (in information_space) entropy_positive:
   469 
   428   "simple_function X \<Longrightarrow> 0 \<le> \<H>(X)"
   470 lemma (in finite_information_space) entropy_certainty_eq_0:
   429   unfolding entropy_def by (simp add: mutual_information_positive)
   471   assumes "x \<in> X ` space M" and "distribution X {x} = 1"
   430 
       
   431 lemma (in information_space) entropy_certainty_eq_0:
       
   432   assumes "simple_function X" and "x \<in> X ` space M" and "distribution X {x} = 1"
   472   shows "\<H>(X) = 0"
   433   shows "\<H>(X) = 0"
   473 proof -
   434 proof -
   474   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
   435   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
   475     by (rule finite_prob_space_of_images)
   436     using simple_function_imp_finite_random_variable[OF `simple_function X`]
   476 
   437     by (rule distribution_finite_prob_space)
   477   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
   438   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
   478     using X.measure_compl[of "{x}"] assms by auto
   439     using X.measure_compl[of "{x}"] assms by auto
   479   also have "\<dots> = 0" using X.prob_space assms by auto
   440   also have "\<dots> = 0" using X.prob_space assms by auto
   480   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
   441   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
   481 
       
   482   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
   442   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
   483     hence "{y} \<subseteq> X ` space M - {x}" by auto
   443     hence "{y} \<subseteq> X ` space M - {x}" by auto
   484     from X.measure_mono[OF this] X0 asm
   444     from X.measure_mono[OF this] X0 asm
   485     have "distribution X {y} = 0" by auto }
   445     have "distribution X {y} = 0" by auto }
   486 
       
   487   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
   446   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
   488     using assms by auto
   447     using assms by auto
   489 
       
   490   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
   448   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
   491 
   449   show ?thesis unfolding entropy_eq[OF `simple_function X`] by (auto simp: y fi)
   492   show ?thesis unfolding entropy_eq by (auto simp: y fi)
   450 qed
   493 qed
   451 
   494 
   452 lemma (in information_space) entropy_le_card_not_0:
   495 lemma (in finite_information_space) entropy_le_card_not_0:
   453   assumes "simple_function X"
   496   "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
   454   shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
   497 proof -
   455 proof -
   498   let "?d x" = "distribution X {x}"
   456   let "?d x" = "distribution X {x}"
   499   let "?p x" = "real (?d x)"
   457   let "?p x" = "real (?d x)"
   500   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
   458   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
   501     by (auto intro!: setsum_cong simp: entropy_eq setsum_negf[symmetric])
   459     by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function X`] setsum_negf[symmetric] log_simps not_less)
   502   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
   460   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
   503     apply (rule log_setsum')
   461     apply (rule log_setsum')
   504     using not_empty b_gt_1 finite_space sum_over_space_real_distribution
   462     using not_empty b_gt_1 `simple_function X` sum_over_space_real_distribution
   505     by auto
   463     by (auto simp: simple_function_def)
   506   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
   464   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
   507     apply (rule arg_cong[where f="\<lambda>f. log b (\<Sum>x\<in>X`space M. f x)"])
   465     using distribution_finite[OF `simple_function X`[THEN simple_function_imp_random_variable], simplified]
   508     using distribution_finite[of X] by (auto simp: fun_eq_iff real_of_pinfreal_eq_0)
   466     by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pinfreal_eq_0)
   509   finally show ?thesis
   467   finally show ?thesis
   510     using finite_space by (auto simp: setsum_cases real_eq_of_nat)
   468     using `simple_function X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
   511 qed
   469 qed
   512 
   470 
   513 lemma (in finite_information_space) entropy_uniform_max:
   471 lemma (in information_space) entropy_uniform_max:
       
   472   assumes "simple_function X"
   514   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
   473   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
   515   shows "\<H>(X) = log b (real (card (X ` space M)))"
   474   shows "\<H>(X) = log b (real (card (X ` space M)))"
   516 proof -
   475 proof -
   517   note uniform =
   476   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
   518     finite_prob_space_of_images[of X, THEN finite_prob_space.uniform_prob, simplified]
   477     using simple_function_imp_finite_random_variable[OF `simple_function X`]
   519 
   478     by (rule distribution_finite_prob_space)
   520   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
   479   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
   521     using finite_space not_empty by auto
   480     using `simple_function X` not_empty by (auto simp: simple_function_def)
   522 
       
   523   { fix x assume "x \<in> X ` space M"
   481   { fix x assume "x \<in> X ` space M"
   524     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
   482     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
   525     proof (rule uniform)
   483     proof (rule X.uniform_prob[simplified])
   526       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
   484       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
   527       from assms[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
   485       from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
   528     qed }
   486     qed }
   529   thus ?thesis
   487   thus ?thesis
   530     using not_empty finite_space b_gt_1 card_gt0
   488     using not_empty X.finite_space b_gt_1 card_gt0
   531     by (simp add: entropy_eq real_eq_of_nat[symmetric] log_divide)
   489     by (simp add: entropy_eq[OF `simple_function X`] real_eq_of_nat[symmetric] log_simps)
   532 qed
   490 qed
   533 
   491 
   534 lemma (in finite_information_space) entropy_le_card:
   492 lemma (in information_space) entropy_le_card:
   535   "\<H>(X) \<le> log b (real (card (X ` space M)))"
   493   assumes "simple_function X"
       
   494   shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
   536 proof cases
   495 proof cases
   537   assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
   496   assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
   538   then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
   497   then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
   539   moreover
   498   moreover
   540   have "0 < card (X`space M)"
   499   have "0 < card (X`space M)"
   541     using finite_space not_empty unfolding card_gt_0_iff by auto
   500     using `simple_function X` not_empty
       
   501     by (auto simp: card_gt_0_iff simple_function_def)
   542   then have "log b 1 \<le> log b (real (card (X`space M)))"
   502   then have "log b 1 \<le> log b (real (card (X`space M)))"
   543     using b_gt_1 by (intro log_le) auto
   503     using b_gt_1 by (intro log_le) auto
   544   ultimately show ?thesis unfolding entropy_eq by simp
   504   ultimately show ?thesis using assms by (simp add: entropy_eq)
   545 next
   505 next
   546   assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
   506   assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
   547   have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
   507   have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
   548     (is "?A \<le> ?B") using finite_space not_empty by (auto intro!: card_mono)
   508     (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
   549   note entropy_le_card_not_0
   509   note entropy_le_card_not_0[OF assms]
   550   also have "log b (real ?A) \<le> log b (real ?B)"
   510   also have "log b (real ?A) \<le> log b (real ?B)"
   551     using b_gt_1 False finite_space not_empty `?A \<le> ?B`
   511     using b_gt_1 False not_empty `?A \<le> ?B` assms
   552     by (auto intro!: log_le simp: card_gt_0_iff)
   512     by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
   553   finally show ?thesis .
   513   finally show ?thesis .
   554 qed
   514 qed
   555 
   515 
   556 lemma (in finite_information_space) entropy_commute:
   516 lemma (in information_space) entropy_commute:
   557   "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
   517   assumes "simple_function X" "simple_function Y"
   558 proof -
   518   shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
       
   519 proof -
       
   520   have sf: "simple_function (\<lambda>x. (X x, Y x))" "simple_function (\<lambda>x. (Y x, X x))"
       
   521     using assms by (auto intro: simple_function_Pair)
   559   have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
   522   have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
   560     by auto
   523     by auto
   561   have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
   524   have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
   562     by (auto intro!: inj_onI)
   525     by (auto intro!: inj_onI)
   563   show ?thesis
   526   show ?thesis
   564     unfolding entropy_eq unfolding * setsum_reindex[OF inj]
   527     unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
   565     by (simp add: joint_distribution_commute[of Y X] split_beta)
   528     by (simp add: joint_distribution_commute[of Y X] split_beta)
   566 qed
   529 qed
   567 
   530 
   568 lemma (in finite_information_space) entropy_eq_cartesian_sum:
   531 lemma (in information_space) entropy_eq_cartesian_product:
   569   "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   532   assumes "simple_function X" "simple_function Y"
       
   533   shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   570     real (joint_distribution X Y {(x,y)}) *
   534     real (joint_distribution X Y {(x,y)}) *
   571     log b (real (joint_distribution X Y {(x,y)})))"
   535     log b (real (joint_distribution X Y {(x,y)})))"
   572 proof -
   536 proof -
       
   537   have sf: "simple_function (\<lambda>x. (X x, Y x))"
       
   538     using assms by (auto intro: simple_function_Pair)
   573   { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
   539   { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
   574     then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
   540     then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
   575     then have "joint_distribution X Y {x} = 0"
   541     then have "joint_distribution X Y {x} = 0"
   576       unfolding distribution_def by auto }
   542       unfolding distribution_def by auto }
   577   then show ?thesis using finite_space
   543   then show ?thesis using sf assms
   578     unfolding entropy_eq neg_equal_iff_equal setsum_cartesian_product
   544     unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
   579     by (auto intro!: setsum_mono_zero_cong_left)
   545     by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
   580 qed
   546 qed
   581 
   547 
   582 subsection {* Conditional Mutual Information *}
   548 subsection {* Conditional Mutual Information *}
   583 
   549 
   584 definition (in prob_space)
   550 definition (in prob_space)
   585   "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
   551   "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
   586     mutual_information b M1 (prod_measure_space M2 M3) X (\<lambda>x. (Y x, Z x)) -
   552     mutual_information b M1 (sigma (pair_algebra M2 M3)) X (\<lambda>x. (Y x, Z x)) -
   587     mutual_information b M1 M3 X Z"
   553     mutual_information b M1 M3 X Z"
   588 
   554 
   589 abbreviation (in finite_information_space)
   555 abbreviation (in information_space)
   590   finite_conditional_mutual_information ("\<I>'( _ ; _ | _ ')") where
   556   conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
   591   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   557   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   592     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   558     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   593     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
   559     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
   594     \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
   560     \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
   595     X Y Z"
   561     X Y Z"
   596 
   562 
   597 lemma (in finite_information_space) conditional_mutual_information_generic_eq:
   563 
   598   assumes MX: "finite_measure_space MX (distribution X)"
   564 lemma (in information_space) conditional_mutual_information_generic_eq:
   599   assumes MY: "finite_measure_space MY (distribution Y)"
   565   assumes MX: "finite_random_variable MX X"
   600   assumes MZ: "finite_measure_space MZ (distribution Z)"
   566     and MY: "finite_random_variable MY Y"
   601   shows "conditional_mutual_information b MX MY MZ X Y Z =
   567     and MZ: "finite_random_variable MZ Z"
   602     (\<Sum>(x, y, z)\<in>space MX \<times> space MY \<times> space MZ.
   568   shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
   603       real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) *
       
   604       log b (real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) /
       
   605                    (real (distribution X {x}) * real (joint_distribution Y Z {(y, z)})))) -
       
   606     (\<Sum>(x, y)\<in>space MX \<times> space MZ.
       
   607       real (joint_distribution X Z {(x, y)}) *
       
   608       log b (real (joint_distribution X Z {(x, y)}) / (real (distribution X {x}) * real (distribution Z {y}))))"
       
   609   using assms finite_measure_space_prod[OF MY MZ]
       
   610   unfolding conditional_mutual_information_def
       
   611   by (subst (1 2) mutual_information_generic_eq)
       
   612      (simp_all add: setsum_cartesian_product' finite_measure_space.finite_prod_measure_space)
       
   613 
       
   614 
       
   615 lemma (in finite_information_space) conditional_mutual_information_eq:
       
   616   "\<I>(X ; Y | Z) = (\<Sum>(x, y, z) \<in> X ` space M \<times> Y ` space M \<times> Z ` space M.
       
   617              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   569              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   618              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   570              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   619     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   571     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   620   by (subst conditional_mutual_information_generic_eq)
   572   (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
   621      (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space
   573 proof -
   622       finite_measure_space finite_product_prob_space_of_images sigma_def
   574   let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
   623       setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
   575   let ?X = "\<lambda>x. real (distribution X {x})"
   624       setsum_left_distrib[symmetric] setsum_real_distribution setsum_commute[where A="Y`space M"]
   576   let ?Z = "\<lambda>z. real (distribution Z {z})"
   625       real_of_pinfreal_mult[symmetric]
   577 
   626     cong: setsum_cong)
   578   txt {* This proof is actually quiet easy, however we need to show that the
   627 
   579     distributions are finite and the joint distributions are zero when one of
   628 lemma (in finite_information_space) conditional_mutual_information_eq_mutual_information:
   580     the variables distribution is also zero. *}
   629   "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   581 
       
   582   note finite_var = MX MY MZ
       
   583   note random_var = finite_var[THEN finite_random_variableD]
       
   584 
       
   585   note space_simps = space_pair_algebra space_sigma algebra.simps
       
   586 
       
   587   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
       
   588   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
       
   589   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
       
   590   note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
       
   591   note order1 =
       
   592     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
       
   593     finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]
       
   594 
       
   595   note finite = finite_var(1) YZ finite_var(3) XZ YZX
       
   596   note finite[THEN finite_distribution_finite, simplified space_simps, simp]
       
   597 
       
   598   have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
       
   599           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
       
   600     unfolding joint_distribution_commute_singleton[of X]
       
   601     unfolding joint_distribution_assoc_singleton[symmetric]
       
   602     using finite_distribution_order(6)[OF finite_var(2) ZX]
       
   603     by (auto simp: space_simps)
       
   604 
       
   605   have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
       
   606     (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
       
   607     (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
       
   608   proof (safe intro!: setsum_cong)
       
   609     fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
       
   610     then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
       
   611       (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
       
   612       using order1(3)
       
   613       by (auto simp: real_of_pinfreal_mult[symmetric] real_of_pinfreal_eq_0)
       
   614     show "?L x y z = ?R x y z"
       
   615     proof cases
       
   616       assume "?XYZ x y z \<noteq> 0"
       
   617       with space b_gt_1 order1 order2 show ?thesis unfolding *
       
   618         by (subst log_divide)
       
   619            (auto simp: zero_less_divide_iff zero_less_real_of_pinfreal
       
   620                        real_of_pinfreal_eq_0 zero_less_mult_iff)
       
   621     qed simp
       
   622   qed
       
   623   also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
       
   624                   (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
       
   625     by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
       
   626   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
       
   627              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
       
   628     unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
       
   629               setsum_left_distrib[symmetric]
       
   630     unfolding joint_distribution_commute_singleton[of X]
       
   631     unfolding joint_distribution_assoc_singleton[symmetric]
       
   632     using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
       
   633     by (intro setsum_cong refl) simp
       
   634   also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
       
   635              (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
       
   636              conditional_mutual_information b MX MY MZ X Y Z"
       
   637     unfolding conditional_mutual_information_def
       
   638     unfolding mutual_information_generic_eq[OF finite_var(1,3)]
       
   639     unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
       
   640     by (simp add: space_sigma space_pair_algebra setsum_cartesian_product')
       
   641   finally show ?thesis by simp
       
   642 qed
       
   643 
       
   644 lemma (in information_space) conditional_mutual_information_eq:
       
   645   assumes "simple_function X" "simple_function Y" "simple_function Z"
       
   646   shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
       
   647              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
       
   648              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
       
   649     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
       
   650   using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
       
   651   by simp
       
   652 
       
   653 lemma (in information_space) conditional_mutual_information_eq_mutual_information:
       
   654   assumes X: "simple_function X" and Y: "simple_function Y"
       
   655   shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   630 proof -
   656 proof -
   631   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   657   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   632 
   658   have C: "simple_function (\<lambda>x. ())" by auto
   633   show ?thesis
   659   show ?thesis
   634     unfolding conditional_mutual_information_eq mutual_information_eq
   660     unfolding conditional_mutual_information_eq[OF X Y C]
       
   661     unfolding mutual_information_eq[OF X Y]
   635     by (simp add: setsum_cartesian_product' distribution_remove_const)
   662     by (simp add: setsum_cartesian_product' distribution_remove_const)
   636 qed
   663 qed
   637 
   664 
   638 lemma (in finite_information_space) conditional_mutual_information_positive:
   665 lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
   639   "0 \<le> \<I>(X ; Y | Z)"
   666   unfolding distribution_def using measure_space_1 by auto
   640 proof -
   667 
       
   668 lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
       
   669   unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])
       
   670 
       
   671 lemma (in prob_space) setsum_distribution:
       
   672   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
       
   673   using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
       
   674   using sigma_algebra_Pow[of "UNIV::unit set"] by simp
       
   675 
       
   676 lemma (in prob_space) setsum_real_distribution:
       
   677   assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
       
   678   using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
       
   679   using sigma_algebra_Pow[of "UNIV::unit set"] by simp
       
   680 
       
   681 lemma (in information_space) conditional_mutual_information_generic_positive:
       
   682   assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
       
   683   shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
       
   684 proof (cases "space MX \<times> space MY \<times> space MZ = {}")
       
   685   case True show ?thesis
       
   686     unfolding conditional_mutual_information_generic_eq[OF assms] True
       
   687     by simp
       
   688 next
       
   689   case False
   641   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
   690   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
   642   let "?dXZ A" = "real (joint_distribution X Z A)"
   691   let "?dXZ A" = "real (joint_distribution X Z A)"
   643   let "?dYZ A" = "real (joint_distribution Y Z A)"
   692   let "?dYZ A" = "real (joint_distribution Y Z A)"
   644   let "?dX A" = "real (distribution X A)"
   693   let "?dX A" = "real (distribution X A)"
   645   let "?dZ A" = "real (distribution Z A)"
   694   let "?dZ A" = "real (distribution Z A)"
   646   let ?M = "X ` space M \<times> Y ` space M \<times> Z ` space M"
   695   let ?M = "space MX \<times> space MY \<times> space MZ"
   647 
   696 
   648   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
   697   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
   649 
   698 
   650   have "- (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
   699   note space_simps = space_pair_algebra space_sigma algebra.simps
   651     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))
   700 
   652     \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
   701   note finite_var = assms
       
   702   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
       
   703   note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
       
   704   note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
       
   705   note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
       
   706   note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
       
   707   note finite = finite_var(3) YZ XZ XYZ
       
   708   note finite = finite[THEN finite_distribution_finite, simplified space_simps]
       
   709 
       
   710   have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
       
   711           \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
       
   712     unfolding joint_distribution_commute_singleton[of X]
       
   713     unfolding joint_distribution_assoc_singleton[symmetric]
       
   714     using finite_distribution_order(6)[OF finite_var(2) ZX]
       
   715     by (auto simp: space_simps)
       
   716 
       
   717   note order = order
       
   718     finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
       
   719     finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]
       
   720 
       
   721   have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
       
   722     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
       
   723     unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
       
   724     by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pinfreal_mult[symmetric])
       
   725   also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
   653     unfolding split_beta
   726     unfolding split_beta
   654   proof (rule log_setsum_divide)
   727   proof (rule log_setsum_divide)
   655     show "?M \<noteq> {}" using not_empty by simp
   728     show "?M \<noteq> {}" using False by simp
   656     show "1 < b" using b_gt_1 .
   729     show "1 < b" using b_gt_1 .
       
   730 
       
   731     show "finite ?M" using assms
       
   732       unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto
       
   733 
       
   734     show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
       
   735       unfolding setsum_cartesian_product'
       
   736       unfolding setsum_commute[of _ "space MY"]
       
   737       unfolding setsum_commute[of _ "space MZ"]
       
   738       by (simp_all add: space_pair_algebra
       
   739         setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
       
   740         setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
       
   741         setsum_real_distribution[OF `finite_random_variable MZ Z`])
   657 
   742 
   658     fix x assume "x \<in> ?M"
   743     fix x assume "x \<in> ?M"
   659     let ?x = "(fst x, fst (snd x), snd (snd x))"
   744     let ?x = "(fst x, fst (snd x), snd (snd x))"
   660 
   745 
   661     show "0 \<le> ?dXYZ {?x}" using real_pinfreal_nonneg .
   746     show "0 \<le> ?dXYZ {?x}" using real_pinfreal_nonneg .
   662     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   747     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   663      by (simp add: real_pinfreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
   748      by (simp add: real_pinfreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
   664 
   749 
   665     assume *: "0 < ?dXYZ {?x}"
   750     assume *: "0 < ?dXYZ {?x}"
   666     thus "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   751     with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   667       apply (rule_tac divide_pos_pos mult_pos_pos)+
   752       using finite order
   668       by (auto simp add: real_distribution_gt_0 intro: distribution_order(6) distribution_mono_gt_0)
   753       by (cases x)
   669   qed (simp_all add: setsum_cartesian_product' sum_over_space_real_distribution setsum_real_distribution finite_space)
   754          (auto simp add: zero_less_real_of_pinfreal zero_less_mult_iff zero_less_divide_iff)
   670   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>Z`space M. ?dZ {z})"
   755   qed
       
   756   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
   671     apply (simp add: setsum_cartesian_product')
   757     apply (simp add: setsum_cartesian_product')
   672     apply (subst setsum_commute)
   758     apply (subst setsum_commute)
   673     apply (subst (2) setsum_commute)
   759     apply (subst (2) setsum_commute)
   674     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] setsum_real_distribution
   760     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
       
   761                    setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
       
   762                    setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
   675           intro!: setsum_cong)
   763           intro!: setsum_cong)
   676   finally show ?thesis
   764   also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
   677     unfolding conditional_mutual_information_eq sum_over_space_real_distribution
   765     unfolding setsum_real_distribution[OF finite_var(3)] by simp
   678     by (simp add: real_of_pinfreal_mult[symmetric])
   766   finally show ?thesis by simp
   679 qed
   767 qed
       
   768 
       
   769 lemma (in information_space) conditional_mutual_information_positive:
       
   770   assumes "simple_function X" and "simple_function Y" and "simple_function Z"
       
   771   shows "0 \<le> \<I>(X;Y|Z)"
       
   772   using conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]]
       
   773   by simp
   680 
   774 
   681 subsection {* Conditional Entropy *}
   775 subsection {* Conditional Entropy *}
   682 
   776 
   683 definition (in prob_space)
   777 definition (in prob_space)
   684   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
   778   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
   685 
   779 
   686 abbreviation (in finite_information_space)
   780 abbreviation (in information_space)
   687   finite_conditional_entropy ("\<H>'(_ | _')") where
   781   conditional_entropy_Pow ("\<H>'(_ | _')") where
   688   "\<H>(X | Y) \<equiv> conditional_entropy b
   782   "\<H>(X | Y) \<equiv> conditional_entropy b
   689     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   783     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   690     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   784     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   691 
   785 
   692 lemma (in finite_information_space) conditional_entropy_positive:
   786 lemma (in information_space) conditional_entropy_positive:
   693   "0 \<le> \<H>(X | Y)" unfolding conditional_entropy_def using conditional_mutual_information_positive .
   787   "simple_function X \<Longrightarrow> simple_function Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
   694 
   788   unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)
   695 lemma (in finite_information_space) conditional_entropy_generic_eq:
   789 
   696   assumes MX: "finite_measure_space MX (distribution X)"
   790 lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp
   697   assumes MY: "finite_measure_space MZ (distribution Z)"
   791 
       
   792 lemma (in information_space) conditional_entropy_generic_eq:
       
   793   assumes MX: "finite_random_variable MX X"
       
   794   assumes MZ: "finite_random_variable MZ Z"
   698   shows "conditional_entropy b MX MZ X Z =
   795   shows "conditional_entropy b MX MZ X Z =
   699      - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
   796      - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
   700          real (joint_distribution X Z {(x, z)}) *
   797          real (joint_distribution X Z {(x, z)}) *
   701          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   798          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   702   unfolding conditional_entropy_def using assms
   799 proof -
   703   apply (simp add: conditional_mutual_information_generic_eq
   800   interpret MX: finite_sigma_algebra MX using MX by simp
   704                    setsum_cartesian_product' setsum_commute[of _ "space MZ"]
   801   interpret MZ: finite_sigma_algebra MZ using MZ by simp
   705                    setsum_negf[symmetric] setsum_subtractf[symmetric])
   802   let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
   706 proof (safe intro!: setsum_cong, simp)
   803   let "?XZ x z" = "joint_distribution X Z {(x, z)}"
   707   fix z x assume "z \<in> space MZ" "x \<in> space MX"
   804   let "?Z z" = "distribution Z {z}"
   708   let "?XXZ x'" = "real (joint_distribution X (\<lambda>x. (X x, Z x)) {(x, x', z)})"
   805   let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
   709   let "?XZ x'" = "real (joint_distribution X Z {(x', z)})"
   806   { fix x z have "?XXZ x x z = ?XZ x z"
   710   let "?X" = "real (distribution X {x})"
   807       unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
   711   interpret MX: finite_measure_space MX "distribution X" by fact
   808   note this[simp]
   712   have *: "\<And>A. A = {} \<Longrightarrow> prob A = 0" by simp
   809   { fix x x' :: 'b and z assume "x' \<noteq> x"
   713   have XXZ: "\<And>x'. ?XXZ x' = (if x' = x then ?XZ x else 0)"
   810     then have "?XXZ x x' z = 0"
   714     by (auto simp: distribution_def intro!: arg_cong[where f=prob] *)
   811       by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
   715   have "(\<Sum>x'\<in>space MX. ?XXZ x' * log b (?XXZ x') - (?XXZ x' * log b ?X + ?XXZ x' * log b (?XZ x'))) =
   812   note this[simp]
   716     (\<Sum>x'\<in>{x}. ?XZ x' * log b (?XZ x') - (?XZ x' * log b ?X + ?XZ x' * log b (?XZ x')))"
   813   { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
   717     using `x \<in> space MX` MX.finite_space
   814     then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
   718     by (safe intro!: setsum_mono_zero_cong_right)
   815       = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
   719        (auto split: split_if_asm simp: XXZ)
   816       by (auto intro!: setsum_cong)
   720   then show "(\<Sum>x'\<in>space MX. ?XXZ x' * log b (?XXZ x') - (?XXZ x' * log b ?X + ?XXZ x' * log b (?XZ x'))) +
   817     also have "\<dots> = real (?XZ x z) * ?f x x z"
   721       ?XZ x * log b ?X = 0" by simp
   818       using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
   722 qed
   819     also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
   723 
   820       by (auto simp: real_of_pinfreal_mult[symmetric])
   724 lemma (in finite_information_space) conditional_entropy_eq:
   821     also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
   725   "\<H>(X | Z) =
   822       using assms[THEN finite_distribution_finite]
       
   823       using finite_distribution_order(6)[OF MX MZ]
       
   824       by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pinfreal real_of_pinfreal_eq_0)
       
   825     finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
       
   826       - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
       
   827   note * = this
       
   828 
       
   829   show ?thesis
       
   830     unfolding conditional_entropy_def
       
   831     unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
       
   832     by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
       
   833                    setsum_commute[of _ "space MZ"] *   simp del: divide_pinfreal_def
       
   834              intro!: setsum_cong)
       
   835 qed
       
   836 
       
   837 lemma (in information_space) conditional_entropy_eq:
       
   838   assumes "simple_function X" "simple_function Z"
       
   839   shows "\<H>(X | Z) =
   726      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   840      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   727          real (joint_distribution X Z {(x, z)}) *
   841          real (joint_distribution X Z {(x, z)}) *
   728          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   842          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   729   by (simp add: finite_measure_space conditional_entropy_generic_eq)
   843   using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
   730 
   844   by simp
   731 lemma (in finite_information_space) conditional_entropy_eq_ce_with_hypothesis:
   845 
   732   "\<H>(X | Y) =
   846 lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
       
   847   assumes X: "simple_function X" and Y: "simple_function Y"
       
   848   shows "\<H>(X | Y) =
   733     -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
   849     -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
   734       (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
   850       (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
   735               log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
   851               log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
   736   unfolding conditional_entropy_eq neg_equal_iff_equal
   852   unfolding conditional_entropy_eq[OF assms]
   737   apply (simp add: setsum_commute[of _ "Y`space M"] setsum_cartesian_product' setsum_divide_distrib[symmetric])
   853   using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
   738   apply (safe intro!: setsum_cong)
   854   using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
   739   using real_distribution_order'[of Y _ X _]
   855   using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
   740   by (auto simp add: setsum_subtractf[of _ _ "X`space M"])
   856   by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pinfreal_eq_0
   741 
   857            intro!: setsum_cong)
   742 lemma (in finite_information_space) conditional_entropy_eq_cartesian_sum:
   858 
   743   "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   859 lemma (in information_space) conditional_entropy_eq_cartesian_product:
       
   860   assumes "simple_function X" "simple_function Y"
       
   861   shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
   744     real (joint_distribution X Y {(x,y)}) *
   862     real (joint_distribution X Y {(x,y)}) *
   745     log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
   863     log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
   746 proof -
   864   unfolding conditional_entropy_eq[OF assms]
   747   { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
   865   by (auto intro!: setsum_cong simp: setsum_cartesian_product')
   748     then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
       
   749     then have "joint_distribution X Y {x} = 0"
       
   750       unfolding distribution_def by auto }
       
   751   then show ?thesis using finite_space
       
   752     unfolding conditional_entropy_eq neg_equal_iff_equal setsum_cartesian_product
       
   753     by (auto intro!: setsum_mono_zero_cong_left)
       
   754 qed
       
   755 
   866 
   756 subsection {* Equalities *}
   867 subsection {* Equalities *}
   757 
   868 
   758 lemma (in finite_information_space) mutual_information_eq_entropy_conditional_entropy:
   869 lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
   759   "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
   870   assumes X: "simple_function X" and Z: "simple_function Z"
   760   unfolding mutual_information_eq entropy_eq conditional_entropy_eq
   871   shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
   761   using finite_space
   872 proof -
   762   by (auto simp add: setsum_addf setsum_subtractf setsum_cartesian_product'
   873   let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
   763       setsum_left_distrib[symmetric] setsum_addf setsum_real_distribution)
   874   let "?Z z" = "real (distribution Z {z})"
   764 
   875   let "?X x" = "real (distribution X {x})"
   765 lemma (in finite_information_space) conditional_entropy_less_eq_entropy:
   876   note fX = X[THEN simple_function_imp_finite_random_variable]
   766   "\<H>(X | Z) \<le> \<H>(X)"
   877   note fZ = Z[THEN simple_function_imp_finite_random_variable]
   767 proof -
   878   note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
   768   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy .
   879   note finite_distribution_order[OF fX fZ, simp]
   769   with mutual_information_positive[of X Z] entropy_positive[of X]
   880   { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
       
   881     have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
       
   882           ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
       
   883       by (auto simp: log_simps real_of_pinfreal_mult[symmetric] zero_less_mult_iff
       
   884                      zero_less_real_of_pinfreal field_simps real_of_pinfreal_eq_0 abs_mult) }
       
   885   note * = this
       
   886   show ?thesis
       
   887     unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
       
   888     using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
       
   889     by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
       
   890                      setsum_real_distribution)
       
   891 qed
       
   892 
       
   893 lemma (in information_space) conditional_entropy_less_eq_entropy:
       
   894   assumes X: "simple_function X" and Z: "simple_function Z"
       
   895   shows "\<H>(X | Z) \<le> \<H>(X)"
       
   896 proof -
       
   897   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
       
   898   with mutual_information_positive[OF X Z] entropy_positive[OF X]
   770   show ?thesis by auto
   899   show ?thesis by auto
   771 qed
   900 qed
   772 
   901 
   773 lemma (in finite_information_space) entropy_chain_rule:
   902 lemma (in information_space) entropy_chain_rule:
   774   "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
   903   assumes X: "simple_function X" and Y: "simple_function Y"
   775   unfolding entropy_eq[of X] entropy_eq_cartesian_sum conditional_entropy_eq_cartesian_sum
   904   shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
   776   unfolding setsum_commute[of _ "X`space M"] setsum_negf[symmetric] setsum_addf[symmetric]
   905 proof -
   777   by (rule setsum_cong)
   906   let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
   778      (simp_all add: setsum_negf setsum_addf setsum_subtractf setsum_real_distribution
   907   let "?Y y" = "real (distribution Y {y})"
   779                     setsum_left_distrib[symmetric] joint_distribution_commute[of X Y])
   908   let "?X x" = "real (distribution X {x})"
       
   909   note fX = X[THEN simple_function_imp_finite_random_variable]
       
   910   note fY = Y[THEN simple_function_imp_finite_random_variable]
       
   911   note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
       
   912   note finite_distribution_order[OF fX fY, simp]
       
   913   { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
       
   914     have "?XY x y * log b (?XY x y / ?X x) =
       
   915           ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
       
   916       by (auto simp: log_simps real_of_pinfreal_mult[symmetric] zero_less_mult_iff
       
   917                      zero_less_real_of_pinfreal field_simps real_of_pinfreal_eq_0 abs_mult) }
       
   918   note * = this
       
   919   show ?thesis
       
   920     using setsum_real_joint_distribution_singleton[OF fY fX]
       
   921     unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
       
   922     unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
       
   923     by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
       
   924 qed
   780 
   925 
   781 section {* Partitioning *}
   926 section {* Partitioning *}
   782 
   927 
   783 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
   928 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
   784 
   929 
   891     apply (subst setsum_Sigma[symmetric])
  1036     apply (subst setsum_Sigma[symmetric])
   892     by (auto intro!: finite_subset[of _ "f`A"])
  1037     by (auto intro!: finite_subset[of _ "f`A"])
   893   finally show ?thesis .
  1038   finally show ?thesis .
   894 qed
  1039 qed
   895 
  1040 
   896 lemma (in finite_information_space) entropy_partition:
  1041 lemma (in information_space) entropy_partition:
       
  1042   assumes sf: "simple_function X" "simple_function P"
   897   assumes svi: "subvimage (space M) X P"
  1043   assumes svi: "subvimage (space M) X P"
   898   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
  1044   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
   899 proof -
  1045 proof -
       
  1046   let "?XP x p" = "real (joint_distribution X P {(x, p)})"
       
  1047   let "?X x" = "real (distribution X {x})"
       
  1048   let "?P p" = "real (distribution P {p})"
       
  1049   note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
       
  1050   note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
       
  1051   note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
       
  1052   note finite_distribution_order[OF fX fP, simp]
   900   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
  1053   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
   901     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
  1054     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
   902     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
  1055     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
   903   proof (subst setsum_image_split[OF svi],
  1056   proof (subst setsum_image_split[OF svi],
   904       safe intro!: finite_imageI finite_space setsum_mono_zero_cong_left imageI)
  1057       safe intro!: setsum_mono_zero_cong_left imageI)
       
  1058     show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
       
  1059       using sf unfolding simple_function_def by auto
       
  1060   next
   905     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
  1061     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
   906     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
  1062     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
   907     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
  1063     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
   908     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
  1064     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
   909     show "x \<in> P -` {P p}" by auto
  1065     show "x \<in> P -` {P p}" by auto
   918     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
  1074     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
   919           real (joint_distribution X P {(X x, P p)}) *
  1075           real (joint_distribution X P {(X x, P p)}) *
   920           log b (real (joint_distribution X P {(X x, P p)}))"
  1076           log b (real (joint_distribution X P {(X x, P p)}))"
   921       by (auto simp: distribution_def)
  1077       by (auto simp: distribution_def)
   922   qed
  1078   qed
   923   thus ?thesis
  1079   moreover have "\<And>x y. real (joint_distribution X P {(x, y)}) *
   924   unfolding entropy_eq conditional_entropy_eq
  1080       log b (real (joint_distribution X P {(x, y)}) / real (distribution P {y})) =
       
  1081       real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})) -
       
  1082       real (joint_distribution X P {(x, y)}) * log b (real (distribution P {y}))"
       
  1083     by (auto simp add: log_simps zero_less_mult_iff field_simps)
       
  1084   ultimately show ?thesis
       
  1085     unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
       
  1086     using setsum_real_joint_distribution_singleton[OF fX fP]
   925     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
  1087     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
   926       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
  1088       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
   927 qed
  1089 qed
   928 
  1090 
   929 corollary (in finite_information_space) entropy_data_processing:
  1091 corollary (in information_space) entropy_data_processing:
   930   "\<H>(f \<circ> X) \<le> \<H>(X)"
  1092   assumes X: "simple_function X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
   931   by (subst (2) entropy_partition[of _ "f \<circ> X"]) (auto intro: conditional_entropy_positive)
  1093 proof -
   932 
  1094   note X
   933 corollary (in finite_information_space) entropy_of_inj:
  1095   moreover have fX: "simple_function (f \<circ> X)" using X by auto
   934   assumes "inj_on f (X`space M)"
  1096   moreover have "subvimage (space M) X (f \<circ> X)" by auto
       
  1097   ultimately have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
       
  1098   then show "\<H>(f \<circ> X) \<le> \<H>(X)"
       
  1099     by (auto intro: conditional_entropy_positive[OF X fX])
       
  1100 qed
       
  1101 
       
  1102 corollary (in information_space) entropy_of_inj:
       
  1103   assumes X: "simple_function X" and inj: "inj_on f (X`space M)"
   935   shows "\<H>(f \<circ> X) = \<H>(X)"
  1104   shows "\<H>(f \<circ> X) = \<H>(X)"
   936 proof (rule antisym)
  1105 proof (rule antisym)
   937   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing .
  1106   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing[OF X] .
   938 next
  1107 next
       
  1108   have sf: "simple_function (f \<circ> X)"
       
  1109     using X by auto
   939   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1110   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
   940     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF assms])
  1111     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
   941   also have "... \<le> \<H>(f \<circ> X)"
  1112   also have "... \<le> \<H>(f \<circ> X)"
   942     using entropy_data_processing .
  1113     using entropy_data_processing[OF sf] .
   943   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1114   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
   944 qed
  1115 qed
   945 
  1116 
   946 end
  1117 end