src/HOL/Probability/Information.thy
 author hoelzl Thu Sep 02 17:12:40 2010 +0200 (2010-09-02) changeset 39092 98de40859858 parent 38656 d5d342611edb child 39097 943c7b348524 permissions -rw-r--r--
move lemmas to correct theory files
```     1 theory Information
```
```     2 imports Probability_Space Product_Measure Convex Radon_Nikodym
```
```     3 begin
```
```     4
```
```     5 lemma real_of_pinfreal_inverse[simp]:
```
```     6   fixes X :: pinfreal
```
```     7   shows "real (inverse X) = 1 / real X"
```
```     8   by (cases X) (auto simp: inverse_eq_divide)
```
```     9
```
```    10 section "Convex theory"
```
```    11
```
```    12 lemma log_setsum:
```
```    13   assumes "finite s" "s \<noteq> {}"
```
```    14   assumes "b > 1"
```
```    15   assumes "(\<Sum> i \<in> s. a i) = 1"
```
```    16   assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
```
```    17   assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
```
```    18   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
```
```    19 proof -
```
```    20   have "convex_on {0 <..} (\<lambda> x. - log b x)"
```
```    21     by (rule minus_log_convex[OF `b > 1`])
```
```    22   hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
```
```    23     using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
```
```    24   thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
```
```    25 qed
```
```    26
```
```    27 lemma log_setsum':
```
```    28   assumes "finite s" "s \<noteq> {}"
```
```    29   assumes "b > 1"
```
```    30   assumes "(\<Sum> i \<in> s. a i) = 1"
```
```    31   assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
```
```    32           "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
```
```    33   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
```
```    34 proof -
```
```    35   have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
```
```    36     using assms by (auto intro!: setsum_mono_zero_cong_left)
```
```    37   moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
```
```    38   proof (rule log_setsum)
```
```    39     have "setsum a (s - {i. a i = 0}) = setsum a s"
```
```    40       using assms(1) by (rule setsum_mono_zero_cong_left) auto
```
```    41     thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
```
```    42       "finite (s - {i. a i = 0})" using assms by simp_all
```
```    43
```
```    44     show "s - {i. a i = 0} \<noteq> {}"
```
```    45     proof
```
```    46       assume *: "s - {i. a i = 0} = {}"
```
```    47       hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
```
```    48       with sum_1 show False by simp
```
```    49     qed
```
```    50
```
```    51     fix i assume "i \<in> s - {i. a i = 0}"
```
```    52     hence "i \<in> s" "a i \<noteq> 0" by simp_all
```
```    53     thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
```
```    54   qed fact+
```
```    55   ultimately show ?thesis by simp
```
```    56 qed
```
```    57
```
```    58 lemma log_setsum_divide:
```
```    59   assumes "finite S" and "S \<noteq> {}" and "1 < b"
```
```    60   assumes "(\<Sum>x\<in>S. g x) = 1"
```
```    61   assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
```
```    62   assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
```
```    63   shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
```
```    64 proof -
```
```    65   have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
```
```    66     using `1 < b` by (subst log_le_cancel_iff) auto
```
```    67
```
```    68   have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
```
```    69   proof (unfold setsum_negf[symmetric], rule setsum_cong)
```
```    70     fix x assume x: "x \<in> S"
```
```    71     show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
```
```    72     proof (cases "g x = 0")
```
```    73       case False
```
```    74       with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
```
```    75       thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
```
```    76     qed simp
```
```    77   qed rule
```
```    78   also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
```
```    79   proof (rule log_setsum')
```
```    80     fix x assume x: "x \<in> S" "0 < g x"
```
```    81     with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
```
```    82   qed fact+
```
```    83   also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
```
```    84     by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
```
```    85         split: split_if_asm)
```
```    86   also have "... \<le> log b (\<Sum>x\<in>S. f x)"
```
```    87   proof (rule log_mono)
```
```    88     have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
```
```    89     also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
```
```    90     proof (rule setsum_strict_mono)
```
```    91       show "finite (S - {x. g x = 0})" using `finite S` by simp
```
```    92       show "S - {x. g x = 0} \<noteq> {}"
```
```    93       proof
```
```    94         assume "S - {x. g x = 0} = {}"
```
```    95         hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
```
```    96         with `(\<Sum>x\<in>S. g x) = 1` show False by simp
```
```    97       qed
```
```    98       fix x assume "x \<in> S - {x. g x = 0}"
```
```    99       thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
```
```   100     qed
```
```   101     finally show "0 < ?sum" .
```
```   102     show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
```
```   103       using `finite S` pos by (auto intro!: setsum_mono2)
```
```   104   qed
```
```   105   finally show ?thesis .
```
```   106 qed
```
```   107
```
```   108 lemma (in finite_prob_space) sum_over_space_distrib:
```
```   109   "(\<Sum>x\<in>X`space M. distribution X {x}) = 1"
```
```   110   unfolding distribution_def measure_space_1[symmetric] using finite_space
```
```   111   by (subst measure_finitely_additive'')
```
```   112      (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=\<mu>])
```
```   113
```
```   114 lemma (in finite_prob_space) sum_over_space_real_distribution:
```
```   115   "(\<Sum>x\<in>X`space M. real (distribution X {x})) = 1"
```
```   116   unfolding distribution_def prob_space[symmetric] using finite_space
```
```   117   by (subst real_finite_measure_finite_Union[symmetric])
```
```   118      (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=prob])
```
```   119
```
```   120 section "Information theory"
```
```   121
```
```   122 definition
```
```   123   "KL_divergence b M \<mu> \<nu> =
```
```   124     measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
```
```   125
```
```   126 locale finite_information_space = finite_prob_space +
```
```   127   fixes b :: real assumes b_gt_1: "1 < b"
```
```   128
```
```   129 lemma (in finite_prob_space) distribution_mono:
```
```   130   assumes "\<And>t. \<lbrakk> t \<in> space M ; X t \<in> x \<rbrakk> \<Longrightarrow> Y t \<in> y"
```
```   131   shows "distribution X x \<le> distribution Y y"
```
```   132   unfolding distribution_def
```
```   133   using assms by (auto simp: sets_eq_Pow intro!: measure_mono)
```
```   134
```
```   135 lemma (in prob_space) distribution_remove_const:
```
```   136   shows "joint_distribution X (\<lambda>x. ()) {(x, ())} = distribution X {x}"
```
```   137   and "joint_distribution (\<lambda>x. ()) X {((), x)} = distribution X {x}"
```
```   138   and "joint_distribution X (\<lambda>x. (Y x, ())) {(x, y, ())} = joint_distribution X Y {(x, y)}"
```
```   139   and "joint_distribution X (\<lambda>x. ((), Y x)) {(x, (), y)} = joint_distribution X Y {(x, y)}"
```
```   140   and "distribution (\<lambda>x. ()) {()} = 1"
```
```   141   unfolding measure_space_1[symmetric]
```
```   142   by (auto intro!: arg_cong[where f="\<mu>"] simp: distribution_def)
```
```   143
```
```   144 context finite_information_space
```
```   145 begin
```
```   146
```
```   147 lemma distribution_mono_gt_0:
```
```   148   assumes gt_0: "0 < distribution X x"
```
```   149   assumes *: "\<And>t. \<lbrakk> t \<in> space M ; X t \<in> x \<rbrakk> \<Longrightarrow> Y t \<in> y"
```
```   150   shows "0 < distribution Y y"
```
```   151   by (rule less_le_trans[OF gt_0 distribution_mono]) (rule *)
```
```   152
```
```   153 lemma
```
```   154   assumes "0 \<le> A" and pos: "0 < A \<Longrightarrow> 0 < B" "0 < A \<Longrightarrow> 0 < C"
```
```   155   shows mult_log_mult: "A * log b (B * C) = A * log b B + A * log b C" (is "?mult")
```
```   156   and mult_log_divide: "A * log b (B / C) = A * log b B - A * log b C" (is "?div")
```
```   157 proof -
```
```   158   have "?mult \<and> ?div"
```
```   159   proof (cases "A = 0")
```
```   160     case False
```
```   161     hence "0 < A" using `0 \<le> A` by auto
```
```   162       with pos[OF this] show "?mult \<and> ?div" using b_gt_1
```
```   163         by (auto simp: log_divide log_mult field_simps)
```
```   164   qed simp
```
```   165   thus ?mult and ?div by auto
```
```   166 qed
```
```   167
```
```   168 lemma split_pairs:
```
```   169   shows
```
```   170     "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
```
```   171     "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
```
```   172
```
```   173 lemma (in finite_information_space) distribution_finite:
```
```   174   "distribution X A \<noteq> \<omega>"
```
```   175   using measure_finite[of "X -` A \<inter> space M"]
```
```   176   unfolding distribution_def sets_eq_Pow by auto
```
```   177
```
```   178 lemma (in finite_information_space) real_distribution_gt_0[simp]:
```
```   179   "0 < real (distribution Y y) \<longleftrightarrow>  0 < distribution Y y"
```
```   180   using assms by (auto intro!: real_pinfreal_pos distribution_finite)
```
```   181
```
```   182 lemma real_distribution_mult_pos_pos:
```
```   183   assumes "0 < distribution Y y"
```
```   184   and "0 < distribution X x"
```
```   185   shows "0 < real (distribution Y y * distribution X x)"
```
```   186   unfolding real_of_pinfreal_mult[symmetric]
```
```   187   using assms by (auto intro!: mult_pos_pos)
```
```   188
```
```   189 lemma real_distribution_divide_pos_pos:
```
```   190   assumes "0 < distribution Y y"
```
```   191   and "0 < distribution X x"
```
```   192   shows "0 < real (distribution Y y / distribution X x)"
```
```   193   unfolding divide_pinfreal_def real_of_pinfreal_mult[symmetric]
```
```   194   using assms distribution_finite[of X x] by (cases "distribution X x") (auto intro!: mult_pos_pos)
```
```   195
```
```   196 lemma real_distribution_mult_inverse_pos_pos:
```
```   197   assumes "0 < distribution Y y"
```
```   198   and "0 < distribution X x"
```
```   199   shows "0 < real (distribution Y y * inverse (distribution X x))"
```
```   200   unfolding divide_pinfreal_def real_of_pinfreal_mult[symmetric]
```
```   201   using assms distribution_finite[of X x] by (cases "distribution X x") (auto intro!: mult_pos_pos)
```
```   202
```
```   203 ML {*
```
```   204
```
```   205   (* tactic to solve equations of the form @{term "W * log b (X / (Y * Z)) = W * log b X - W * log b (Y * Z)"}
```
```   206      where @{term W} is a joint distribution of @{term X}, @{term Y}, and @{term Z}. *)
```
```   207
```
```   208   val mult_log_intros = [@{thm mult_log_divide}, @{thm mult_log_mult}]
```
```   209   val intros = [@{thm divide_pos_pos}, @{thm mult_pos_pos}, @{thm real_pinfreal_nonneg},
```
```   210     @{thm real_distribution_divide_pos_pos},
```
```   211     @{thm real_distribution_mult_inverse_pos_pos},
```
```   212     @{thm real_distribution_mult_pos_pos}]
```
```   213
```
```   214   val distribution_gt_0_tac = (rtac @{thm distribution_mono_gt_0}
```
```   215     THEN' assume_tac
```
```   216     THEN' clarsimp_tac (clasimpset_of @{context} addsimps2 @{thms split_pairs}))
```
```   217
```
```   218   val distr_mult_log_eq_tac = REPEAT_ALL_NEW (CHANGED o TRY o
```
```   219     (resolve_tac (mult_log_intros @ intros)
```
```   220       ORELSE' distribution_gt_0_tac
```
```   221       ORELSE' clarsimp_tac (clasimpset_of @{context})))
```
```   222
```
```   223   fun instanciate_term thy redex intro =
```
```   224     let
```
```   225       val intro_concl = Thm.concl_of intro
```
```   226
```
```   227       val lhs = intro_concl |> HOLogic.dest_Trueprop |> HOLogic.dest_eq |> fst
```
```   228
```
```   229       val m = SOME (Pattern.match thy (lhs, redex) (Vartab.empty, Vartab.empty))
```
```   230         handle Pattern.MATCH => NONE
```
```   231
```
```   232     in
```
```   233       Option.map (fn m => Envir.subst_term m intro_concl) m
```
```   234     end
```
```   235
```
```   236   fun mult_log_simproc simpset redex =
```
```   237   let
```
```   238     val ctxt = Simplifier.the_context simpset
```
```   239     val thy = ProofContext.theory_of ctxt
```
```   240     fun prove (SOME thm) = (SOME
```
```   241           (Goal.prove ctxt [] [] thm (K (distr_mult_log_eq_tac 1))
```
```   242            |> mk_meta_eq)
```
```   243             handle THM _ => NONE)
```
```   244       | prove NONE = NONE
```
```   245   in
```
```   246     get_first (instanciate_term thy (term_of redex) #> prove) mult_log_intros
```
```   247   end
```
```   248 *}
```
```   249
```
```   250 simproc_setup mult_log ("real (distribution X x) * log b (A * B)" |
```
```   251                         "real (distribution X x) * log b (A / B)") = {* K mult_log_simproc *}
```
```   252
```
```   253 end
```
```   254
```
```   255 lemma (in finite_measure_space) absolutely_continuousI:
```
```   256   assumes "finite_measure_space M \<nu>"
```
```   257   assumes v: "\<And>x. \<lbrakk> x \<in> space M ; \<mu> {x} = 0 \<rbrakk> \<Longrightarrow> \<nu> {x} = 0"
```
```   258   shows "absolutely_continuous \<nu>"
```
```   259 proof (unfold absolutely_continuous_def sets_eq_Pow, safe)
```
```   260   fix N assume "\<mu> N = 0" "N \<subseteq> space M"
```
```   261
```
```   262   interpret v: finite_measure_space M \<nu> by fact
```
```   263
```
```   264   have "\<nu> N = \<nu> (\<Union>x\<in>N. {x})" by simp
```
```   265   also have "\<dots> = (\<Sum>x\<in>N. \<nu> {x})"
```
```   266   proof (rule v.measure_finitely_additive''[symmetric])
```
```   267     show "finite N" using `N \<subseteq> space M` finite_space by (auto intro: finite_subset)
```
```   268     show "disjoint_family_on (\<lambda>i. {i}) N" unfolding disjoint_family_on_def by auto
```
```   269     fix x assume "x \<in> N" thus "{x} \<in> sets M" using `N \<subseteq> space M` sets_eq_Pow by auto
```
```   270   qed
```
```   271   also have "\<dots> = 0"
```
```   272   proof (safe intro!: setsum_0')
```
```   273     fix x assume "x \<in> N"
```
```   274     hence "\<mu> {x} \<le> \<mu> N" using sets_eq_Pow `N \<subseteq> space M` by (auto intro!: measure_mono)
```
```   275     hence "\<mu> {x} = 0" using `\<mu> N = 0` by simp
```
```   276     thus "\<nu> {x} = 0" using v[of x] `x \<in> N` `N \<subseteq> space M` by auto
```
```   277   qed
```
```   278   finally show "\<nu> N = 0" .
```
```   279 qed
```
```   280
```
```   281 lemma (in finite_measure_space) KL_divergence_eq_finite:
```
```   282   assumes v: "finite_measure_space M \<nu>"
```
```   283   assumes ac: "\<forall>x\<in>space M. \<mu> {x} = 0 \<longrightarrow> \<nu> {x} = 0"
```
```   284   shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
```
```   285 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
```
```   286   interpret v: finite_measure_space M \<nu> by fact
```
```   287   have ms: "measure_space M \<nu>" by fact
```
```   288
```
```   289   have ac: "absolutely_continuous \<nu>"
```
```   290     using ac by (auto intro!: absolutely_continuousI[OF v])
```
```   291
```
```   292   show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
```
```   293     using RN_deriv_finite_measure[OF ms ac]
```
```   294     by (auto intro!: setsum_cong simp: field_simps real_of_pinfreal_mult[symmetric])
```
```   295 qed
```
```   296
```
```   297 lemma (in finite_prob_space) finite_sum_over_space_eq_1:
```
```   298   "(\<Sum>x\<in>space M. real (\<mu> {x})) = 1"
```
```   299   using sum_over_space_eq_1 finite_measure by (simp add: real_of_pinfreal_setsum sets_eq_Pow)
```
```   300
```
```   301 lemma (in finite_prob_space) KL_divergence_positive_finite:
```
```   302   assumes v: "finite_prob_space M \<nu>"
```
```   303   assumes ac: "\<And>x. \<lbrakk> x \<in> space M ; \<mu> {x} = 0 \<rbrakk> \<Longrightarrow> \<nu> {x} = 0"
```
```   304   and "1 < b"
```
```   305   shows "0 \<le> KL_divergence b M \<nu> \<mu>"
```
```   306 proof -
```
```   307   interpret v: finite_prob_space M \<nu> using v .
```
```   308
```
```   309   have *: "space M \<noteq> {}" using not_empty by simp
```
```   310
```
```   311   hence "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
```
```   312   proof (subst KL_divergence_eq_finite)
```
```   313     show "finite_measure_space  M \<nu>" by fact
```
```   314
```
```   315     show "\<forall>x\<in>space M. \<mu> {x} = 0 \<longrightarrow> \<nu> {x} = 0" using ac by auto
```
```   316     show "- (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x}))) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
```
```   317     proof (safe intro!: log_setsum_divide *)
```
```   318       show "finite (space M)" using finite_space by simp
```
```   319       show "1 < b" by fact
```
```   320       show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
```
```   321
```
```   322       fix x assume x: "x \<in> space M"
```
```   323       { assume "0 < real (\<nu> {x})"
```
```   324         hence "\<mu> {x} \<noteq> 0" using ac[OF x] by auto
```
```   325         thus "0 < prob {x}" using measure_finite[of "{x}"] sets_eq_Pow x
```
```   326           by (cases "\<mu> {x}") simp_all }
```
```   327     qed auto
```
```   328   qed
```
```   329   thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
```
```   330 qed
```
```   331
```
```   332 definition (in prob_space)
```
```   333   "mutual_information b S T X Y =
```
```   334     KL_divergence b (prod_measure_space S T)
```
```   335       (joint_distribution X Y)
```
```   336       (prod_measure S (distribution X) T (distribution Y))"
```
```   337
```
```   338 abbreviation (in finite_information_space)
```
```   339   finite_mutual_information ("\<I>'(_ ; _')") where
```
```   340   "\<I>(X ; Y) \<equiv> mutual_information b
```
```   341     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
```
```   342     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
```
```   343
```
```   344 lemma prod_measure_times_finite:
```
```   345   assumes fms: "finite_measure_space M \<mu>" "finite_measure_space N \<nu>" and a: "a \<in> space M \<times> space N"
```
```   346   shows "prod_measure M \<mu> N \<nu> {a} = \<mu> {fst a} * \<nu> {snd a}"
```
```   347 proof (cases a)
```
```   348   case (Pair b c)
```
```   349   hence a_eq: "{a} = {b} \<times> {c}" by simp
```
```   350
```
```   351   interpret M: finite_measure_space M \<mu> by fact
```
```   352   interpret N: finite_measure_space N \<nu> by fact
```
```   353
```
```   354   from finite_measure_space.finite_prod_measure_times[OF fms, of "{b}" "{c}"] M.sets_eq_Pow N.sets_eq_Pow a Pair
```
```   355   show ?thesis unfolding a_eq by simp
```
```   356 qed
```
```   357
```
```   358 lemma setsum_cartesian_product':
```
```   359   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
```
```   360   unfolding setsum_cartesian_product by simp
```
```   361
```
```   362 lemma (in finite_information_space) mutual_information_generic_eq:
```
```   363   assumes MX: "finite_measure_space MX (distribution X)"
```
```   364   assumes MY: "finite_measure_space MY (distribution Y)"
```
```   365   shows "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
```
```   366       real (joint_distribution X Y {(x,y)}) *
```
```   367       log b (real (joint_distribution X Y {(x,y)}) /
```
```   368       (real (distribution X {x}) * real (distribution Y {y}))))"
```
```   369 proof -
```
```   370   let ?P = "prod_measure_space MX MY"
```
```   371   let ?\<mu> = "prod_measure MX (distribution X) MY (distribution Y)"
```
```   372   let ?\<nu> = "joint_distribution X Y"
```
```   373   interpret X: finite_measure_space MX "distribution X" by fact
```
```   374   moreover interpret Y: finite_measure_space MY "distribution Y" by fact
```
```   375   have fms: "finite_measure_space MX (distribution X)"
```
```   376             "finite_measure_space MY (distribution Y)" by fact+
```
```   377   have fms_P: "finite_measure_space ?P ?\<mu>"
```
```   378     by (rule X.finite_measure_space_finite_prod_measure) fact
```
```   379   then interpret P: finite_measure_space ?P ?\<mu> .
```
```   380   have fms_P': "finite_measure_space ?P ?\<nu>"
```
```   381       using finite_product_measure_space[of "space MX" "space MY"]
```
```   382         X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
```
```   383         X.sets_eq_Pow Y.sets_eq_Pow
```
```   384       by (simp add: prod_measure_space_def sigma_def)
```
```   385   then interpret P': finite_measure_space ?P ?\<nu> .
```
```   386   { fix x assume "x \<in> space ?P"
```
```   387     hence in_MX: "{fst x} \<in> sets MX" "{snd x} \<in> sets MY" using X.sets_eq_Pow Y.sets_eq_Pow
```
```   388       by (auto simp: prod_measure_space_def)
```
```   389     assume "?\<mu> {x} = 0"
```
```   390     with X.finite_prod_measure_times[OF fms(2), of "{fst x}" "{snd x}"] in_MX
```
```   391     have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
```
```   392       by (simp add: prod_measure_space_def)
```
```   393     hence "joint_distribution X Y {x} = 0"
```
```   394       by (cases x) (auto simp: distribution_order) }
```
```   395   note measure_0 = this
```
```   396   show ?thesis
```
```   397     unfolding Let_def mutual_information_def
```
```   398     using measure_0 fms_P fms_P' MX MY P.absolutely_continuous_def
```
```   399     by (subst P.KL_divergence_eq_finite)
```
```   400        (auto simp add: prod_measure_space_def prod_measure_times_finite
```
```   401          finite_prob_space_eq setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
```
```   402 qed
```
```   403
```
```   404 lemma (in finite_information_space)
```
```   405   assumes MX: "finite_prob_space MX (distribution X)"
```
```   406   assumes MY: "finite_prob_space MY (distribution Y)"
```
```   407   and X_space: "X ` space M \<subseteq> space MX" and Y_space: "Y ` space M \<subseteq> space MY"
```
```   408   shows mutual_information_eq_generic:
```
```   409     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
```
```   410       real (joint_distribution X Y {(x,y)}) *
```
```   411       log b (real (joint_distribution X Y {(x,y)}) /
```
```   412       (real (distribution X {x}) * real (distribution Y {y}))))"
```
```   413     (is "?equality")
```
```   414   and mutual_information_positive_generic:
```
```   415     "0 \<le> mutual_information b MX MY X Y" (is "?positive")
```
```   416 proof -
```
```   417   let ?P = "prod_measure_space MX MY"
```
```   418   let ?\<mu> = "prod_measure MX (distribution X) MY (distribution Y)"
```
```   419   let ?\<nu> = "joint_distribution X Y"
```
```   420
```
```   421   interpret X: finite_prob_space MX "distribution X" by fact
```
```   422   moreover interpret Y: finite_prob_space MY "distribution Y" by fact
```
```   423   have ms_X: "measure_space MX (distribution X)"
```
```   424     and ms_Y: "measure_space MY (distribution Y)"
```
```   425     and fms: "finite_measure_space MX (distribution X)" "finite_measure_space MY (distribution Y)" by fact+
```
```   426   have fms_P: "finite_measure_space ?P ?\<mu>"
```
```   427     by (rule X.finite_measure_space_finite_prod_measure) fact
```
```   428   then interpret P: finite_measure_space ?P ?\<mu> .
```
```   429
```
```   430   have fms_P': "finite_measure_space ?P ?\<nu>"
```
```   431       using finite_product_measure_space[of "space MX" "space MY"]
```
```   432         X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
```
```   433         X.sets_eq_Pow Y.sets_eq_Pow
```
```   434       by (simp add: prod_measure_space_def sigma_def)
```
```   435   then interpret P': finite_measure_space ?P ?\<nu> .
```
```   436
```
```   437   { fix x assume "x \<in> space ?P"
```
```   438     hence in_MX: "{fst x} \<in> sets MX" "{snd x} \<in> sets MY" using X.sets_eq_Pow Y.sets_eq_Pow
```
```   439       by (auto simp: prod_measure_space_def)
```
```   440
```
```   441     assume "?\<mu> {x} = 0"
```
```   442     with X.finite_prod_measure_times[OF fms(2), of "{fst x}" "{snd x}"] in_MX
```
```   443     have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
```
```   444       by (simp add: prod_measure_space_def)
```
```   445
```
```   446     hence "joint_distribution X Y {x} = 0"
```
```   447       by (cases x) (auto simp: distribution_order) }
```
```   448   note measure_0 = this
```
```   449
```
```   450   show ?equality
```
```   451     unfolding Let_def mutual_information_def
```
```   452     using measure_0 fms_P fms_P' MX MY P.absolutely_continuous_def
```
```   453     by (subst P.KL_divergence_eq_finite)
```
```   454        (auto simp add: prod_measure_space_def prod_measure_times_finite
```
```   455          finite_prob_space_eq setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
```
```   456
```
```   457   show ?positive
```
```   458     unfolding Let_def mutual_information_def using measure_0 b_gt_1
```
```   459   proof (safe intro!: finite_prob_space.KL_divergence_positive_finite, simp_all)
```
```   460     have "?\<mu> (space ?P) = 1"
```
```   461       using X.top Y.top X.measure_space_1 Y.measure_space_1 fms
```
```   462       by (simp add: prod_measure_space_def X.finite_prod_measure_times)
```
```   463     with fms_P show "finite_prob_space ?P ?\<mu>"
```
```   464       by (simp add: finite_prob_space_eq)
```
```   465
```
```   466     from ms_X ms_Y X.top Y.top X.measure_space_1 Y.measure_space_1 Y.not_empty X_space Y_space
```
```   467     have "?\<nu> (space ?P) = 1" unfolding measure_space_1[symmetric]
```
```   468       by (auto intro!: arg_cong[where f="\<mu>"]
```
```   469                simp add: prod_measure_space_def distribution_def vimage_Times comp_def)
```
```   470     with fms_P' show "finite_prob_space ?P ?\<nu>"
```
```   471       by (simp add: finite_prob_space_eq)
```
```   472   qed
```
```   473 qed
```
```   474
```
```   475 lemma (in finite_information_space) mutual_information_eq:
```
```   476   "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
```
```   477     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
```
```   478                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
```
```   479   by (subst mutual_information_eq_generic) (simp_all add: finite_prob_space_of_images)
```
```   480
```
```   481 lemma (in finite_information_space) mutual_information_positive: "0 \<le> \<I>(X;Y)"
```
```   482   by (subst mutual_information_positive_generic) (simp_all add: finite_prob_space_of_images)
```
```   483
```
```   484 definition (in prob_space)
```
```   485   "entropy b s X = mutual_information b s s X X"
```
```   486
```
```   487 abbreviation (in finite_information_space)
```
```   488   finite_entropy ("\<H>'(_')") where
```
```   489   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
```
```   490
```
```   491 lemma (in finite_information_space) joint_distribution_remove[simp]:
```
```   492     "joint_distribution X X {(x, x)} = distribution X {x}"
```
```   493   unfolding distribution_def by (auto intro!: arg_cong[where f="\<mu>"])
```
```   494
```
```   495 lemma (in finite_information_space) entropy_eq:
```
```   496   "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
```
```   497 proof -
```
```   498   { fix f
```
```   499     { fix x y
```
```   500       have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
```
```   501         hence "real (distribution (\<lambda>x. (X x, X x))  {(x,y)}) * f x y =
```
```   502             (if x = y then real (distribution X {x}) * f x y else 0)"
```
```   503         unfolding distribution_def by auto }
```
```   504     hence "(\<Sum>(x, y) \<in> X ` space M \<times> X ` space M. real (joint_distribution X X {(x, y)}) * f x y) =
```
```   505       (\<Sum>x \<in> X ` space M. real (distribution X {x}) * f x x)"
```
```   506       unfolding setsum_cartesian_product' by (simp add: setsum_cases finite_space) }
```
```   507   note remove_cartesian_product = this
```
```   508
```
```   509   show ?thesis
```
```   510     unfolding entropy_def mutual_information_eq setsum_negf[symmetric] remove_cartesian_product
```
```   511     by (auto intro!: setsum_cong)
```
```   512 qed
```
```   513
```
```   514 lemma (in finite_information_space) entropy_positive: "0 \<le> \<H>(X)"
```
```   515   unfolding entropy_def using mutual_information_positive .
```
```   516
```
```   517 definition (in prob_space)
```
```   518   "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
```
```   519     mutual_information b M1 (prod_measure_space M2 M3) X (\<lambda>x. (Y x, Z x)) -
```
```   520     mutual_information b M1 M3 X Z"
```
```   521
```
```   522 abbreviation (in finite_information_space)
```
```   523   finite_conditional_mutual_information ("\<I>'( _ ; _ | _ ')") where
```
```   524   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
```
```   525     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
```
```   526     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
```
```   527     \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
```
```   528     X Y Z"
```
```   529
```
```   530 lemma (in finite_information_space) setsum_distribution_gen:
```
```   531   assumes "Z -` {c} \<inter> space M = (\<Union>x \<in> X`space M. Y -` {f x}) \<inter> space M"
```
```   532   and "inj_on f (X`space M)"
```
```   533   shows "(\<Sum>x \<in> X`space M. distribution Y {f x}) = distribution Z {c}"
```
```   534   unfolding distribution_def assms
```
```   535   using finite_space assms
```
```   536   by (subst measure_finitely_additive'')
```
```   537      (auto simp add: disjoint_family_on_def sets_eq_Pow inj_on_def
```
```   538       intro!: arg_cong[where f=prob])
```
```   539
```
```   540 lemma (in finite_information_space) setsum_distribution:
```
```   541   "(\<Sum>x \<in> X`space M. joint_distribution X Y {(x, y)}) = distribution Y {y}"
```
```   542   "(\<Sum>y \<in> Y`space M. joint_distribution X Y {(x, y)}) = distribution X {x}"
```
```   543   "(\<Sum>x \<in> X`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution Y Z {(y, z)}"
```
```   544   "(\<Sum>y \<in> Y`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Z {(x, z)}"
```
```   545   "(\<Sum>z \<in> Z`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Y {(x, y)}"
```
```   546   by (auto intro!: inj_onI setsum_distribution_gen)
```
```   547
```
```   548 lemma (in finite_information_space) setsum_real_distribution_gen:
```
```   549   assumes "Z -` {c} \<inter> space M = (\<Union>x \<in> X`space M. Y -` {f x}) \<inter> space M"
```
```   550   and "inj_on f (X`space M)"
```
```   551   shows "(\<Sum>x \<in> X`space M. real (distribution Y {f x})) = real (distribution Z {c})"
```
```   552   unfolding distribution_def assms
```
```   553   using finite_space assms
```
```   554   by (subst real_finite_measure_finite_Union[symmetric])
```
```   555      (auto simp add: disjoint_family_on_def sets_eq_Pow inj_on_def
```
```   556         intro!: arg_cong[where f=prob])
```
```   557
```
```   558 lemma (in finite_information_space) setsum_real_distribution:
```
```   559   "(\<Sum>x \<in> X`space M. real (joint_distribution X Y {(x, y)})) = real (distribution Y {y})"
```
```   560   "(\<Sum>y \<in> Y`space M. real (joint_distribution X Y {(x, y)})) = real (distribution X {x})"
```
```   561   "(\<Sum>x \<in> X`space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)})) = real (joint_distribution Y Z {(y, z)})"
```
```   562   "(\<Sum>y \<in> Y`space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)})) = real (joint_distribution X Z {(x, z)})"
```
```   563   "(\<Sum>z \<in> Z`space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)})) = real (joint_distribution X Y {(x, y)})"
```
```   564   by (auto intro!: inj_onI setsum_real_distribution_gen)
```
```   565
```
```   566 lemma (in finite_information_space) conditional_mutual_information_eq_sum:
```
```   567    "\<I>(X ; Y | Z) =
```
```   568      (\<Sum>(x, y, z)\<in>X ` space M \<times> (\<lambda>x. (Y x, Z x)) ` space M.
```
```   569              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
```
```   570              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)})/
```
```   571         real (distribution (\<lambda>x. (Y x, Z x)) {(y, z)}))) -
```
```   572      (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
```
```   573         real (distribution (\<lambda>x. (X x, Z x)) {(x,z)}) * log b (real (distribution (\<lambda>x. (X x, Z x)) {(x,z)}) / real (distribution Z {z})))"
```
```   574   (is "_ = ?rhs")
```
```   575 proof -
```
```   576   have setsum_product:
```
```   577     "\<And>f x. (\<Sum>v\<in>(\<lambda>x. (Y x, Z x)) ` space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x,v)}) * f v)
```
```   578       = (\<Sum>v\<in>Y ` space M \<times> Z ` space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x,v)}) * f v)"
```
```   579   proof (safe intro!: setsum_mono_zero_cong_left imageI)
```
```   580     fix x y z f
```
```   581     assume *: "(Y y, Z z) \<notin> (\<lambda>x. (Y x, Z x)) ` space M" and "y \<in> space M" "z \<in> space M"
```
```   582     hence "(\<lambda>x. (X x, Y x, Z x)) -` {(x, Y y, Z z)} \<inter> space M = {}"
```
```   583     proof safe
```
```   584       fix x' assume x': "x' \<in> space M" and eq: "Y x' = Y y" "Z x' = Z z"
```
```   585       have "(Y y, Z z) \<in> (\<lambda>x. (Y x, Z x)) ` space M" using eq[symmetric] x' by auto
```
```   586       thus "x' \<in> {}" using * by auto
```
```   587     qed
```
```   588     thus "real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, Y y, Z z)}) * f (Y y) (Z z) = 0"
```
```   589       unfolding distribution_def by simp
```
```   590   qed (simp add: finite_space)
```
```   591
```
```   592   thus ?thesis
```
```   593     unfolding conditional_mutual_information_def Let_def mutual_information_eq
```
```   594     by (subst mutual_information_eq_generic)
```
```   595        (auto simp: prod_measure_space_def sigma_prod_sets_finite finite_space sigma_def
```
```   596         finite_prob_space_of_images finite_product_prob_space_of_images
```
```   597         setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
```
```   598         setsum_left_distrib[symmetric] setsum_real_distribution
```
```   599       cong: setsum_cong)
```
```   600 qed
```
```   601
```
```   602 lemma (in finite_information_space) conditional_mutual_information_eq:
```
```   603   "\<I>(X ; Y | Z) = (\<Sum>(x, y, z) \<in> X ` space M \<times> Y ` space M \<times> Z ` space M.
```
```   604              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
```
```   605              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
```
```   606     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
```
```   607   unfolding conditional_mutual_information_def Let_def mutual_information_eq
```
```   608   by (subst mutual_information_eq_generic)
```
```   609      (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space
```
```   610       finite_prob_space_of_images finite_product_prob_space_of_images sigma_def
```
```   611       setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
```
```   612       setsum_left_distrib[symmetric] setsum_real_distribution setsum_commute[where A="Y`space M"]
```
```   613       real_of_pinfreal_mult[symmetric]
```
```   614     cong: setsum_cong)
```
```   615
```
```   616 lemma (in finite_information_space) conditional_mutual_information_eq_mutual_information:
```
```   617   "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
```
```   618 proof -
```
```   619   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
```
```   620
```
```   621   show ?thesis
```
```   622     unfolding conditional_mutual_information_eq mutual_information_eq
```
```   623     by (simp add: setsum_cartesian_product' distribution_remove_const)
```
```   624 qed
```
```   625
```
```   626 lemma (in finite_prob_space) distribution_finite:
```
```   627   "distribution X A \<noteq> \<omega>"
```
```   628   by (auto simp: sets_eq_Pow distribution_def intro!: measure_finite)
```
```   629
```
```   630 lemma (in finite_prob_space) real_distribution_order:
```
```   631   shows "r \<le> real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r \<le> real (distribution X {x})"
```
```   632   and "r \<le> real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r \<le> real (distribution Y {y})"
```
```   633   and "r < real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r < real (distribution X {x})"
```
```   634   and "r < real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r < real (distribution Y {y})"
```
```   635   and "distribution X {x} = 0 \<Longrightarrow> real (joint_distribution X Y {(x, y)}) = 0"
```
```   636   and "distribution Y {y} = 0 \<Longrightarrow> real (joint_distribution X Y {(x, y)}) = 0"
```
```   637   using real_of_pinfreal_mono[OF distribution_finite joint_distribution_restriction_fst, of X Y "{(x, y)}"]
```
```   638   using real_of_pinfreal_mono[OF distribution_finite joint_distribution_restriction_snd, of X Y "{(x, y)}"]
```
```   639   using real_pinfreal_nonneg[of "joint_distribution X Y {(x, y)}"]
```
```   640   by auto
```
```   641
```
```   642 lemma (in finite_information_space) conditional_mutual_information_positive:
```
```   643   "0 \<le> \<I>(X ; Y | Z)"
```
```   644 proof -
```
```   645   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
```
```   646   let "?dXZ A" = "real (joint_distribution X Z A)"
```
```   647   let "?dYZ A" = "real (joint_distribution Y Z A)"
```
```   648   let "?dX A" = "real (distribution X A)"
```
```   649   let "?dZ A" = "real (distribution Z A)"
```
```   650   let ?M = "X ` space M \<times> Y ` space M \<times> Z ` space M"
```
```   651
```
```   652   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: expand_fun_eq)
```
```   653
```
```   654   have "- (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
```
```   655     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))
```
```   656     \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
```
```   657     unfolding split_beta
```
```   658   proof (rule log_setsum_divide)
```
```   659     show "?M \<noteq> {}" using not_empty by simp
```
```   660     show "1 < b" using b_gt_1 .
```
```   661
```
```   662     fix x assume "x \<in> ?M"
```
```   663     let ?x = "(fst x, fst (snd x), snd (snd x))"
```
```   664
```
```   665     show "0 \<le> ?dXYZ {?x}" using real_pinfreal_nonneg .
```
```   666     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
```
```   667      by (simp add: real_pinfreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
```
```   668
```
```   669     assume *: "0 < ?dXYZ {?x}"
```
```   670     thus "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
```
```   671       apply (rule_tac divide_pos_pos mult_pos_pos)+
```
```   672       by (auto simp add: real_distribution_gt_0 intro: distribution_order(6) distribution_mono_gt_0)
```
```   673   qed (simp_all add: setsum_cartesian_product' sum_over_space_real_distribution setsum_real_distribution finite_space)
```
```   674   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>Z`space M. ?dZ {z})"
```
```   675     apply (simp add: setsum_cartesian_product')
```
```   676     apply (subst setsum_commute)
```
```   677     apply (subst (2) setsum_commute)
```
```   678     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] setsum_real_distribution
```
```   679           intro!: setsum_cong)
```
```   680   finally show ?thesis
```
```   681     unfolding conditional_mutual_information_eq sum_over_space_real_distribution
```
```   682     by (simp add: real_of_pinfreal_mult[symmetric])
```
```   683 qed
```
```   684
```
```   685 definition (in prob_space)
```
```   686   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
```
```   687
```
```   688 abbreviation (in finite_information_space)
```
```   689   finite_conditional_entropy ("\<H>'(_ | _')") where
```
```   690   "\<H>(X | Y) \<equiv> conditional_entropy b
```
```   691     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
```
```   692     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
```
```   693
```
```   694 lemma (in finite_information_space) conditional_entropy_positive:
```
```   695   "0 \<le> \<H>(X | Y)" unfolding conditional_entropy_def using conditional_mutual_information_positive .
```
```   696
```
```   697 lemma (in finite_information_space) conditional_entropy_eq:
```
```   698   "\<H>(X | Z) =
```
```   699      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
```
```   700          real (joint_distribution X Z {(x, z)}) *
```
```   701          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
```
```   702 proof -
```
```   703   have *: "\<And>x y z. (\<lambda>x. (X x, X x, Z x)) -` {(x, y, z)} = (if x = y then (\<lambda>x. (X x, Z x)) -` {(x, z)} else {})" by auto
```
```   704   show ?thesis
```
```   705     unfolding conditional_mutual_information_eq_sum
```
```   706       conditional_entropy_def distribution_def *
```
```   707     by (auto intro!: setsum_0')
```
```   708 qed
```
```   709
```
```   710 lemma (in finite_information_space) mutual_information_eq_entropy_conditional_entropy:
```
```   711   "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
```
```   712   unfolding mutual_information_eq entropy_eq conditional_entropy_eq
```
```   713   using finite_space
```
```   714   by (auto simp add: setsum_addf setsum_subtractf setsum_cartesian_product'
```
```   715       setsum_left_distrib[symmetric] setsum_addf setsum_real_distribution)
```
```   716
```
```   717 lemma (in finite_information_space) conditional_entropy_less_eq_entropy:
```
```   718   "\<H>(X | Z) \<le> \<H>(X)"
```
```   719 proof -
```
```   720   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy .
```
```   721   with mutual_information_positive[of X Z] entropy_positive[of X]
```
```   722   show ?thesis by auto
```
```   723 qed
```
```   724
```
```   725 (* -------------Entropy of a RV with a certain event is zero---------------- *)
```
```   726
```
```   727 lemma (in finite_information_space) finite_entropy_certainty_eq_0:
```
```   728   assumes "x \<in> X ` space M" and "distribution X {x} = 1"
```
```   729   shows "\<H>(X) = 0"
```
```   730 proof -
```
```   731   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
```
```   732     by (rule finite_prob_space_of_images)
```
```   733
```
```   734   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
```
```   735     using X.measure_compl[of "{x}"] assms by auto
```
```   736   also have "\<dots> = 0" using X.prob_space assms by auto
```
```   737   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
```
```   738
```
```   739   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
```
```   740     hence "{y} \<subseteq> X ` space M - {x}" by auto
```
```   741     from X.measure_mono[OF this] X0 asm
```
```   742     have "distribution X {y} = 0" by auto }
```
```   743
```
```   744   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
```
```   745     using assms by auto
```
```   746
```
```   747   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
```
```   748
```
```   749   show ?thesis unfolding entropy_eq by (auto simp: y fi)
```
```   750 qed
```
```   751 (* --------------- upper bound on entropy for a rv ------------------------- *)
```
```   752
```
```   753 lemma (in finite_prob_space) distribution_1:
```
```   754   "distribution X A \<le> 1"
```
```   755   unfolding distribution_def measure_space_1[symmetric]
```
```   756   by (auto intro!: measure_mono simp: sets_eq_Pow)
```
```   757
```
```   758 lemma (in finite_prob_space) real_distribution_1:
```
```   759   "real (distribution X A) \<le> 1"
```
```   760   unfolding real_pinfreal_1[symmetric]
```
```   761   by (rule real_of_pinfreal_mono[OF _ distribution_1]) simp
```
```   762
```
```   763 lemma (in finite_information_space) finite_entropy_le_card:
```
```   764   "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
```
```   765 proof -
```
```   766   let "?d x" = "distribution X {x}"
```
```   767   let "?p x" = "real (?d x)"
```
```   768   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
```
```   769     by (auto intro!: setsum_cong simp: entropy_eq setsum_negf[symmetric])
```
```   770   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
```
```   771     apply (rule log_setsum')
```
```   772     using not_empty b_gt_1 finite_space sum_over_space_real_distribution
```
```   773     by auto
```
```   774   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
```
```   775     apply (rule arg_cong[where f="\<lambda>f. log b (\<Sum>x\<in>X`space M. f x)"])
```
```   776     using distribution_finite[of X] by (auto simp: expand_fun_eq real_of_pinfreal_eq_0)
```
```   777   finally show ?thesis
```
```   778     using finite_space by (auto simp: setsum_cases real_eq_of_nat)
```
```   779 qed
```
```   780
```
```   781 (* --------------- entropy is maximal for a uniform rv --------------------- *)
```
```   782
```
```   783 lemma (in finite_prob_space) uniform_prob:
```
```   784   assumes "x \<in> space M"
```
```   785   assumes "\<And> x y. \<lbrakk>x \<in> space M ; y \<in> space M\<rbrakk> \<Longrightarrow> prob {x} = prob {y}"
```
```   786   shows "prob {x} = 1 / real (card (space M))"
```
```   787 proof -
```
```   788   have prob_x: "\<And> y. y \<in> space M \<Longrightarrow> prob {y} = prob {x}"
```
```   789     using assms(2)[OF _ `x \<in> space M`] by blast
```
```   790   have "1 = prob (space M)"
```
```   791     using prob_space by auto
```
```   792   also have "\<dots> = (\<Sum> x \<in> space M. prob {x})"
```
```   793     using real_finite_measure_finite_Union[of "space M" "\<lambda> x. {x}", simplified]
```
```   794       sets_eq_Pow inj_singleton[unfolded inj_on_def, rule_format]
```
```   795       finite_space unfolding disjoint_family_on_def  prob_space[symmetric]
```
```   796     by (auto simp add:setsum_restrict_set)
```
```   797   also have "\<dots> = (\<Sum> y \<in> space M. prob {x})"
```
```   798     using prob_x by auto
```
```   799   also have "\<dots> = real_of_nat (card (space M)) * prob {x}" by simp
```
```   800   finally have one: "1 = real (card (space M)) * prob {x}"
```
```   801     using real_eq_of_nat by auto
```
```   802   hence two: "real (card (space M)) \<noteq> 0" by fastsimp
```
```   803   from one have three: "prob {x} \<noteq> 0" by fastsimp
```
```   804   thus ?thesis using one two three divide_cancel_right
```
```   805     by (auto simp:field_simps)
```
```   806 qed
```
```   807
```
```   808 lemma (in finite_information_space) finite_entropy_uniform_max:
```
```   809   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
```
```   810   shows "\<H>(X) = log b (real (card (X ` space M)))"
```
```   811 proof -
```
```   812   note uniform =
```
```   813     finite_prob_space_of_images[of X, THEN finite_prob_space.uniform_prob, simplified]
```
```   814
```
```   815   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
```
```   816     using finite_space not_empty by auto
```
```   817
```
```   818   { fix x assume "x \<in> X ` space M"
```
```   819     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
```
```   820     proof (rule uniform)
```
```   821       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
```
```   822       from assms[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
```
```   823     qed }
```
```   824   thus ?thesis
```
```   825     using not_empty finite_space b_gt_1 card_gt0
```
```   826     by (simp add: entropy_eq real_eq_of_nat[symmetric] log_divide)
```
```   827 qed
```
```   828
```
```   829 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
```
```   830
```
```   831 lemma subvimageI:
```
```   832   assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
```
```   833   shows "subvimage A f g"
```
```   834   using assms unfolding subvimage_def by blast
```
```   835
```
```   836 lemma subvimageE[consumes 1]:
```
```   837   assumes "subvimage A f g"
```
```   838   obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
```
```   839   using assms unfolding subvimage_def by blast
```
```   840
```
```   841 lemma subvimageD:
```
```   842   "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
```
```   843   using assms unfolding subvimage_def by blast
```
```   844
```
```   845 lemma subvimage_subset:
```
```   846   "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
```
```   847   unfolding subvimage_def by auto
```
```   848
```
```   849 lemma subvimage_idem[intro]: "subvimage A g g"
```
```   850   by (safe intro!: subvimageI)
```
```   851
```
```   852 lemma subvimage_comp_finer[intro]:
```
```   853   assumes svi: "subvimage A g h"
```
```   854   shows "subvimage A g (f \<circ> h)"
```
```   855 proof (rule subvimageI, simp)
```
```   856   fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
```
```   857   from svi[THEN subvimageD, OF this]
```
```   858   show "f (h x) = f (h y)" by simp
```
```   859 qed
```
```   860
```
```   861 lemma subvimage_comp_gran:
```
```   862   assumes svi: "subvimage A g h"
```
```   863   assumes inj: "inj_on f (g ` A)"
```
```   864   shows "subvimage A (f \<circ> g) h"
```
```   865   by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
```
```   866
```
```   867 lemma subvimage_comp:
```
```   868   assumes svi: "subvimage (f ` A) g h"
```
```   869   shows "subvimage A (g \<circ> f) (h \<circ> f)"
```
```   870   by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
```
```   871
```
```   872 lemma subvimage_trans:
```
```   873   assumes fg: "subvimage A f g"
```
```   874   assumes gh: "subvimage A g h"
```
```   875   shows "subvimage A f h"
```
```   876   by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
```
```   877
```
```   878 lemma subvimage_translator:
```
```   879   assumes svi: "subvimage A f g"
```
```   880   shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
```
```   881 proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
```
```   882   fix x assume "x \<in> A"
```
```   883   show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
```
```   884     by (rule theI2[of _ "g x"])
```
```   885       (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
```
```   886 qed
```
```   887
```
```   888 lemma subvimage_translator_image:
```
```   889   assumes svi: "subvimage A f g"
```
```   890   shows "\<exists>h. h ` f ` A = g ` A"
```
```   891 proof -
```
```   892   from subvimage_translator[OF svi]
```
```   893   obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
```
```   894   thus ?thesis
```
```   895     by (auto intro!: exI[of _ h]
```
```   896       simp: image_compose[symmetric] comp_def cong: image_cong)
```
```   897 qed
```
```   898
```
```   899 lemma subvimage_finite:
```
```   900   assumes svi: "subvimage A f g" and fin: "finite (f`A)"
```
```   901   shows "finite (g`A)"
```
```   902 proof -
```
```   903   from subvimage_translator_image[OF svi]
```
```   904   obtain h where "g`A = h`f`A" by fastsimp
```
```   905   with fin show "finite (g`A)" by simp
```
```   906 qed
```
```   907
```
```   908 lemma subvimage_disj:
```
```   909   assumes svi: "subvimage A f g"
```
```   910   shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
```
```   911       f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
```
```   912 proof (rule disjCI)
```
```   913   assume "\<not> ?dist"
```
```   914   then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
```
```   915   thus "?sub" using svi unfolding subvimage_def by auto
```
```   916 qed
```
```   917
```
```   918 lemma setsum_image_split:
```
```   919   assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
```
```   920   shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
```
```   921     (is "?lhs = ?rhs")
```
```   922 proof -
```
```   923   have "f ` A =
```
```   924       snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
```
```   925       (is "_ = snd ` ?SIGMA")
```
```   926     unfolding image_split_eq_Sigma[symmetric]
```
```   927     by (simp add: image_compose[symmetric] comp_def)
```
```   928   moreover
```
```   929   have snd_inj: "inj_on snd ?SIGMA"
```
```   930     unfolding image_split_eq_Sigma[symmetric]
```
```   931     by (auto intro!: inj_onI subvimageD[OF svi])
```
```   932   ultimately
```
```   933   have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
```
```   934     by (auto simp: setsum_reindex intro: setsum_cong)
```
```   935   also have "... = ?rhs"
```
```   936     using subvimage_finite[OF svi fin] fin
```
```   937     apply (subst setsum_Sigma[symmetric])
```
```   938     by (auto intro!: finite_subset[of _ "f`A"])
```
```   939   finally show ?thesis .
```
```   940 qed
```
```   941
```
```   942 lemma (in finite_information_space) entropy_partition:
```
```   943   assumes svi: "subvimage (space M) X P"
```
```   944   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
```
```   945 proof -
```
```   946   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
```
```   947     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
```
```   948     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
```
```   949   proof (subst setsum_image_split[OF svi],
```
```   950       safe intro!: finite_imageI finite_space setsum_mono_zero_cong_left imageI)
```
```   951     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
```
```   952     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
```
```   953     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
```
```   954     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
```
```   955     show "x \<in> P -` {P p}" by auto
```
```   956   next
```
```   957     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
```
```   958     assume "P x = P p"
```
```   959     from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
```
```   960     have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
```
```   961       by auto
```
```   962     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
```
```   963       by auto
```
```   964     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
```
```   965           real (joint_distribution X P {(X x, P p)}) *
```
```   966           log b (real (joint_distribution X P {(X x, P p)}))"
```
```   967       by (auto simp: distribution_def)
```
```   968   qed
```
```   969   thus ?thesis
```
```   970   unfolding entropy_eq conditional_entropy_eq
```
```   971     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
```
```   972       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
```
```   973 qed
```
```   974
```
```   975 corollary (in finite_information_space) entropy_data_processing:
```
```   976   "\<H>(f \<circ> X) \<le> \<H>(X)"
```
```   977   by (subst (2) entropy_partition[of _ "f \<circ> X"]) (auto intro: conditional_entropy_positive)
```
```   978
```
```   979 lemma (in prob_space) distribution_cong:
```
```   980   assumes "\<And>x. x \<in> space M \<Longrightarrow> X x = Y x"
```
```   981   shows "distribution X = distribution Y"
```
```   982   unfolding distribution_def expand_fun_eq
```
```   983   using assms by (auto intro!: arg_cong[where f="\<mu>"])
```
```   984
```
```   985 lemma (in prob_space) joint_distribution_cong:
```
```   986   assumes "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
```
```   987   assumes "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
```
```   988   shows "joint_distribution X Y = joint_distribution X' Y'"
```
```   989   unfolding distribution_def expand_fun_eq
```
```   990   using assms by (auto intro!: arg_cong[where f="\<mu>"])
```
```   991
```
```   992 lemma image_cong:
```
```   993   "\<lbrakk> \<And>x. x \<in> S \<Longrightarrow> X x = X' x \<rbrakk> \<Longrightarrow> X ` S = X' ` S"
```
```   994   by (auto intro!: image_eqI)
```
```   995
```
```   996 lemma (in finite_information_space) mutual_information_cong:
```
```   997   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
```
```   998   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
```
```   999   shows "\<I>(X ; Y) = \<I>(X' ; Y')"
```
```  1000 proof -
```
```  1001   have "X ` space M = X' ` space M" using X by (rule image_cong)
```
```  1002   moreover have "Y ` space M = Y' ` space M" using Y by (rule image_cong)
```
```  1003   ultimately show ?thesis
```
```  1004   unfolding mutual_information_eq
```
```  1005     using
```
```  1006       assms[THEN distribution_cong]
```
```  1007       joint_distribution_cong[OF assms]
```
```  1008     by (auto intro!: setsum_cong)
```
```  1009 qed
```
```  1010
```
```  1011 corollary (in finite_information_space) entropy_of_inj:
```
```  1012   assumes "inj_on f (X`space M)"
```
```  1013   shows "\<H>(f \<circ> X) = \<H>(X)"
```
```  1014 proof (rule antisym)
```
```  1015   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing .
```
```  1016 next
```
```  1017   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
```
```  1018     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF assms])
```
```  1019   also have "... \<le> \<H>(f \<circ> X)"
```
```  1020     using entropy_data_processing .
```
```  1021   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
```
```  1022 qed
```
```  1023
```
```  1024 end
```