src/HOL/Probability/Information.thy
author hoelzl
Thu Sep 02 17:12:40 2010 +0200 (2010-09-02)
changeset 39092 98de40859858
parent 38656 d5d342611edb
child 39097 943c7b348524
permissions -rw-r--r--
move lemmas to correct theory files
     1 theory Information
     2 imports Probability_Space Product_Measure Convex Radon_Nikodym
     3 begin
     4 
     5 lemma real_of_pinfreal_inverse[simp]:
     6   fixes X :: pinfreal
     7   shows "real (inverse X) = 1 / real X"
     8   by (cases X) (auto simp: inverse_eq_divide)
     9 
    10 section "Convex theory"
    11 
    12 lemma log_setsum:
    13   assumes "finite s" "s \<noteq> {}"
    14   assumes "b > 1"
    15   assumes "(\<Sum> i \<in> s. a i) = 1"
    16   assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
    17   assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
    18   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    19 proof -
    20   have "convex_on {0 <..} (\<lambda> x. - log b x)"
    21     by (rule minus_log_convex[OF `b > 1`])
    22   hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
    23     using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
    24   thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
    25 qed
    26 
    27 lemma log_setsum':
    28   assumes "finite s" "s \<noteq> {}"
    29   assumes "b > 1"
    30   assumes "(\<Sum> i \<in> s. a i) = 1"
    31   assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
    32           "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
    33   shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
    34 proof -
    35   have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
    36     using assms by (auto intro!: setsum_mono_zero_cong_left)
    37   moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
    38   proof (rule log_setsum)
    39     have "setsum a (s - {i. a i = 0}) = setsum a s"
    40       using assms(1) by (rule setsum_mono_zero_cong_left) auto
    41     thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
    42       "finite (s - {i. a i = 0})" using assms by simp_all
    43 
    44     show "s - {i. a i = 0} \<noteq> {}"
    45     proof
    46       assume *: "s - {i. a i = 0} = {}"
    47       hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
    48       with sum_1 show False by simp
    49     qed
    50 
    51     fix i assume "i \<in> s - {i. a i = 0}"
    52     hence "i \<in> s" "a i \<noteq> 0" by simp_all
    53     thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
    54   qed fact+
    55   ultimately show ?thesis by simp
    56 qed
    57 
    58 lemma log_setsum_divide:
    59   assumes "finite S" and "S \<noteq> {}" and "1 < b"
    60   assumes "(\<Sum>x\<in>S. g x) = 1"
    61   assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
    62   assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
    63   shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
    64 proof -
    65   have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
    66     using `1 < b` by (subst log_le_cancel_iff) auto
    67 
    68   have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
    69   proof (unfold setsum_negf[symmetric], rule setsum_cong)
    70     fix x assume x: "x \<in> S"
    71     show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
    72     proof (cases "g x = 0")
    73       case False
    74       with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
    75       thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
    76     qed simp
    77   qed rule
    78   also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
    79   proof (rule log_setsum')
    80     fix x assume x: "x \<in> S" "0 < g x"
    81     with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
    82   qed fact+
    83   also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
    84     by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
    85         split: split_if_asm)
    86   also have "... \<le> log b (\<Sum>x\<in>S. f x)"
    87   proof (rule log_mono)
    88     have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
    89     also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
    90     proof (rule setsum_strict_mono)
    91       show "finite (S - {x. g x = 0})" using `finite S` by simp
    92       show "S - {x. g x = 0} \<noteq> {}"
    93       proof
    94         assume "S - {x. g x = 0} = {}"
    95         hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
    96         with `(\<Sum>x\<in>S. g x) = 1` show False by simp
    97       qed
    98       fix x assume "x \<in> S - {x. g x = 0}"
    99       thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
   100     qed
   101     finally show "0 < ?sum" .
   102     show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
   103       using `finite S` pos by (auto intro!: setsum_mono2)
   104   qed
   105   finally show ?thesis .
   106 qed
   107 
   108 lemma (in finite_prob_space) sum_over_space_distrib:
   109   "(\<Sum>x\<in>X`space M. distribution X {x}) = 1"
   110   unfolding distribution_def measure_space_1[symmetric] using finite_space
   111   by (subst measure_finitely_additive'')
   112      (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=\<mu>])
   113 
   114 lemma (in finite_prob_space) sum_over_space_real_distribution:
   115   "(\<Sum>x\<in>X`space M. real (distribution X {x})) = 1"
   116   unfolding distribution_def prob_space[symmetric] using finite_space
   117   by (subst real_finite_measure_finite_Union[symmetric])
   118      (auto simp add: disjoint_family_on_def sets_eq_Pow intro!: arg_cong[where f=prob])
   119 
   120 section "Information theory"
   121 
   122 definition
   123   "KL_divergence b M \<mu> \<nu> =
   124     measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
   125 
   126 locale finite_information_space = finite_prob_space +
   127   fixes b :: real assumes b_gt_1: "1 < b"
   128 
   129 lemma (in finite_prob_space) distribution_mono:
   130   assumes "\<And>t. \<lbrakk> t \<in> space M ; X t \<in> x \<rbrakk> \<Longrightarrow> Y t \<in> y"
   131   shows "distribution X x \<le> distribution Y y"
   132   unfolding distribution_def
   133   using assms by (auto simp: sets_eq_Pow intro!: measure_mono)
   134 
   135 lemma (in prob_space) distribution_remove_const:
   136   shows "joint_distribution X (\<lambda>x. ()) {(x, ())} = distribution X {x}"
   137   and "joint_distribution (\<lambda>x. ()) X {((), x)} = distribution X {x}"
   138   and "joint_distribution X (\<lambda>x. (Y x, ())) {(x, y, ())} = joint_distribution X Y {(x, y)}"
   139   and "joint_distribution X (\<lambda>x. ((), Y x)) {(x, (), y)} = joint_distribution X Y {(x, y)}"
   140   and "distribution (\<lambda>x. ()) {()} = 1"
   141   unfolding measure_space_1[symmetric]
   142   by (auto intro!: arg_cong[where f="\<mu>"] simp: distribution_def)
   143 
   144 context finite_information_space
   145 begin
   146 
   147 lemma distribution_mono_gt_0:
   148   assumes gt_0: "0 < distribution X x"
   149   assumes *: "\<And>t. \<lbrakk> t \<in> space M ; X t \<in> x \<rbrakk> \<Longrightarrow> Y t \<in> y"
   150   shows "0 < distribution Y y"
   151   by (rule less_le_trans[OF gt_0 distribution_mono]) (rule *)
   152 
   153 lemma
   154   assumes "0 \<le> A" and pos: "0 < A \<Longrightarrow> 0 < B" "0 < A \<Longrightarrow> 0 < C"
   155   shows mult_log_mult: "A * log b (B * C) = A * log b B + A * log b C" (is "?mult")
   156   and mult_log_divide: "A * log b (B / C) = A * log b B - A * log b C" (is "?div")
   157 proof -
   158   have "?mult \<and> ?div"
   159   proof (cases "A = 0")
   160     case False
   161     hence "0 < A" using `0 \<le> A` by auto
   162       with pos[OF this] show "?mult \<and> ?div" using b_gt_1
   163         by (auto simp: log_divide log_mult field_simps)
   164   qed simp
   165   thus ?mult and ?div by auto
   166 qed
   167 
   168 lemma split_pairs:
   169   shows
   170     "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
   171     "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
   172 
   173 lemma (in finite_information_space) distribution_finite:
   174   "distribution X A \<noteq> \<omega>"
   175   using measure_finite[of "X -` A \<inter> space M"]
   176   unfolding distribution_def sets_eq_Pow by auto
   177 
   178 lemma (in finite_information_space) real_distribution_gt_0[simp]:
   179   "0 < real (distribution Y y) \<longleftrightarrow>  0 < distribution Y y"
   180   using assms by (auto intro!: real_pinfreal_pos distribution_finite)
   181 
   182 lemma real_distribution_mult_pos_pos:
   183   assumes "0 < distribution Y y"
   184   and "0 < distribution X x"
   185   shows "0 < real (distribution Y y * distribution X x)"
   186   unfolding real_of_pinfreal_mult[symmetric]
   187   using assms by (auto intro!: mult_pos_pos)
   188 
   189 lemma real_distribution_divide_pos_pos:
   190   assumes "0 < distribution Y y"
   191   and "0 < distribution X x"
   192   shows "0 < real (distribution Y y / distribution X x)"
   193   unfolding divide_pinfreal_def real_of_pinfreal_mult[symmetric]
   194   using assms distribution_finite[of X x] by (cases "distribution X x") (auto intro!: mult_pos_pos)
   195 
   196 lemma real_distribution_mult_inverse_pos_pos:
   197   assumes "0 < distribution Y y"
   198   and "0 < distribution X x"
   199   shows "0 < real (distribution Y y * inverse (distribution X x))"
   200   unfolding divide_pinfreal_def real_of_pinfreal_mult[symmetric]
   201   using assms distribution_finite[of X x] by (cases "distribution X x") (auto intro!: mult_pos_pos)
   202 
   203 ML {*
   204 
   205   (* tactic to solve equations of the form @{term "W * log b (X / (Y * Z)) = W * log b X - W * log b (Y * Z)"}
   206      where @{term W} is a joint distribution of @{term X}, @{term Y}, and @{term Z}. *)
   207 
   208   val mult_log_intros = [@{thm mult_log_divide}, @{thm mult_log_mult}]
   209   val intros = [@{thm divide_pos_pos}, @{thm mult_pos_pos}, @{thm real_pinfreal_nonneg},
   210     @{thm real_distribution_divide_pos_pos},
   211     @{thm real_distribution_mult_inverse_pos_pos},
   212     @{thm real_distribution_mult_pos_pos}]
   213 
   214   val distribution_gt_0_tac = (rtac @{thm distribution_mono_gt_0}
   215     THEN' assume_tac
   216     THEN' clarsimp_tac (clasimpset_of @{context} addsimps2 @{thms split_pairs}))
   217 
   218   val distr_mult_log_eq_tac = REPEAT_ALL_NEW (CHANGED o TRY o
   219     (resolve_tac (mult_log_intros @ intros)
   220       ORELSE' distribution_gt_0_tac
   221       ORELSE' clarsimp_tac (clasimpset_of @{context})))
   222 
   223   fun instanciate_term thy redex intro =
   224     let
   225       val intro_concl = Thm.concl_of intro
   226 
   227       val lhs = intro_concl |> HOLogic.dest_Trueprop |> HOLogic.dest_eq |> fst
   228 
   229       val m = SOME (Pattern.match thy (lhs, redex) (Vartab.empty, Vartab.empty))
   230         handle Pattern.MATCH => NONE
   231 
   232     in
   233       Option.map (fn m => Envir.subst_term m intro_concl) m
   234     end
   235 
   236   fun mult_log_simproc simpset redex =
   237   let
   238     val ctxt = Simplifier.the_context simpset
   239     val thy = ProofContext.theory_of ctxt
   240     fun prove (SOME thm) = (SOME
   241           (Goal.prove ctxt [] [] thm (K (distr_mult_log_eq_tac 1))
   242            |> mk_meta_eq)
   243             handle THM _ => NONE)
   244       | prove NONE = NONE
   245   in
   246     get_first (instanciate_term thy (term_of redex) #> prove) mult_log_intros
   247   end
   248 *}
   249 
   250 simproc_setup mult_log ("real (distribution X x) * log b (A * B)" |
   251                         "real (distribution X x) * log b (A / B)") = {* K mult_log_simproc *}
   252 
   253 end
   254 
   255 lemma (in finite_measure_space) absolutely_continuousI:
   256   assumes "finite_measure_space M \<nu>"
   257   assumes v: "\<And>x. \<lbrakk> x \<in> space M ; \<mu> {x} = 0 \<rbrakk> \<Longrightarrow> \<nu> {x} = 0"
   258   shows "absolutely_continuous \<nu>"
   259 proof (unfold absolutely_continuous_def sets_eq_Pow, safe)
   260   fix N assume "\<mu> N = 0" "N \<subseteq> space M"
   261 
   262   interpret v: finite_measure_space M \<nu> by fact
   263 
   264   have "\<nu> N = \<nu> (\<Union>x\<in>N. {x})" by simp
   265   also have "\<dots> = (\<Sum>x\<in>N. \<nu> {x})"
   266   proof (rule v.measure_finitely_additive''[symmetric])
   267     show "finite N" using `N \<subseteq> space M` finite_space by (auto intro: finite_subset)
   268     show "disjoint_family_on (\<lambda>i. {i}) N" unfolding disjoint_family_on_def by auto
   269     fix x assume "x \<in> N" thus "{x} \<in> sets M" using `N \<subseteq> space M` sets_eq_Pow by auto
   270   qed
   271   also have "\<dots> = 0"
   272   proof (safe intro!: setsum_0')
   273     fix x assume "x \<in> N"
   274     hence "\<mu> {x} \<le> \<mu> N" using sets_eq_Pow `N \<subseteq> space M` by (auto intro!: measure_mono)
   275     hence "\<mu> {x} = 0" using `\<mu> N = 0` by simp
   276     thus "\<nu> {x} = 0" using v[of x] `x \<in> N` `N \<subseteq> space M` by auto
   277   qed
   278   finally show "\<nu> N = 0" .
   279 qed
   280 
   281 lemma (in finite_measure_space) KL_divergence_eq_finite:
   282   assumes v: "finite_measure_space M \<nu>"
   283   assumes ac: "\<forall>x\<in>space M. \<mu> {x} = 0 \<longrightarrow> \<nu> {x} = 0"
   284   shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
   285 proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
   286   interpret v: finite_measure_space M \<nu> by fact
   287   have ms: "measure_space M \<nu>" by fact
   288 
   289   have ac: "absolutely_continuous \<nu>"
   290     using ac by (auto intro!: absolutely_continuousI[OF v])
   291 
   292   show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
   293     using RN_deriv_finite_measure[OF ms ac]
   294     by (auto intro!: setsum_cong simp: field_simps real_of_pinfreal_mult[symmetric])
   295 qed
   296 
   297 lemma (in finite_prob_space) finite_sum_over_space_eq_1:
   298   "(\<Sum>x\<in>space M. real (\<mu> {x})) = 1"
   299   using sum_over_space_eq_1 finite_measure by (simp add: real_of_pinfreal_setsum sets_eq_Pow)
   300 
   301 lemma (in finite_prob_space) KL_divergence_positive_finite:
   302   assumes v: "finite_prob_space M \<nu>"
   303   assumes ac: "\<And>x. \<lbrakk> x \<in> space M ; \<mu> {x} = 0 \<rbrakk> \<Longrightarrow> \<nu> {x} = 0"
   304   and "1 < b"
   305   shows "0 \<le> KL_divergence b M \<nu> \<mu>"
   306 proof -
   307   interpret v: finite_prob_space M \<nu> using v .
   308 
   309   have *: "space M \<noteq> {}" using not_empty by simp
   310 
   311   hence "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   312   proof (subst KL_divergence_eq_finite)
   313     show "finite_measure_space  M \<nu>" by fact
   314 
   315     show "\<forall>x\<in>space M. \<mu> {x} = 0 \<longrightarrow> \<nu> {x} = 0" using ac by auto
   316     show "- (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x}))) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
   317     proof (safe intro!: log_setsum_divide *)
   318       show "finite (space M)" using finite_space by simp
   319       show "1 < b" by fact
   320       show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
   321 
   322       fix x assume x: "x \<in> space M"
   323       { assume "0 < real (\<nu> {x})"
   324         hence "\<mu> {x} \<noteq> 0" using ac[OF x] by auto
   325         thus "0 < prob {x}" using measure_finite[of "{x}"] sets_eq_Pow x
   326           by (cases "\<mu> {x}") simp_all }
   327     qed auto
   328   qed
   329   thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
   330 qed
   331 
   332 definition (in prob_space)
   333   "mutual_information b S T X Y =
   334     KL_divergence b (prod_measure_space S T)
   335       (joint_distribution X Y)
   336       (prod_measure S (distribution X) T (distribution Y))"
   337 
   338 abbreviation (in finite_information_space)
   339   finite_mutual_information ("\<I>'(_ ; _')") where
   340   "\<I>(X ; Y) \<equiv> mutual_information b
   341     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   342     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   343 
   344 lemma prod_measure_times_finite:
   345   assumes fms: "finite_measure_space M \<mu>" "finite_measure_space N \<nu>" and a: "a \<in> space M \<times> space N"
   346   shows "prod_measure M \<mu> N \<nu> {a} = \<mu> {fst a} * \<nu> {snd a}"
   347 proof (cases a)
   348   case (Pair b c)
   349   hence a_eq: "{a} = {b} \<times> {c}" by simp
   350 
   351   interpret M: finite_measure_space M \<mu> by fact
   352   interpret N: finite_measure_space N \<nu> by fact
   353 
   354   from finite_measure_space.finite_prod_measure_times[OF fms, of "{b}" "{c}"] M.sets_eq_Pow N.sets_eq_Pow a Pair
   355   show ?thesis unfolding a_eq by simp
   356 qed
   357 
   358 lemma setsum_cartesian_product':
   359   "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
   360   unfolding setsum_cartesian_product by simp
   361 
   362 lemma (in finite_information_space) mutual_information_generic_eq:
   363   assumes MX: "finite_measure_space MX (distribution X)"
   364   assumes MY: "finite_measure_space MY (distribution Y)"
   365   shows "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   366       real (joint_distribution X Y {(x,y)}) *
   367       log b (real (joint_distribution X Y {(x,y)}) /
   368       (real (distribution X {x}) * real (distribution Y {y}))))"
   369 proof -
   370   let ?P = "prod_measure_space MX MY"
   371   let ?\<mu> = "prod_measure MX (distribution X) MY (distribution Y)"
   372   let ?\<nu> = "joint_distribution X Y"
   373   interpret X: finite_measure_space MX "distribution X" by fact
   374   moreover interpret Y: finite_measure_space MY "distribution Y" by fact
   375   have fms: "finite_measure_space MX (distribution X)"
   376             "finite_measure_space MY (distribution Y)" by fact+
   377   have fms_P: "finite_measure_space ?P ?\<mu>"
   378     by (rule X.finite_measure_space_finite_prod_measure) fact
   379   then interpret P: finite_measure_space ?P ?\<mu> .
   380   have fms_P': "finite_measure_space ?P ?\<nu>"
   381       using finite_product_measure_space[of "space MX" "space MY"]
   382         X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
   383         X.sets_eq_Pow Y.sets_eq_Pow
   384       by (simp add: prod_measure_space_def sigma_def)
   385   then interpret P': finite_measure_space ?P ?\<nu> .
   386   { fix x assume "x \<in> space ?P"
   387     hence in_MX: "{fst x} \<in> sets MX" "{snd x} \<in> sets MY" using X.sets_eq_Pow Y.sets_eq_Pow
   388       by (auto simp: prod_measure_space_def)
   389     assume "?\<mu> {x} = 0"
   390     with X.finite_prod_measure_times[OF fms(2), of "{fst x}" "{snd x}"] in_MX
   391     have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
   392       by (simp add: prod_measure_space_def)
   393     hence "joint_distribution X Y {x} = 0"
   394       by (cases x) (auto simp: distribution_order) }
   395   note measure_0 = this
   396   show ?thesis
   397     unfolding Let_def mutual_information_def
   398     using measure_0 fms_P fms_P' MX MY P.absolutely_continuous_def
   399     by (subst P.KL_divergence_eq_finite)
   400        (auto simp add: prod_measure_space_def prod_measure_times_finite
   401          finite_prob_space_eq setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
   402 qed
   403 
   404 lemma (in finite_information_space)
   405   assumes MX: "finite_prob_space MX (distribution X)"
   406   assumes MY: "finite_prob_space MY (distribution Y)"
   407   and X_space: "X ` space M \<subseteq> space MX" and Y_space: "Y ` space M \<subseteq> space MY"
   408   shows mutual_information_eq_generic:
   409     "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
   410       real (joint_distribution X Y {(x,y)}) *
   411       log b (real (joint_distribution X Y {(x,y)}) /
   412       (real (distribution X {x}) * real (distribution Y {y}))))"
   413     (is "?equality")
   414   and mutual_information_positive_generic:
   415     "0 \<le> mutual_information b MX MY X Y" (is "?positive")
   416 proof -
   417   let ?P = "prod_measure_space MX MY"
   418   let ?\<mu> = "prod_measure MX (distribution X) MY (distribution Y)"
   419   let ?\<nu> = "joint_distribution X Y"
   420 
   421   interpret X: finite_prob_space MX "distribution X" by fact
   422   moreover interpret Y: finite_prob_space MY "distribution Y" by fact
   423   have ms_X: "measure_space MX (distribution X)"
   424     and ms_Y: "measure_space MY (distribution Y)"
   425     and fms: "finite_measure_space MX (distribution X)" "finite_measure_space MY (distribution Y)" by fact+
   426   have fms_P: "finite_measure_space ?P ?\<mu>"
   427     by (rule X.finite_measure_space_finite_prod_measure) fact
   428   then interpret P: finite_measure_space ?P ?\<mu> .
   429 
   430   have fms_P': "finite_measure_space ?P ?\<nu>"
   431       using finite_product_measure_space[of "space MX" "space MY"]
   432         X.finite_space Y.finite_space sigma_prod_sets_finite[OF X.finite_space Y.finite_space]
   433         X.sets_eq_Pow Y.sets_eq_Pow
   434       by (simp add: prod_measure_space_def sigma_def)
   435   then interpret P': finite_measure_space ?P ?\<nu> .
   436 
   437   { fix x assume "x \<in> space ?P"
   438     hence in_MX: "{fst x} \<in> sets MX" "{snd x} \<in> sets MY" using X.sets_eq_Pow Y.sets_eq_Pow
   439       by (auto simp: prod_measure_space_def)
   440 
   441     assume "?\<mu> {x} = 0"
   442     with X.finite_prod_measure_times[OF fms(2), of "{fst x}" "{snd x}"] in_MX
   443     have "distribution X {fst x} = 0 \<or> distribution Y {snd x} = 0"
   444       by (simp add: prod_measure_space_def)
   445 
   446     hence "joint_distribution X Y {x} = 0"
   447       by (cases x) (auto simp: distribution_order) }
   448   note measure_0 = this
   449 
   450   show ?equality
   451     unfolding Let_def mutual_information_def
   452     using measure_0 fms_P fms_P' MX MY P.absolutely_continuous_def
   453     by (subst P.KL_divergence_eq_finite)
   454        (auto simp add: prod_measure_space_def prod_measure_times_finite
   455          finite_prob_space_eq setsum_cartesian_product' real_of_pinfreal_mult[symmetric])
   456 
   457   show ?positive
   458     unfolding Let_def mutual_information_def using measure_0 b_gt_1
   459   proof (safe intro!: finite_prob_space.KL_divergence_positive_finite, simp_all)
   460     have "?\<mu> (space ?P) = 1"
   461       using X.top Y.top X.measure_space_1 Y.measure_space_1 fms
   462       by (simp add: prod_measure_space_def X.finite_prod_measure_times)
   463     with fms_P show "finite_prob_space ?P ?\<mu>"
   464       by (simp add: finite_prob_space_eq)
   465 
   466     from ms_X ms_Y X.top Y.top X.measure_space_1 Y.measure_space_1 Y.not_empty X_space Y_space
   467     have "?\<nu> (space ?P) = 1" unfolding measure_space_1[symmetric]
   468       by (auto intro!: arg_cong[where f="\<mu>"]
   469                simp add: prod_measure_space_def distribution_def vimage_Times comp_def)
   470     with fms_P' show "finite_prob_space ?P ?\<nu>"
   471       by (simp add: finite_prob_space_eq)
   472   qed
   473 qed
   474 
   475 lemma (in finite_information_space) mutual_information_eq:
   476   "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
   477     real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
   478                                                    (real (distribution X {x}) * real (distribution Y {y}))))"
   479   by (subst mutual_information_eq_generic) (simp_all add: finite_prob_space_of_images)
   480 
   481 lemma (in finite_information_space) mutual_information_positive: "0 \<le> \<I>(X;Y)"
   482   by (subst mutual_information_positive_generic) (simp_all add: finite_prob_space_of_images)
   483 
   484 definition (in prob_space)
   485   "entropy b s X = mutual_information b s s X X"
   486 
   487 abbreviation (in finite_information_space)
   488   finite_entropy ("\<H>'(_')") where
   489   "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
   490 
   491 lemma (in finite_information_space) joint_distribution_remove[simp]:
   492     "joint_distribution X X {(x, x)} = distribution X {x}"
   493   unfolding distribution_def by (auto intro!: arg_cong[where f="\<mu>"])
   494 
   495 lemma (in finite_information_space) entropy_eq:
   496   "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
   497 proof -
   498   { fix f
   499     { fix x y
   500       have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
   501         hence "real (distribution (\<lambda>x. (X x, X x))  {(x,y)}) * f x y =
   502             (if x = y then real (distribution X {x}) * f x y else 0)"
   503         unfolding distribution_def by auto }
   504     hence "(\<Sum>(x, y) \<in> X ` space M \<times> X ` space M. real (joint_distribution X X {(x, y)}) * f x y) =
   505       (\<Sum>x \<in> X ` space M. real (distribution X {x}) * f x x)"
   506       unfolding setsum_cartesian_product' by (simp add: setsum_cases finite_space) }
   507   note remove_cartesian_product = this
   508 
   509   show ?thesis
   510     unfolding entropy_def mutual_information_eq setsum_negf[symmetric] remove_cartesian_product
   511     by (auto intro!: setsum_cong)
   512 qed
   513 
   514 lemma (in finite_information_space) entropy_positive: "0 \<le> \<H>(X)"
   515   unfolding entropy_def using mutual_information_positive .
   516 
   517 definition (in prob_space)
   518   "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
   519     mutual_information b M1 (prod_measure_space M2 M3) X (\<lambda>x. (Y x, Z x)) -
   520     mutual_information b M1 M3 X Z"
   521 
   522 abbreviation (in finite_information_space)
   523   finite_conditional_mutual_information ("\<I>'( _ ; _ | _ ')") where
   524   "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
   525     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   526     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
   527     \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
   528     X Y Z"
   529 
   530 lemma (in finite_information_space) setsum_distribution_gen:
   531   assumes "Z -` {c} \<inter> space M = (\<Union>x \<in> X`space M. Y -` {f x}) \<inter> space M"
   532   and "inj_on f (X`space M)"
   533   shows "(\<Sum>x \<in> X`space M. distribution Y {f x}) = distribution Z {c}"
   534   unfolding distribution_def assms
   535   using finite_space assms
   536   by (subst measure_finitely_additive'')
   537      (auto simp add: disjoint_family_on_def sets_eq_Pow inj_on_def
   538       intro!: arg_cong[where f=prob])
   539 
   540 lemma (in finite_information_space) setsum_distribution:
   541   "(\<Sum>x \<in> X`space M. joint_distribution X Y {(x, y)}) = distribution Y {y}"
   542   "(\<Sum>y \<in> Y`space M. joint_distribution X Y {(x, y)}) = distribution X {x}"
   543   "(\<Sum>x \<in> X`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution Y Z {(y, z)}"
   544   "(\<Sum>y \<in> Y`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Z {(x, z)}"
   545   "(\<Sum>z \<in> Z`space M. joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)}) = joint_distribution X Y {(x, y)}"
   546   by (auto intro!: inj_onI setsum_distribution_gen)
   547 
   548 lemma (in finite_information_space) setsum_real_distribution_gen:
   549   assumes "Z -` {c} \<inter> space M = (\<Union>x \<in> X`space M. Y -` {f x}) \<inter> space M"
   550   and "inj_on f (X`space M)"
   551   shows "(\<Sum>x \<in> X`space M. real (distribution Y {f x})) = real (distribution Z {c})"
   552   unfolding distribution_def assms
   553   using finite_space assms
   554   by (subst real_finite_measure_finite_Union[symmetric])
   555      (auto simp add: disjoint_family_on_def sets_eq_Pow inj_on_def
   556         intro!: arg_cong[where f=prob])
   557 
   558 lemma (in finite_information_space) setsum_real_distribution:
   559   "(\<Sum>x \<in> X`space M. real (joint_distribution X Y {(x, y)})) = real (distribution Y {y})"
   560   "(\<Sum>y \<in> Y`space M. real (joint_distribution X Y {(x, y)})) = real (distribution X {x})"
   561   "(\<Sum>x \<in> X`space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)})) = real (joint_distribution Y Z {(y, z)})"
   562   "(\<Sum>y \<in> Y`space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)})) = real (joint_distribution X Z {(x, z)})"
   563   "(\<Sum>z \<in> Z`space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)})) = real (joint_distribution X Y {(x, y)})"
   564   by (auto intro!: inj_onI setsum_real_distribution_gen)
   565 
   566 lemma (in finite_information_space) conditional_mutual_information_eq_sum:
   567    "\<I>(X ; Y | Z) =
   568      (\<Sum>(x, y, z)\<in>X ` space M \<times> (\<lambda>x. (Y x, Z x)) ` space M.
   569              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   570              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)})/
   571         real (distribution (\<lambda>x. (Y x, Z x)) {(y, z)}))) -
   572      (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   573         real (distribution (\<lambda>x. (X x, Z x)) {(x,z)}) * log b (real (distribution (\<lambda>x. (X x, Z x)) {(x,z)}) / real (distribution Z {z})))"
   574   (is "_ = ?rhs")
   575 proof -
   576   have setsum_product:
   577     "\<And>f x. (\<Sum>v\<in>(\<lambda>x. (Y x, Z x)) ` space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x,v)}) * f v)
   578       = (\<Sum>v\<in>Y ` space M \<times> Z ` space M. real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x,v)}) * f v)"
   579   proof (safe intro!: setsum_mono_zero_cong_left imageI)
   580     fix x y z f
   581     assume *: "(Y y, Z z) \<notin> (\<lambda>x. (Y x, Z x)) ` space M" and "y \<in> space M" "z \<in> space M"
   582     hence "(\<lambda>x. (X x, Y x, Z x)) -` {(x, Y y, Z z)} \<inter> space M = {}"
   583     proof safe
   584       fix x' assume x': "x' \<in> space M" and eq: "Y x' = Y y" "Z x' = Z z"
   585       have "(Y y, Z z) \<in> (\<lambda>x. (Y x, Z x)) ` space M" using eq[symmetric] x' by auto
   586       thus "x' \<in> {}" using * by auto
   587     qed
   588     thus "real (joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, Y y, Z z)}) * f (Y y) (Z z) = 0"
   589       unfolding distribution_def by simp
   590   qed (simp add: finite_space)
   591 
   592   thus ?thesis
   593     unfolding conditional_mutual_information_def Let_def mutual_information_eq
   594     by (subst mutual_information_eq_generic)
   595        (auto simp: prod_measure_space_def sigma_prod_sets_finite finite_space sigma_def
   596         finite_prob_space_of_images finite_product_prob_space_of_images
   597         setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
   598         setsum_left_distrib[symmetric] setsum_real_distribution
   599       cong: setsum_cong)
   600 qed
   601 
   602 lemma (in finite_information_space) conditional_mutual_information_eq:
   603   "\<I>(X ; Y | Z) = (\<Sum>(x, y, z) \<in> X ` space M \<times> Y ` space M \<times> Z ` space M.
   604              real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
   605              log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
   606     (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
   607   unfolding conditional_mutual_information_def Let_def mutual_information_eq
   608   by (subst mutual_information_eq_generic)
   609      (auto simp add: prod_measure_space_def sigma_prod_sets_finite finite_space
   610       finite_prob_space_of_images finite_product_prob_space_of_images sigma_def
   611       setsum_cartesian_product' setsum_product setsum_subtractf setsum_addf
   612       setsum_left_distrib[symmetric] setsum_real_distribution setsum_commute[where A="Y`space M"]
   613       real_of_pinfreal_mult[symmetric]
   614     cong: setsum_cong)
   615 
   616 lemma (in finite_information_space) conditional_mutual_information_eq_mutual_information:
   617   "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
   618 proof -
   619   have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
   620 
   621   show ?thesis
   622     unfolding conditional_mutual_information_eq mutual_information_eq
   623     by (simp add: setsum_cartesian_product' distribution_remove_const)
   624 qed
   625 
   626 lemma (in finite_prob_space) distribution_finite:
   627   "distribution X A \<noteq> \<omega>"
   628   by (auto simp: sets_eq_Pow distribution_def intro!: measure_finite)
   629 
   630 lemma (in finite_prob_space) real_distribution_order:
   631   shows "r \<le> real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r \<le> real (distribution X {x})"
   632   and "r \<le> real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r \<le> real (distribution Y {y})"
   633   and "r < real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r < real (distribution X {x})"
   634   and "r < real (joint_distribution X Y {(x, y)}) \<Longrightarrow> r < real (distribution Y {y})"
   635   and "distribution X {x} = 0 \<Longrightarrow> real (joint_distribution X Y {(x, y)}) = 0"
   636   and "distribution Y {y} = 0 \<Longrightarrow> real (joint_distribution X Y {(x, y)}) = 0"
   637   using real_of_pinfreal_mono[OF distribution_finite joint_distribution_restriction_fst, of X Y "{(x, y)}"]
   638   using real_of_pinfreal_mono[OF distribution_finite joint_distribution_restriction_snd, of X Y "{(x, y)}"]
   639   using real_pinfreal_nonneg[of "joint_distribution X Y {(x, y)}"]
   640   by auto
   641 
   642 lemma (in finite_information_space) conditional_mutual_information_positive:
   643   "0 \<le> \<I>(X ; Y | Z)"
   644 proof -
   645   let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
   646   let "?dXZ A" = "real (joint_distribution X Z A)"
   647   let "?dYZ A" = "real (joint_distribution Y Z A)"
   648   let "?dX A" = "real (distribution X A)"
   649   let "?dZ A" = "real (distribution Z A)"
   650   let ?M = "X ` space M \<times> Y ` space M \<times> Z ` space M"
   651 
   652   have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: expand_fun_eq)
   653 
   654   have "- (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
   655     log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))
   656     \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
   657     unfolding split_beta
   658   proof (rule log_setsum_divide)
   659     show "?M \<noteq> {}" using not_empty by simp
   660     show "1 < b" using b_gt_1 .
   661 
   662     fix x assume "x \<in> ?M"
   663     let ?x = "(fst x, fst (snd x), snd (snd x))"
   664 
   665     show "0 \<le> ?dXYZ {?x}" using real_pinfreal_nonneg .
   666     show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   667      by (simp add: real_pinfreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
   668 
   669     assume *: "0 < ?dXYZ {?x}"
   670     thus "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
   671       apply (rule_tac divide_pos_pos mult_pos_pos)+
   672       by (auto simp add: real_distribution_gt_0 intro: distribution_order(6) distribution_mono_gt_0)
   673   qed (simp_all add: setsum_cartesian_product' sum_over_space_real_distribution setsum_real_distribution finite_space)
   674   also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>Z`space M. ?dZ {z})"
   675     apply (simp add: setsum_cartesian_product')
   676     apply (subst setsum_commute)
   677     apply (subst (2) setsum_commute)
   678     by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric] setsum_real_distribution
   679           intro!: setsum_cong)
   680   finally show ?thesis
   681     unfolding conditional_mutual_information_eq sum_over_space_real_distribution
   682     by (simp add: real_of_pinfreal_mult[symmetric])
   683 qed
   684 
   685 definition (in prob_space)
   686   "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
   687 
   688 abbreviation (in finite_information_space)
   689   finite_conditional_entropy ("\<H>'(_ | _')") where
   690   "\<H>(X | Y) \<equiv> conditional_entropy b
   691     \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
   692     \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
   693 
   694 lemma (in finite_information_space) conditional_entropy_positive:
   695   "0 \<le> \<H>(X | Y)" unfolding conditional_entropy_def using conditional_mutual_information_positive .
   696 
   697 lemma (in finite_information_space) conditional_entropy_eq:
   698   "\<H>(X | Z) =
   699      - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
   700          real (joint_distribution X Z {(x, z)}) *
   701          log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
   702 proof -
   703   have *: "\<And>x y z. (\<lambda>x. (X x, X x, Z x)) -` {(x, y, z)} = (if x = y then (\<lambda>x. (X x, Z x)) -` {(x, z)} else {})" by auto
   704   show ?thesis
   705     unfolding conditional_mutual_information_eq_sum
   706       conditional_entropy_def distribution_def *
   707     by (auto intro!: setsum_0')
   708 qed
   709 
   710 lemma (in finite_information_space) mutual_information_eq_entropy_conditional_entropy:
   711   "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
   712   unfolding mutual_information_eq entropy_eq conditional_entropy_eq
   713   using finite_space
   714   by (auto simp add: setsum_addf setsum_subtractf setsum_cartesian_product'
   715       setsum_left_distrib[symmetric] setsum_addf setsum_real_distribution)
   716 
   717 lemma (in finite_information_space) conditional_entropy_less_eq_entropy:
   718   "\<H>(X | Z) \<le> \<H>(X)"
   719 proof -
   720   have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy .
   721   with mutual_information_positive[of X Z] entropy_positive[of X]
   722   show ?thesis by auto
   723 qed
   724 
   725 (* -------------Entropy of a RV with a certain event is zero---------------- *)
   726 
   727 lemma (in finite_information_space) finite_entropy_certainty_eq_0:
   728   assumes "x \<in> X ` space M" and "distribution X {x} = 1"
   729   shows "\<H>(X) = 0"
   730 proof -
   731   interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
   732     by (rule finite_prob_space_of_images)
   733 
   734   have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
   735     using X.measure_compl[of "{x}"] assms by auto
   736   also have "\<dots> = 0" using X.prob_space assms by auto
   737   finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
   738 
   739   { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
   740     hence "{y} \<subseteq> X ` space M - {x}" by auto
   741     from X.measure_mono[OF this] X0 asm
   742     have "distribution X {y} = 0" by auto }
   743 
   744   hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
   745     using assms by auto
   746 
   747   have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
   748 
   749   show ?thesis unfolding entropy_eq by (auto simp: y fi)
   750 qed
   751 (* --------------- upper bound on entropy for a rv ------------------------- *)
   752 
   753 lemma (in finite_prob_space) distribution_1:
   754   "distribution X A \<le> 1"
   755   unfolding distribution_def measure_space_1[symmetric]
   756   by (auto intro!: measure_mono simp: sets_eq_Pow)
   757 
   758 lemma (in finite_prob_space) real_distribution_1:
   759   "real (distribution X A) \<le> 1"
   760   unfolding real_pinfreal_1[symmetric]
   761   by (rule real_of_pinfreal_mono[OF _ distribution_1]) simp
   762 
   763 lemma (in finite_information_space) finite_entropy_le_card:
   764   "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
   765 proof -
   766   let "?d x" = "distribution X {x}"
   767   let "?p x" = "real (?d x)"
   768   have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
   769     by (auto intro!: setsum_cong simp: entropy_eq setsum_negf[symmetric])
   770   also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
   771     apply (rule log_setsum')
   772     using not_empty b_gt_1 finite_space sum_over_space_real_distribution
   773     by auto
   774   also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
   775     apply (rule arg_cong[where f="\<lambda>f. log b (\<Sum>x\<in>X`space M. f x)"])
   776     using distribution_finite[of X] by (auto simp: expand_fun_eq real_of_pinfreal_eq_0)
   777   finally show ?thesis
   778     using finite_space by (auto simp: setsum_cases real_eq_of_nat)
   779 qed
   780 
   781 (* --------------- entropy is maximal for a uniform rv --------------------- *)
   782 
   783 lemma (in finite_prob_space) uniform_prob:
   784   assumes "x \<in> space M"
   785   assumes "\<And> x y. \<lbrakk>x \<in> space M ; y \<in> space M\<rbrakk> \<Longrightarrow> prob {x} = prob {y}"
   786   shows "prob {x} = 1 / real (card (space M))"
   787 proof -
   788   have prob_x: "\<And> y. y \<in> space M \<Longrightarrow> prob {y} = prob {x}"
   789     using assms(2)[OF _ `x \<in> space M`] by blast
   790   have "1 = prob (space M)"
   791     using prob_space by auto
   792   also have "\<dots> = (\<Sum> x \<in> space M. prob {x})"
   793     using real_finite_measure_finite_Union[of "space M" "\<lambda> x. {x}", simplified]
   794       sets_eq_Pow inj_singleton[unfolded inj_on_def, rule_format]
   795       finite_space unfolding disjoint_family_on_def  prob_space[symmetric]
   796     by (auto simp add:setsum_restrict_set)
   797   also have "\<dots> = (\<Sum> y \<in> space M. prob {x})"
   798     using prob_x by auto
   799   also have "\<dots> = real_of_nat (card (space M)) * prob {x}" by simp
   800   finally have one: "1 = real (card (space M)) * prob {x}"
   801     using real_eq_of_nat by auto
   802   hence two: "real (card (space M)) \<noteq> 0" by fastsimp 
   803   from one have three: "prob {x} \<noteq> 0" by fastsimp
   804   thus ?thesis using one two three divide_cancel_right
   805     by (auto simp:field_simps)
   806 qed
   807 
   808 lemma (in finite_information_space) finite_entropy_uniform_max:
   809   assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
   810   shows "\<H>(X) = log b (real (card (X ` space M)))"
   811 proof -
   812   note uniform =
   813     finite_prob_space_of_images[of X, THEN finite_prob_space.uniform_prob, simplified]
   814 
   815   have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
   816     using finite_space not_empty by auto
   817 
   818   { fix x assume "x \<in> X ` space M"
   819     hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
   820     proof (rule uniform)
   821       fix x y assume "x \<in> X`space M" "y \<in> X`space M"
   822       from assms[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
   823     qed }
   824   thus ?thesis
   825     using not_empty finite_space b_gt_1 card_gt0
   826     by (simp add: entropy_eq real_eq_of_nat[symmetric] log_divide)
   827 qed
   828 
   829 definition "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
   830 
   831 lemma subvimageI:
   832   assumes "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   833   shows "subvimage A f g"
   834   using assms unfolding subvimage_def by blast
   835 
   836 lemma subvimageE[consumes 1]:
   837   assumes "subvimage A f g"
   838   obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   839   using assms unfolding subvimage_def by blast
   840 
   841 lemma subvimageD:
   842   "\<lbrakk> subvimage A f g ; x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
   843   using assms unfolding subvimage_def by blast
   844 
   845 lemma subvimage_subset:
   846   "\<lbrakk> subvimage B f g ; A \<subseteq> B \<rbrakk> \<Longrightarrow> subvimage A f g"
   847   unfolding subvimage_def by auto
   848 
   849 lemma subvimage_idem[intro]: "subvimage A g g"
   850   by (safe intro!: subvimageI)
   851 
   852 lemma subvimage_comp_finer[intro]:
   853   assumes svi: "subvimage A g h"
   854   shows "subvimage A g (f \<circ> h)"
   855 proof (rule subvimageI, simp)
   856   fix x y assume "x \<in> A" "y \<in> A" "g x = g y"
   857   from svi[THEN subvimageD, OF this]
   858   show "f (h x) = f (h y)" by simp
   859 qed
   860 
   861 lemma subvimage_comp_gran:
   862   assumes svi: "subvimage A g h"
   863   assumes inj: "inj_on f (g ` A)"
   864   shows "subvimage A (f \<circ> g) h"
   865   by (rule subvimageI) (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
   866 
   867 lemma subvimage_comp:
   868   assumes svi: "subvimage (f ` A) g h"
   869   shows "subvimage A (g \<circ> f) (h \<circ> f)"
   870   by (rule subvimageI) (auto intro!: svi[THEN subvimageD])
   871 
   872 lemma subvimage_trans:
   873   assumes fg: "subvimage A f g"
   874   assumes gh: "subvimage A g h"
   875   shows "subvimage A f h"
   876   by (rule subvimageI) (auto intro!: fg[THEN subvimageD] gh[THEN subvimageD])
   877 
   878 lemma subvimage_translator:
   879   assumes svi: "subvimage A f g"
   880   shows "\<exists>h. \<forall>x \<in> A. h (f x)  = g x"
   881 proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
   882   fix x assume "x \<in> A"
   883   show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
   884     by (rule theI2[of _ "g x"])
   885       (insert `x \<in> A`, auto intro!: svi[THEN subvimageD])
   886 qed
   887 
   888 lemma subvimage_translator_image:
   889   assumes svi: "subvimage A f g"
   890   shows "\<exists>h. h ` f ` A = g ` A"
   891 proof -
   892   from subvimage_translator[OF svi]
   893   obtain h where "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
   894   thus ?thesis
   895     by (auto intro!: exI[of _ h]
   896       simp: image_compose[symmetric] comp_def cong: image_cong)
   897 qed
   898 
   899 lemma subvimage_finite:
   900   assumes svi: "subvimage A f g" and fin: "finite (f`A)"
   901   shows "finite (g`A)"
   902 proof -
   903   from subvimage_translator_image[OF svi]
   904   obtain h where "g`A = h`f`A" by fastsimp
   905   with fin show "finite (g`A)" by simp
   906 qed
   907 
   908 lemma subvimage_disj:
   909   assumes svi: "subvimage A f g"
   910   shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
   911       f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
   912 proof (rule disjCI)
   913   assume "\<not> ?dist"
   914   then obtain z where "z \<in> A" and "x = f z" and "y = g z" by auto
   915   thus "?sub" using svi unfolding subvimage_def by auto
   916 qed
   917 
   918 lemma setsum_image_split:
   919   assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
   920   shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
   921     (is "?lhs = ?rhs")
   922 proof -
   923   have "f ` A =
   924       snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
   925       (is "_ = snd ` ?SIGMA")
   926     unfolding image_split_eq_Sigma[symmetric]
   927     by (simp add: image_compose[symmetric] comp_def)
   928   moreover
   929   have snd_inj: "inj_on snd ?SIGMA"
   930     unfolding image_split_eq_Sigma[symmetric]
   931     by (auto intro!: inj_onI subvimageD[OF svi])
   932   ultimately
   933   have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
   934     by (auto simp: setsum_reindex intro: setsum_cong)
   935   also have "... = ?rhs"
   936     using subvimage_finite[OF svi fin] fin
   937     apply (subst setsum_Sigma[symmetric])
   938     by (auto intro!: finite_subset[of _ "f`A"])
   939   finally show ?thesis .
   940 qed
   941 
   942 lemma (in finite_information_space) entropy_partition:
   943   assumes svi: "subvimage (space M) X P"
   944   shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
   945 proof -
   946   have "(\<Sum>x\<in>X ` space M. real (distribution X {x}) * log b (real (distribution X {x}))) =
   947     (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M.
   948     real (joint_distribution X P {(x, y)}) * log b (real (joint_distribution X P {(x, y)})))"
   949   proof (subst setsum_image_split[OF svi],
   950       safe intro!: finite_imageI finite_space setsum_mono_zero_cong_left imageI)
   951     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
   952     assume "real (joint_distribution X P {(X x, P p)}) * log b (real (joint_distribution X P {(X x, P p)})) \<noteq> 0"
   953     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
   954     with svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
   955     show "x \<in> P -` {P p}" by auto
   956   next
   957     fix p x assume in_space: "p \<in> space M" "x \<in> space M"
   958     assume "P x = P p"
   959     from this[symmetric] svi[unfolded subvimage_def, rule_format, OF `x \<in> space M`]
   960     have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
   961       by auto
   962     hence "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
   963       by auto
   964     thus "real (distribution X {X x}) * log b (real (distribution X {X x})) =
   965           real (joint_distribution X P {(X x, P p)}) *
   966           log b (real (joint_distribution X P {(X x, P p)}))"
   967       by (auto simp: distribution_def)
   968   qed
   969   thus ?thesis
   970   unfolding entropy_eq conditional_entropy_eq
   971     by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
   972       setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
   973 qed
   974 
   975 corollary (in finite_information_space) entropy_data_processing:
   976   "\<H>(f \<circ> X) \<le> \<H>(X)"
   977   by (subst (2) entropy_partition[of _ "f \<circ> X"]) (auto intro: conditional_entropy_positive)
   978 
   979 lemma (in prob_space) distribution_cong:
   980   assumes "\<And>x. x \<in> space M \<Longrightarrow> X x = Y x"
   981   shows "distribution X = distribution Y"
   982   unfolding distribution_def expand_fun_eq
   983   using assms by (auto intro!: arg_cong[where f="\<mu>"])
   984 
   985 lemma (in prob_space) joint_distribution_cong:
   986   assumes "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   987   assumes "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   988   shows "joint_distribution X Y = joint_distribution X' Y'"
   989   unfolding distribution_def expand_fun_eq
   990   using assms by (auto intro!: arg_cong[where f="\<mu>"])
   991 
   992 lemma image_cong:
   993   "\<lbrakk> \<And>x. x \<in> S \<Longrightarrow> X x = X' x \<rbrakk> \<Longrightarrow> X ` S = X' ` S"
   994   by (auto intro!: image_eqI)
   995 
   996 lemma (in finite_information_space) mutual_information_cong:
   997   assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
   998   assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
   999   shows "\<I>(X ; Y) = \<I>(X' ; Y')"
  1000 proof -
  1001   have "X ` space M = X' ` space M" using X by (rule image_cong)
  1002   moreover have "Y ` space M = Y' ` space M" using Y by (rule image_cong)
  1003   ultimately show ?thesis
  1004   unfolding mutual_information_eq
  1005     using
  1006       assms[THEN distribution_cong]
  1007       joint_distribution_cong[OF assms]
  1008     by (auto intro!: setsum_cong)
  1009 qed
  1010 
  1011 corollary (in finite_information_space) entropy_of_inj:
  1012   assumes "inj_on f (X`space M)"
  1013   shows "\<H>(f \<circ> X) = \<H>(X)"
  1014 proof (rule antisym)
  1015   show "\<H>(f \<circ> X) \<le> \<H>(X)" using entropy_data_processing .
  1016 next
  1017   have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
  1018     by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF assms])
  1019   also have "... \<le> \<H>(f \<circ> X)"
  1020     using entropy_data_processing .
  1021   finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
  1022 qed
  1023 
  1024 end