src/HOL/Probability/Information.thy
author wenzelm
Wed Dec 29 17:34:41 2010 +0100 (2010-12-29)
changeset 41413 64cd30d6b0b8
parent 41095 c335d880ff82
child 41661 baf1964bc468
permissions -rw-r--r--
explicit file specifications -- avoid secondary load path;
hoelzl@36080
     1
theory Information
wenzelm@41413
     2
imports
wenzelm@41413
     3
  Probability_Space
wenzelm@41413
     4
  "~~/src/HOL/Library/Convex"
wenzelm@41413
     5
  Lebesgue_Measure
hoelzl@36080
     6
begin
hoelzl@36080
     7
hoelzl@39097
     8
(* Monotonicity of the logarithm as stated: from 1 < a, 0 < x and x <= y
   conclude log a x <= log a y. *)
lemma log_le: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log a x \<le> log a y"
hoelzl@39097
     9
  (* Rewrite the goal with log_le_cancel_iff and let auto close what remains. *)
  by (subst log_le_cancel_iff) auto
hoelzl@39097
    10
hoelzl@39097
    11
(* Strict version of log_le: from 1 < a, 0 < x and x < y conclude
   log a x < log a y. *)
lemma log_less: "1 < a \<Longrightarrow> 0 < x \<Longrightarrow> x < y \<Longrightarrow> log a x < log a y"
hoelzl@39097
    12
  (* Rewrite the goal with log_less_cancel_iff and let auto close what remains. *)
  by (subst log_less_cancel_iff) auto
hoelzl@39097
    13
hoelzl@39097
    14
(* A sum over the product set A x B equals the iterated sum: the outer sum
   ranges over A and the inner setsum ranges over B, with f applied to the
   pair (x, y). *)
lemma setsum_cartesian_product':
hoelzl@39097
    15
  "(\<Sum>x\<in>A \<times> B. f x) = (\<Sum>x\<in>A. setsum (\<lambda>y. f (x, y)) B)"
hoelzl@39097
    16
  (* Obtained by unfolding the existing fact setsum_cartesian_product and simplifying. *)
  unfolding setsum_cartesian_product by simp
hoelzl@39097
    17
hoelzl@36624
    18
section "Convex theory"
hoelzl@36080
    19
hoelzl@36624
    20
(* Jensen-style inequality for the logarithm, as stated below: for a finite
   non-empty index set s, a base b > 1, weights a i >= 0 that sum to 1 and
   points y i in {0 <..}, the weighted sum of the logarithms is at most the
   logarithm of the weighted sum. *)
lemma log_setsum:
hoelzl@36624
    21
  assumes "finite s" "s \<noteq> {}"
hoelzl@36624
    22
  assumes "b > 1"
hoelzl@36624
    23
  assumes "(\<Sum> i \<in> s. a i) = 1"
hoelzl@36624
    24
  assumes "\<And> i. i \<in> s \<Longrightarrow> a i \<ge> 0"
hoelzl@36624
    25
  assumes "\<And> i. i \<in> s \<Longrightarrow> y i \<in> {0 <..}"
hoelzl@36624
    26
  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
hoelzl@36624
    27
proof -
hoelzl@36624
    28
  (* Step 1: the negated logarithm is convex on {0 <..}; taken from
     minus_log_convex instantiated with the assumption b > 1. *)
  have "convex_on {0 <..} (\<lambda> x. - log b x)"
hoelzl@36624
    29
    by (rule minus_log_convex[OF `b > 1`])
hoelzl@36624
    30
  (* Step 2: feed that convexity fact to convex_on_setsum, together with the
     lemma assumptions and pos_is_convex (presumably convexity of the set
     {0 <..} - confirm against the Convex library). *)
  hence "- log b (\<Sum> i \<in> s. a i * y i) \<le> (\<Sum> i \<in> s. a i * - log b (y i))"
hoelzl@36624
    31
    using convex_on_setsum[of _ _ "\<lambda> x. - log b x"] assms pos_is_convex by fastsimp
hoelzl@36624
    32
  (* Step 3: pull the minus signs out of the sum and flip the inequality. *)
  thus ?thesis by (auto simp add:setsum_negf le_imp_neg_le)
hoelzl@36624
    33
qed
hoelzl@36080
    34
hoelzl@36624
    35
(* Variant of log_setsum with a weaker positivity hypothesis: y i only has
   to be positive for those indices i in s whose weight a i is positive.
   The conclusion is the same inequality as in log_setsum. *)
lemma log_setsum':
hoelzl@36624
    36
  assumes "finite s" "s \<noteq> {}"
hoelzl@36624
    37
  assumes "b > 1"
hoelzl@36624
    38
  assumes "(\<Sum> i \<in> s. a i) = 1"
hoelzl@36624
    39
  assumes pos: "\<And> i. i \<in> s \<Longrightarrow> 0 \<le> a i"
hoelzl@36624
    40
          "\<And> i. \<lbrakk> i \<in> s ; 0 < a i \<rbrakk> \<Longrightarrow> 0 < y i"
hoelzl@36624
    41
  shows "log b (\<Sum> i \<in> s. a i * y i) \<ge> (\<Sum> i \<in> s. a i * log b (y i))"
hoelzl@36080
    42
proof -
hoelzl@36624
    43
  (* Removing the indices with a i = 0 does not change a sum whose summands
     all carry the factor a i. Stated for an arbitrary y so that it can be
     reused for both sides of the goal. *)
  have "\<And>y. (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) = (\<Sum> i \<in> s. a i * y i)"
hoelzl@36624
    44
    using assms by (auto intro!: setsum_mono_zero_cong_left)
hoelzl@36624
    45
  (* On the restricted index set the hypotheses of log_setsum can be shown. *)
  moreover have "log b (\<Sum> i \<in> s - {i. a i = 0}. a i * y i) \<ge> (\<Sum> i \<in> s - {i. a i = 0}. a i * log b (y i))"
hoelzl@36624
    46
  proof (rule log_setsum)
hoelzl@36624
    47
    (* The weights still sum to 1 after dropping the zero weights. *)
    have "setsum a (s - {i. a i = 0}) = setsum a s"
hoelzl@36624
    48
      using assms(1) by (rule setsum_mono_zero_cong_left) auto
hoelzl@36624
    49
    thus sum_1: "setsum a (s - {i. a i = 0}) = 1"
hoelzl@36624
    50
      "finite (s - {i. a i = 0})" using assms by simp_all
hoelzl@36624
    51
hoelzl@36624
    52
    (* The restricted set is non-empty: otherwise the sum of the weights
       would be 0, contradicting sum_1. *)
    show "s - {i. a i = 0} \<noteq> {}"
hoelzl@36624
    53
    proof
hoelzl@36624
    54
      assume *: "s - {i. a i = 0} = {}"
hoelzl@36624
    55
      hence "setsum a (s - {i. a i = 0}) = 0" by (simp add: * setsum_empty)
hoelzl@36624
    56
      with sum_1 show False by simp
hoelzl@38656
    57
    qed
hoelzl@36624
    58
hoelzl@36624
    59
    (* Pointwise hypotheses: every remaining index has a non-zero,
       non-negative weight, so the assumption pos yields 0 < y i. *)
    fix i assume "i \<in> s - {i. a i = 0}"
hoelzl@36624
    60
    hence "i \<in> s" "a i \<noteq> 0" by simp_all
hoelzl@36624
    61
    thus "0 \<le> a i" "y i \<in> {0<..}" using pos[of i] by auto
hoelzl@36624
    62
  qed fact+
hoelzl@36624
    63
  (* Rewrite both restricted sums back to sums over s. *)
  ultimately show ?thesis by simp
hoelzl@36080
    64
qed
hoelzl@36080
    65
hoelzl@36624
    66
(* Inequality used later for the non-negativity of the KL divergence on
   finite spaces. For a finite non-empty S, base b > 1, non-negative g and f
   on S, g summing to 1, and f x positive wherever g x is positive:
     - (SUM x in S. g x * log b (g x / f x)) <= log b (SUM x in S. f x). *)
lemma log_setsum_divide:
hoelzl@36624
    67
  assumes "finite S" and "S \<noteq> {}" and "1 < b"
hoelzl@36624
    68
  assumes "(\<Sum>x\<in>S. g x) = 1"
hoelzl@36624
    69
  assumes pos: "\<And>x. x \<in> S \<Longrightarrow> g x \<ge> 0" "\<And>x. x \<in> S \<Longrightarrow> f x \<ge> 0"
hoelzl@36624
    70
  assumes g_pos: "\<And>x. \<lbrakk> x \<in> S ; 0 < g x \<rbrakk> \<Longrightarrow> 0 < f x"
hoelzl@36624
    71
  shows "- (\<Sum>x\<in>S. g x * log b (g x / f x)) \<le> log b (\<Sum>x\<in>S. f x)"
hoelzl@36624
    72
proof -
hoelzl@36624
    73
  (* Local monotonicity fact for log with the fixed base b. *)
  have log_mono: "\<And>x y. 0 < x \<Longrightarrow> x \<le> y \<Longrightarrow> log b x \<le> log b y"
hoelzl@36624
    74
    using `1 < b` by (subst log_le_cancel_iff) auto
hoelzl@36080
    75
hoelzl@36624
    76
  (* Step 1: absorb the leading minus sign by inverting the quotient inside
     the logarithm, summand by summand. The case g x = 0 is closed by simp. *)
  have "- (\<Sum>x\<in>S. g x * log b (g x / f x)) = (\<Sum>x\<in>S. g x * log b (f x / g x))"
hoelzl@36624
    77
  proof (unfold setsum_negf[symmetric], rule setsum_cong)
hoelzl@36624
    78
    fix x assume x: "x \<in> S"
hoelzl@36624
    79
    show "- (g x * log b (g x / f x)) = g x * log b (f x / g x)"
hoelzl@36624
    80
    proof (cases "g x = 0")
hoelzl@36624
    81
      case False
hoelzl@36624
    82
      with pos[OF x] g_pos[OF x] have "0 < f x" "0 < g x" by simp_all
hoelzl@36624
    83
      thus ?thesis using `1 < b` by (simp add: log_divide field_simps)
hoelzl@36624
    84
    qed simp
hoelzl@36624
    85
  qed rule
hoelzl@36624
    86
  (* Step 2: apply the primed log_setsum lemma with weights g and points
     f x / g x. *)
  also have "... \<le> log b (\<Sum>x\<in>S. g x * (f x / g x))"
hoelzl@36624
    87
  proof (rule log_setsum')
hoelzl@36624
    88
    fix x assume x: "x \<in> S" "0 < g x"
hoelzl@36624
    89
    with g_pos[OF x] show "0 < f x / g x" by (safe intro!: divide_pos_pos)
hoelzl@36624
    90
  qed fact+
hoelzl@36624
    91
  (* Step 3: cancel g x where it is non-zero; the summands with g x = 0
     vanish, so the sum ranges over S without those points. *)
  also have "... = log b (\<Sum>x\<in>S - {x. g x = 0}. f x)" using `finite S`
hoelzl@36624
    92
    by (auto intro!: setsum_mono_zero_cong_right arg_cong[where f="log b"]
hoelzl@36624
    93
        split: split_if_asm)
hoelzl@36624
    94
  (* Step 4: enlarge the index set back to S using log_mono; this needs the
     restricted sum to be strictly positive and bounded by the full sum. *)
  also have "... \<le> log b (\<Sum>x\<in>S. f x)"
hoelzl@36624
    95
  proof (rule log_mono)
hoelzl@36624
    96
    have "0 = (\<Sum>x\<in>S - {x. g x = 0}. 0)" by simp
hoelzl@36624
    97
    also have "... < (\<Sum>x\<in>S - {x. g x = 0}. f x)" (is "_ < ?sum")
hoelzl@36624
    98
    proof (rule setsum_strict_mono)
hoelzl@36624
    99
      show "finite (S - {x. g x = 0})" using `finite S` by simp
hoelzl@36624
   100
      (* Non-emptiness: if g vanished on all of S its sum could not be 1. *)
      show "S - {x. g x = 0} \<noteq> {}"
hoelzl@36624
   101
      proof
hoelzl@36624
   102
        assume "S - {x. g x = 0} = {}"
hoelzl@36624
   103
        hence "(\<Sum>x\<in>S. g x) = 0" by (subst setsum_0') auto
hoelzl@36624
   104
        with `(\<Sum>x\<in>S. g x) = 1` show False by simp
hoelzl@36624
   105
      qed
hoelzl@36624
   106
      fix x assume "x \<in> S - {x. g x = 0}"
hoelzl@36624
   107
      thus "0 < f x" using g_pos[of x] pos(1)[of x] by auto
hoelzl@36624
   108
    qed
hoelzl@36624
   109
    finally show "0 < ?sum" .
hoelzl@36624
   110
    show "(\<Sum>x\<in>S - {x. g x = 0}. f x) \<le> (\<Sum>x\<in>S. f x)"
hoelzl@36624
   111
      using `finite S` pos by (auto intro!: setsum_mono2)
hoelzl@36080
   112
  qed
hoelzl@36624
   113
  (* Chain the calculation steps into the stated inequality. *)
  finally show ?thesis .
hoelzl@36080
   114
qed
hoelzl@36080
   115
hoelzl@39097
   116
(* Two rewrite rules that replace an equation between a pair (A, B) and a
   value X, in either orientation, by equations on fst X and snd X. *)
lemma split_pairs:
hoelzl@40859
   117
  "((A, B) = X) \<longleftrightarrow> (fst X = A \<and> snd X = B)" and
hoelzl@40859
   118
  "(X = (A, B)) \<longleftrightarrow> (fst X = A \<and> snd X = B)" by auto
hoelzl@38656
   119
hoelzl@38656
   120
section "Information theory"
hoelzl@38656
   121
hoelzl@40859
   122
(* Locale extending prob_space with a fixed real number b and the assumption
   1 < b (named b_gt_1). Within this locale b is used as the base of log. *)
locale information_space = prob_space +
hoelzl@38656
   123
  fixes b :: real assumes b_gt_1: "1 < b"
hoelzl@38656
   124
hoelzl@40859
   125
(* Facts about log with the locale base b. They are stated as unconditional
   equations with an if-then-else on the right-hand side and are bundled
   as log_simps at the end of the context. *)
context information_space
hoelzl@38656
   126
begin
hoelzl@38656
   127
hoelzl@40859
   128
text {* Introduce some simplification rules for logarithm of base @{term b}. *}
hoelzl@40859
   129
hoelzl@40859
   130
(* For every x <= 0 the term log b x equals log b 0. *)
lemma log_neg_const:
hoelzl@40859
   131
  assumes "x \<le> 0"
hoelzl@40859
   132
  shows "log b x = log b 0"
hoelzl@36624
   133
proof -
hoelzl@40859
   134
  (* No value exp u can be equal to x, because x <= 0 < exp u. *)
  { fix u :: real
hoelzl@40859
   135
    have "x \<le> 0" by fact
hoelzl@40859
   136
    also have "0 < exp u"
hoelzl@40859
   137
      using exp_gt_zero .
hoelzl@40859
   138
    finally have "exp u \<noteq> x"
hoelzl@40859
   139
      by auto }
hoelzl@40859
   140
  (* With that fact the goal follows by unfolding log_def and ln_def. *)
  then show "log b x = log b 0"
hoelzl@40859
   141
    by (simp add: log_def ln_def)
hoelzl@38656
   142
qed
hoelzl@38656
   143
hoelzl@40859
   144
(* log of a product: if the product is positive it splits into the sum of
   the logs of the absolute values, otherwise it is log b 0. *)
lemma log_mult_eq:
hoelzl@40859
   145
  "log b (A * B) = (if 0 < A * B then log b \<bar>A\<bar> + log b \<bar>B\<bar> else log b 0)"
hoelzl@40859
   146
  using log_mult[of b "\<bar>A\<bar>" "\<bar>B\<bar>"] b_gt_1 log_neg_const[of "A * B"]
hoelzl@40859
   147
  by (auto simp: zero_less_mult_iff mult_le_0_iff)
hoelzl@38656
   148
hoelzl@40859
   149
(* log of an inverse: negated log for positive B, otherwise log b 0. *)
lemma log_inverse_eq:
hoelzl@40859
   150
  "log b (inverse B) = (if 0 < B then - log b B else log b 0)"
hoelzl@40859
   151
  using log_inverse[of b B] log_neg_const[of "inverse B"] b_gt_1 by simp
hoelzl@36080
   152
hoelzl@40859
   153
(* log of a quotient: note that the condition is on the sign of A * B.
   Proved by unfolding division into multiplication by the inverse and
   reusing the two rules above. *)
lemma log_divide_eq:
hoelzl@40859
   154
  "log b (A / B) = (if 0 < A * B then log b \<bar>A\<bar> - log b \<bar>B\<bar> else log b 0)"
hoelzl@40859
   155
  unfolding divide_inverse log_mult_eq log_inverse_eq abs_inverse
hoelzl@40859
   156
  by (auto simp: zero_less_mult_iff mult_le_0_iff)
hoelzl@38656
   157
hoelzl@40859
   158
(* Collection of the three rewrite rules above under one name. *)
lemmas log_simps = log_mult_eq log_inverse_eq log_divide_eq
hoelzl@38656
   159
hoelzl@38656
   160
end
hoelzl@38656
   161
hoelzl@39097
   162
subsection "Kullback$-$Leibler divergence"
hoelzl@36080
   163
hoelzl@39097
   164
text {* The Kullback$-$Leibler divergence is also known as relative entropy or
hoelzl@39097
   165
Kullback$-$Leibler distance. *}
hoelzl@39097
   166
hoelzl@39097
   167
(* KL_divergence b M mu nu is defined as the integral over M with respect to
   the third argument mu of the function
     x |-> log b (real (RN_deriv M nu mu x)),
   where RN_deriv is the constant from the sigma_finite_measure locale
   applied to M, nu and mu in that order. *)
definition
hoelzl@39097
   168
  "KL_divergence b M \<mu> \<nu> =
hoelzl@39097
   169
    measure_space.integral M \<mu> (\<lambda>x. log b (real (sigma_finite_measure.RN_deriv M \<nu> \<mu> x)))"
hoelzl@38656
   170
hoelzl@40859
   171
(* Congruence rule: KL_divergence only depends on the values the two
   measures take on sets M. If the primed measures agree with the unprimed
   ones on every A in sets M, the divergences are equal. *)
lemma (in sigma_finite_measure) KL_divergence_cong:
hoelzl@40859
   172
  assumes "measure_space M \<nu>"
hoelzl@40859
   173
  and cong: "\<And>A. A \<in> sets M \<Longrightarrow> \<mu>' A = \<mu> A" "\<And>A. A \<in> sets M \<Longrightarrow> \<nu>' A = \<nu> A"
hoelzl@40859
   174
  shows "KL_divergence b M \<nu>' \<mu>' = KL_divergence b M \<nu> \<mu>"
hoelzl@40859
   175
proof -
hoelzl@40859
   176
  (* Make the measure_space facts for nu available under the prefix nu. *)
  interpret \<nu>: measure_space M \<nu> by fact
hoelzl@40859
   177
  (* Unfold the definition, then use the congruence facts for RN_deriv and
     for the integral. *)
  show ?thesis
hoelzl@40859
   178
    unfolding KL_divergence_def
hoelzl@40859
   179
    using RN_deriv_cong[OF cong, of "\<lambda>A. A"]
hoelzl@40859
   180
    by (simp add: cong \<nu>.integral_cong_measure[OF cong(2)])
hoelzl@40859
   181
qed
hoelzl@40859
   182
hoelzl@40859
   183
(* Transport along a bijection: if f is a bijection between S and space M,
   nu is a measure on M and absolutely continuous, then the KL divergence
   computed on vimage_algebra S f with both measures precomposed with the
   image under f equals the KL divergence on M. *)
lemma (in sigma_finite_measure) KL_divergence_vimage:
hoelzl@40859
   184
  assumes f: "bij_betw f S (space M)"
hoelzl@40859
   185
  assumes \<nu>: "measure_space M \<nu>" "absolutely_continuous \<nu>"
hoelzl@40859
   186
  shows "KL_divergence b (vimage_algebra S f) (\<lambda>A. \<nu> (f ` A)) (\<lambda>A. \<mu> (f ` A)) = KL_divergence b M \<nu> \<mu>"
hoelzl@40859
   187
    (is "KL_divergence b ?M ?\<nu> ?\<mu> = _")
hoelzl@40859
   188
proof -
hoelzl@40859
   189
  interpret \<nu>: measure_space M \<nu> by fact
hoelzl@40859
   190
  (* The transported nu is again a measure on the vimage algebra. *)
  interpret v: measure_space ?M ?\<nu>
hoelzl@40859
   191
    using f by (rule \<nu>.measure_space_isomorphic)
hoelzl@40859
   192
hoelzl@40859
   193
  let ?RN = "sigma_finite_measure.RN_deriv ?M ?\<mu> ?\<nu>"
hoelzl@41095
   194
  (* Almost everywhere with respect to nu, the derivative on the transported
     space composed with the inverse of f equals the derivative on M. *)
  from RN_deriv_vimage[OF f[THEN bij_inv_the_inv_into] \<nu>]
hoelzl@40859
   195
  have *: "\<nu>.almost_everywhere (\<lambda>x. ?RN (the_inv_into S f x) = RN_deriv \<nu> x)"
hoelzl@40859
   196
    by (rule absolutely_continuous_AE[OF \<nu>])
hoelzl@40859
   197
hoelzl@40859
   198
  (* Unfold the definition, move the integral back to M, and conclude with
     the almost-everywhere congruence of the integral using fact * above. *)
  show ?thesis
hoelzl@40859
   199
    unfolding KL_divergence_def \<nu>.integral_vimage_inv[OF f]
hoelzl@40859
   200
    apply (rule \<nu>.integral_cong_AE)
hoelzl@40859
   201
    apply (rule \<nu>.AE_mp[OF *])
hoelzl@40859
   202
    apply (rule \<nu>.AE_cong)
hoelzl@40859
   203
    apply simp
hoelzl@40859
   204
    done
hoelzl@40859
   205
qed
hoelzl@40859
   206
hoelzl@38656
   207
(* On a finite measure space the KL divergence is a finite sum over the
   points of space M: each summand is real (nu {x}) times the log of the
   quotient real (nu {x}) / real (mu {x}). Requires nu to be a finite measure
   on M and absolutely continuous. *)
lemma (in finite_measure_space) KL_divergence_eq_finite:
hoelzl@38656
   208
  assumes v: "finite_measure_space M \<nu>"
hoelzl@40859
   209
  assumes ac: "absolutely_continuous \<nu>"
hoelzl@38656
   210
  shows "KL_divergence b M \<nu> \<mu> = (\<Sum>x\<in>space M. real (\<nu> {x}) * log b (real (\<nu> {x}) / real (\<mu> {x})))" (is "_ = ?sum")
hoelzl@38656
   211
(* The initial simp step unfolds the definition and turns the integral into a
   sum over singletons via integral_finite_singleton. *)
proof (simp add: KL_divergence_def finite_measure_space.integral_finite_singleton[OF v])
hoelzl@38656
   212
  interpret v: finite_measure_space M \<nu> by fact
hoelzl@38656
   213
  have ms: "measure_space M \<nu>" by fact
hoelzl@38656
   214
  (* Remaining goal: rewrite RN_deriv at each point using
     RN_deriv_finite_measure and rearrange the factors. *)
  show "(\<Sum>x \<in> space M. log b (real (RN_deriv \<nu> x)) * real (\<nu> {x})) = ?sum"
hoelzl@38656
   215
    using RN_deriv_finite_measure[OF ms ac]
hoelzl@41023
   216
    by (auto intro!: setsum_cong simp: field_simps real_of_pextreal_mult[symmetric])
hoelzl@38656
   217
qed
hoelzl@36080
   218
hoelzl@38656
   219
(* Non-negativity of the KL divergence between two finite probability
   measures on M, for an absolutely continuous nu and a base b > 1. *)
lemma (in finite_prob_space) KL_divergence_positive_finite:
hoelzl@38656
   220
  assumes v: "finite_prob_space M \<nu>"
hoelzl@40859
   221
  assumes ac: "absolutely_continuous \<nu>"
hoelzl@38656
   222
  and "1 < b"
hoelzl@38656
   223
  shows "0 \<le> KL_divergence b M \<nu> \<mu>"
hoelzl@38656
   224
proof -
hoelzl@38656
   225
  interpret v: finite_prob_space M \<nu> using v .
hoelzl@40859
   226
  have ms: "finite_measure_space M \<nu>" by default
hoelzl@38656
   227
hoelzl@40859
   228
  (* Main step: rewrite the divergence as a finite sum and bound its negation
     with log_setsum_divide, where g is the point mass of nu and f the point
     mass of mu. *)
  have "- (KL_divergence b M \<nu> \<mu>) \<le> log b (\<Sum>x\<in>space M. real (\<mu> {x}))"
hoelzl@40859
   229
  proof (subst KL_divergence_eq_finite[OF ms ac], safe intro!: log_setsum_divide not_empty)
hoelzl@40859
   230
    show "finite (space M)" using finite_space by simp
hoelzl@40859
   231
    show "1 < b" by fact
hoelzl@40859
   232
    show "(\<Sum>x\<in>space M. real (\<nu> {x})) = 1" using v.finite_sum_over_space_eq_1 by simp
hoelzl@38656
   233
hoelzl@40859
   234
    fix x assume "x \<in> space M"
hoelzl@40859
   235
    then have x: "{x} \<in> sets M" unfolding sets_eq_Pow by auto
hoelzl@40859
   236
    (* Where nu has positive point mass, absolute continuity forces mu to
       have non-zero, hence positive, point mass as well. *)
    { assume "0 < real (\<nu> {x})"
hoelzl@40859
   237
      then have "\<nu> {x} \<noteq> 0" by auto
hoelzl@40859
   238
      then have "\<mu> {x} \<noteq> 0"
hoelzl@40859
   239
        using ac[unfolded absolutely_continuous_def, THEN bspec, of "{x}"] x by auto
hoelzl@40859
   240
      thus "0 < prob {x}" using finite_measure[of "{x}"] x by auto }
hoelzl@40859
   241
  qed auto
hoelzl@38656
   242
  (* The point masses of mu sum to 1, so the right-hand side is log b 1. *)
  thus "0 \<le> KL_divergence b M \<nu> \<mu>" using finite_sum_over_space_eq_1 by simp
hoelzl@36080
   243
qed
hoelzl@36080
   244
hoelzl@39097
   245
subsection {* Mutual Information *}
hoelzl@39097
   246
hoelzl@36080
   247
(* mutual_information b S T X Y is defined as the KL_divergence, on the
   sigma algebra generated by pair_algebra S T, applied to
   joint_distribution X Y and to the pair measure built from
   distribution X on S and distribution Y on T. *)
definition (in prob_space)
hoelzl@38656
   248
  "mutual_information b S T X Y =
hoelzl@40859
   249
    KL_divergence b (sigma (pair_algebra S T))
hoelzl@38656
   250
      (joint_distribution X Y)
hoelzl@40859
   251
      (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y))"
hoelzl@36080
   252
hoelzl@40859
   253
(* entropy b s X is defined as the mutual information of X with itself,
   using the same algebra s for both components. *)
definition (in prob_space)
hoelzl@40859
   254
  "entropy b s X = mutual_information b s s X X"
hoelzl@40859
   255
hoelzl@40859
   256
(* Notation for mutual_information with the locale base b, where each
   variable is paired with the algebra whose space is its image of space M
   and whose sets are the full power set of that image. *)
abbreviation (in information_space)
hoelzl@40859
   257
  mutual_information_Pow ("\<I>'(_ ; _')") where
hoelzl@36624
   258
  "\<I>(X ; Y) \<equiv> mutual_information b
hoelzl@36080
   259
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
hoelzl@36080
   260
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
hoelzl@36080
   261
hoelzl@40859
   262
(* Symmetry of mutual information for random variables X into S and Y into T,
   under the explicit assumption ac that the joint distribution is absolutely
   continuous with respect to the pair measure of the two distributions. *)
lemma (in information_space) mutual_information_commute_generic:
hoelzl@40859
   263
  assumes X: "random_variable S X" and Y: "random_variable T Y"
hoelzl@40859
   264
  assumes ac: "measure_space.absolutely_continuous (sigma (pair_algebra S T))
hoelzl@40859
   265
   (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
hoelzl@40859
   266
  shows "mutual_information b S T X Y = mutual_information b T S Y X"
hoelzl@39092
   267
proof -
hoelzl@40859
   268
  (* Set up probability-space interpretations for both joint distributions,
     both marginal distributions, and both orders of the pair measure. *)
  interpret P: prob_space "sigma (pair_algebra S T)" "joint_distribution X Y"
hoelzl@40859
   269
    using random_variable_pairI[OF X Y] by (rule distribution_prob_space)
hoelzl@40859
   270
  interpret Q: prob_space "sigma (pair_algebra T S)" "joint_distribution Y X"
hoelzl@40859
   271
    using random_variable_pairI[OF Y X] by (rule distribution_prob_space)
hoelzl@40859
   272
  interpret X: prob_space S "distribution X" using X by (rule distribution_prob_space)
hoelzl@40859
   273
  interpret Y: prob_space T "distribution Y" using Y by (rule distribution_prob_space)
hoelzl@40859
   274
  interpret ST: pair_sigma_finite S "distribution X" T "distribution Y" by default
hoelzl@40859
   275
  interpret TS: pair_sigma_finite T "distribution Y" S "distribution X" by default
hoelzl@40859
   276
hoelzl@40859
   277
  have ST: "measure_space (sigma (pair_algebra S T)) (joint_distribution X Y)" by default
hoelzl@40859
   278
  have TS: "measure_space (sigma (pair_algebra T S)) (joint_distribution Y X)" by default
hoelzl@40859
   279
hoelzl@40859
   280
  (* Swapping the components of a pair is a bijection between the two
     product spaces, in both directions. *)
  have bij_ST: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra S T))) (space (sigma (pair_algebra T S)))"
hoelzl@40859
   281
    by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
hoelzl@40859
   282
  have bij_TS: "bij_betw (\<lambda>(x, y). (y, x)) (space (sigma (pair_algebra T S))) (space (sigma (pair_algebra S T)))"
hoelzl@40859
   283
    by (auto intro!: inj_onI simp: space_pair_algebra bij_betw_def)
hoelzl@40859
   284
hoelzl@40859
   285
  (* The joint distribution commutes with the swap of components. *)
  { fix A
hoelzl@40859
   286
    have "joint_distribution X Y ((\<lambda>(x, y). (y, x)) ` A) = joint_distribution Y X A"
hoelzl@40859
   287
      unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
hoelzl@40859
   288
  note jd_commute = this
hoelzl@40859
   289
hoelzl@40859
   290
  (* The pair measure commutes with the swap on measurable sets; shown by
     unfolding both pair measures and using TS.Fubini. *)
  { fix A assume A: "A \<in> sets (sigma (pair_algebra T S))"
hoelzl@41023
   291
    have *: "\<And>x y. indicator ((\<lambda>(x, y). (y, x)) ` A) (x, y) = (indicator A (y, x) :: pextreal)"
hoelzl@40859
   292
      unfolding indicator_def by auto
hoelzl@40859
   293
    have "ST.pair_measure ((\<lambda>(x, y). (y, x)) ` A) = TS.pair_measure A"
hoelzl@40859
   294
      unfolding ST.pair_measure_def TS.pair_measure_def
hoelzl@40859
   295
      using A by (auto simp add: TS.Fubini[symmetric] *) }
hoelzl@40859
   296
  note pair_measure_commute = this
hoelzl@40859
   297
hoelzl@39092
   298
  (* Transport the divergence along the swap with KL_divergence_vimage, then
     rewrite with the two commute facts and the congruence rule. *)
  show ?thesis
hoelzl@40859
   299
    unfolding mutual_information_def
hoelzl@40859
   300
    unfolding ST.KL_divergence_vimage[OF bij_TS ST ac, symmetric]
hoelzl@40859
   301
    unfolding space_sigma space_pair_algebra jd_commute
hoelzl@40859
   302
    unfolding ST.pair_sigma_algebra_swap[symmetric]
hoelzl@40859
   303
    by (simp cong: TS.KL_divergence_cong[OF TS] add: pair_measure_commute)
hoelzl@39092
   304
qed
hoelzl@39092
   305
hoelzl@40859
   306
(* For finite random variables X and Y, the joint distribution is absolutely
   continuous with respect to the pair measure of the two distributions on
   the sigma algebra generated by pair_algebra S T. *)
lemma (in prob_space) finite_variables_absolutely_continuous:
hoelzl@40859
   307
  assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
hoelzl@40859
   308
  shows "measure_space.absolutely_continuous (sigma (pair_algebra S T))
hoelzl@40859
   309
   (pair_sigma_finite.pair_measure S (distribution X) T (distribution Y)) (joint_distribution X Y)"
hoelzl@40859
   310
proof -
hoelzl@40859
   311
  interpret X: finite_prob_space S "distribution X" using X by (rule distribution_finite_prob_space)
hoelzl@40859
   312
  interpret Y: finite_prob_space T "distribution Y" using Y by (rule distribution_finite_prob_space)
hoelzl@40859
   313
  interpret XY: pair_finite_prob_space S "distribution X" T "distribution Y" by default
hoelzl@40859
   314
  interpret P: finite_prob_space XY.P "joint_distribution X Y"
hoelzl@40859
   315
    using assms by (intro joint_distribution_finite_prob_space)
hoelzl@40859
   316
  (* By XY.absolutely_continuousI it is enough to look at singletons. *)
  show "XY.absolutely_continuous (joint_distribution X Y)"
hoelzl@40859
   317
  proof (rule XY.absolutely_continuousI)
hoelzl@40859
   318
    show "finite_measure_space XY.P (joint_distribution X Y)" by default
hoelzl@40859
   319
    (* A point x of pair measure 0 splits as (a, b) with the product of the
       two marginal point masses equal to 0. *)
    fix x assume "x \<in> space XY.P" and "XY.pair_measure {x} = 0"
hoelzl@40859
   320
    then obtain a b where "(a, b) = x" and "a \<in> space S" "b \<in> space T"
hoelzl@40859
   321
      and distr: "distribution X {a} * distribution Y {b} = 0"
hoelzl@40859
   322
      by (cases x) (auto simp: pair_algebra_def)
hoelzl@40859
   323
    (* The joint point mass is bounded by each marginal point mass, and one
       of the marginals is 0 by distr. *)
    with assms[THEN finite_random_variableD]
hoelzl@40859
   324
      joint_distribution_Times_le_fst[of S X T Y "{a}" "{b}"]
hoelzl@40859
   325
      joint_distribution_Times_le_snd[of S X T Y "{a}" "{b}"]
hoelzl@40859
   326
    have "joint_distribution X Y {x} \<le> distribution Y {b}"
hoelzl@40859
   327
         "joint_distribution X Y {x} \<le> distribution X {a}"
hoelzl@40859
   328
      by auto
hoelzl@40859
   329
    with distr show "joint_distribution X Y {x} = 0" by auto
hoelzl@40859
   330
  qed
hoelzl@40859
   331
qed
hoelzl@40859
   332
hoelzl@40859
   333
(* Symmetry of mutual information for finite random variables. The absolute
   continuity hypothesis of the generic lemma is discharged by
   finite_variables_absolutely_continuous. *)
lemma (in information_space) mutual_information_commute:
hoelzl@40859
   334
  assumes X: "finite_random_variable S X" and Y: "finite_random_variable T Y"
hoelzl@40859
   335
  shows "mutual_information b S T X Y = mutual_information b T S Y X"
hoelzl@40859
   336
  by (intro finite_random_variableD X Y mutual_information_commute_generic finite_variables_absolutely_continuous)
hoelzl@40859
   337
hoelzl@40859
   338
(* Symmetry of mutual information in the power-set notation, for simple
   functions X and Y. Reduces to mutual_information_commute via
   simple_function_imp_finite_random_variable. *)
lemma (in information_space) mutual_information_commute_simple:
hoelzl@40859
   339
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   340
  shows "\<I>(X;Y) = \<I>(Y;X)"
hoelzl@40859
   341
  by (intro X Y simple_function_imp_finite_random_variable mutual_information_commute)
hoelzl@40859
   342
hoelzl@40859
   343
(* Two results for finite random variables X and Y, proved together:
   - mutual_information_generic_eq: the mutual information is a finite sum
     over space MX x space MY of the joint point mass times the log of the
     joint point mass divided by the product of the marginal point masses;
   - mutual_information_positive_generic: the mutual information is
     non-negative. *)
lemma (in information_space)
hoelzl@40859
   344
  assumes MX: "finite_random_variable MX X"
hoelzl@40859
   345
  assumes MY: "finite_random_variable MY Y"
hoelzl@40859
   346
  shows mutual_information_generic_eq:
hoelzl@36624
   347
    "mutual_information b MX MY X Y = (\<Sum> (x,y) \<in> space MX \<times> space MY.
hoelzl@38656
   348
      real (joint_distribution X Y {(x,y)}) *
hoelzl@38656
   349
      log b (real (joint_distribution X Y {(x,y)}) /
hoelzl@38656
   350
      (real (distribution X {x}) * real (distribution Y {y}))))"
hoelzl@40859
   351
    (is ?sum)
hoelzl@36624
   352
  and mutual_information_positive_generic:
hoelzl@40859
   353
     "0 \<le> mutual_information b MX MY X Y" (is ?positive)
hoelzl@36624
   354
proof -
hoelzl@40859
   355
  (* Interpret the marginals, their product, and the joint distribution as
     finite probability spaces. *)
  interpret X: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
hoelzl@40859
   356
  interpret Y: finite_prob_space MY "distribution Y" using MY by (rule distribution_finite_prob_space)
hoelzl@40859
   357
  interpret XY: pair_finite_prob_space MX "distribution X" MY "distribution Y" by default
hoelzl@40859
   358
  interpret P: finite_prob_space XY.P "joint_distribution X Y"
hoelzl@40859
   359
    using assms by (intro joint_distribution_finite_prob_space)
hoelzl@36080
   360
hoelzl@40859
   361
  have P_ms: "finite_measure_space XY.P (joint_distribution X Y)" by default
hoelzl@40859
   362
  have P_ps: "finite_prob_space XY.P (joint_distribution X Y)" by default
hoelzl@36624
   363
hoelzl@40859
   364
  (* Sum formula: instance of KL_divergence_eq_finite on the product space. *)
  show ?sum
hoelzl@38656
   365
    unfolding Let_def mutual_information_def
hoelzl@40859
   366
    by (subst XY.KL_divergence_eq_finite[OF P_ms finite_variables_absolutely_continuous[OF MX MY]])
hoelzl@41023
   367
       (auto simp add: pair_algebra_def setsum_cartesian_product' real_of_pextreal_mult[symmetric])
hoelzl@36080
   368
hoelzl@36624
   369
  (* Non-negativity: instance of KL_divergence_positive_finite with b_gt_1. *)
  show ?positive
hoelzl@40859
   370
    using XY.KL_divergence_positive_finite[OF P_ps finite_variables_absolutely_continuous[OF MX MY] b_gt_1]
hoelzl@40859
   371
    unfolding mutual_information_def .
hoelzl@36080
   372
qed
hoelzl@36080
   373
hoelzl@40859
   374
(* Sum formula for mutual information in the power-set notation, for simple
   functions X and Y; the sum ranges over the product of the two images of
   space M. Obtained from mutual_information_generic_eq by simp. *)
lemma (in information_space) mutual_information_eq:
hoelzl@40859
   375
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   376
  shows "\<I>(X;Y) = (\<Sum> (x,y) \<in> X ` space M \<times> Y ` space M.
hoelzl@38656
   377
    real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) * log b (real (distribution (\<lambda>x. (X x, Y x)) {(x,y)}) /
hoelzl@38656
   378
                                                   (real (distribution X {x}) * real (distribution Y {y}))))"
hoelzl@40859
   379
  using assms by (simp add: mutual_information_generic_eq)
hoelzl@36080
   380
hoelzl@40859
   381
(* Congruence: mutual information with fixed algebras MX and MY is unchanged
   when X and Y are replaced by functions that agree with them on space M. *)
lemma (in information_space) mutual_information_generic_cong:
hoelzl@39097
   382
  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
hoelzl@39097
   383
  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
hoelzl@40859
   384
  shows "mutual_information b MX MY X Y = mutual_information b MX MY X' Y'"
hoelzl@40859
   385
  unfolding mutual_information_def using X Y
hoelzl@40859
   386
  by (simp cong: distribution_cong)
hoelzl@39097
   387
hoelzl@40859
   388
(* Congruence in the power-set notation. Compared with the generic version,
   the algebras themselves mention the images of X and Y, so image_cong is
   used in addition to distribution_cong. *)
lemma (in information_space) mutual_information_cong:
hoelzl@40859
   389
  assumes X: "\<And>x. x \<in> space M \<Longrightarrow> X x = X' x"
hoelzl@40859
   390
  assumes Y: "\<And>x. x \<in> space M \<Longrightarrow> Y x = Y' x"
hoelzl@40859
   391
  shows "\<I>(X; Y) = \<I>(X'; Y')"
hoelzl@40859
   392
  unfolding mutual_information_def using X Y
hoelzl@40859
   393
  by (simp cong: distribution_cong image_cong)
hoelzl@40859
   394
hoelzl@40859
   395
(* Non-negativity of mutual information in the power-set notation, for
   simple functions; obtained from mutual_information_positive_generic. *)
lemma (in information_space) mutual_information_positive:
hoelzl@40859
   396
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   397
  shows "0 \<le> \<I>(X;Y)"
hoelzl@40859
   398
  using assms by (simp add: mutual_information_positive_generic)
hoelzl@36080
   399
hoelzl@39097
   400
subsection {* Entropy *}
hoelzl@39097
   401
hoelzl@40859
   402
(* Notation for entropy with the locale base b, taken over the algebra whose
   space is the image of space M under X and whose sets are the full power
   set of that image. *)
abbreviation (in information_space)
hoelzl@40859
   403
  entropy_Pow ("\<H>'(_')") where
hoelzl@36624
   404
  "\<H>(X) \<equiv> entropy b \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr> X"
hoelzl@36080
   405
hoelzl@40859
   406
(* Sum formula for entropy of a finite random variable X: the negated sum,
   over the points of space MX, of the point mass times its log. *)
lemma (in information_space) entropy_generic_eq:
hoelzl@40859
   407
  assumes MX: "finite_random_variable MX X"
hoelzl@39097
   408
  shows "entropy b MX X = -(\<Sum> x \<in> space MX. real (distribution X {x}) * log b (real (distribution X {x})))"
hoelzl@39097
   409
proof -
hoelzl@40859
   410
  interpret MX: finite_prob_space MX "distribution X" using MX by (rule distribution_finite_prob_space)
hoelzl@39097
   411
  (* Abbreviations for the marginal and the joint point masses. *)
  let "?X x" = "real (distribution X {x})"
hoelzl@39097
   412
  let "?XX x y" = "real (joint_distribution X X {(x, y)})"
hoelzl@39097
   413
  (* The preimage of (x, y) under the pairing of X with itself is empty off
     the diagonal, so each summand of the mutual information either vanishes
     or reduces to the entropy summand. *)
  { fix x y
hoelzl@39097
   414
    have "(\<lambda>x. (X x, X x)) -` {(x, y)} = (if x = y then X -` {x} else {})" by auto
hoelzl@39097
   415
    then have "?XX x y * log b (?XX x y / (?X x * ?X y)) =
hoelzl@39097
   416
        (if x = y then - ?X y * log b (?X y) else 0)"
hoelzl@40859
   417
      unfolding distribution_def by (auto simp: log_simps zero_less_mult_iff) }
hoelzl@39097
   418
  note remove_XX = this
hoelzl@39097
   419
  (* Unfold entropy to the mutual information sum, rewrite every summand with
     remove_XX, and collapse the double sum with setsum_cases. *)
  show ?thesis
hoelzl@39097
   420
    unfolding entropy_def mutual_information_generic_eq[OF MX MX]
hoelzl@39097
   421
    unfolding setsum_cartesian_product[symmetric] setsum_negf[symmetric] remove_XX
hoelzl@39097
   422
    by (auto simp: setsum_cases MX.finite_space)
hoelzl@39097
   423
qed
hoelzl@36624
   424
hoelzl@40859
   425
(* Sum formula for entropy in the power-set notation, for a simple function
   X; the sum ranges over the image of space M under X. Obtained from
   entropy_generic_eq by simp. *)
lemma (in information_space) entropy_eq:
hoelzl@40859
   426
  assumes "simple_function X"
hoelzl@40859
   427
  shows "\<H>(X) = -(\<Sum> x \<in> X ` space M. real (distribution X {x}) * log b (real (distribution X {x})))"
hoelzl@40859
   428
  using assms by (simp add: entropy_generic_eq)
hoelzl@36080
   429
hoelzl@40859
   430
(* Non-negativity of entropy for a simple function X. Entropy unfolds to the
   mutual information of X with itself, so mutual_information_positive
   applies. *)
lemma (in information_space) entropy_positive:
hoelzl@40859
   431
  "simple_function X \<Longrightarrow> 0 \<le> \<H>(X)"
hoelzl@40859
   432
  unfolding entropy_def by (simp add: mutual_information_positive)
hoelzl@36080
   433
hoelzl@40859
   434
(* If some value x in the image of a simple function X has point mass 1
   under distribution X, then the entropy of X is 0. *)
lemma (in information_space) entropy_certainty_eq_0:
hoelzl@40859
   435
  assumes "simple_function X" and "x \<in> X ` space M" and "distribution X {x} = 1"
hoelzl@39097
   436
  shows "\<H>(X) = 0"
hoelzl@39097
   437
proof -
hoelzl@39097
   438
  interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
hoelzl@40859
   439
    using simple_function_imp_finite_random_variable[OF `simple_function X`]
hoelzl@40859
   440
    by (rule distribution_finite_prob_space)
hoelzl@39097
   441
  (* The complement of {x} inside the image has measure 0. *)
  have "distribution X (X ` space M - {x}) = distribution X (X ` space M) - distribution X {x}"
hoelzl@39097
   442
    using X.measure_compl[of "{x}"] assms by auto
hoelzl@39097
   443
  also have "\<dots> = 0" using X.prob_space assms by auto
hoelzl@39097
   444
  finally have X0: "distribution X (X ` space M - {x}) = 0" by auto
hoelzl@39097
   445
  (* Hence every other point y of the image has point mass 0, by
     monotonicity of the measure. *)
  { fix y assume asm: "y \<noteq> x" "y \<in> X ` space M"
hoelzl@39097
   446
    hence "{y} \<subseteq> X ` space M - {x}" by auto
hoelzl@39097
   447
    from X.measure_mono[OF this] X0 asm
hoelzl@39097
   448
    have "distribution X {y} = 0" by auto }
hoelzl@39097
   449
  (* Closed form of the point masses on the image: 1 at x, 0 elsewhere. *)
  hence fi: "\<And> y. y \<in> X ` space M \<Longrightarrow> real (distribution X {y}) = (if x = y then 1 else 0)"
hoelzl@39097
   450
    using assms by auto
hoelzl@39097
   451
  (* Each summand of the entropy formula is then 0. *)
  have y: "\<And>y. (if x = y then 1 else 0) * log b (if x = y then 1 else 0) = 0" by simp
hoelzl@40859
   452
  show ?thesis unfolding entropy_eq[OF `simple_function X`] by (auto simp: y fi)
hoelzl@39097
   453
qed
hoelzl@39097
   454
hoelzl@40859
   455
(* Upper bound on entropy for a simple function X: at most log b of the
   number of image values whose point mass under distribution X is non-zero. *)
lemma (in information_space) entropy_le_card_not_0:
hoelzl@40859
   456
  assumes "simple_function X"
hoelzl@40859
   457
  shows "\<H>(X) \<le> log b (real (card (X ` space M \<inter> {x . distribution X {x} \<noteq> 0})))"
hoelzl@39097
   458
proof -
hoelzl@39097
   459
  (* ?d is the point mass, ?p the same value converted with real. *)
  let "?d x" = "distribution X {x}"
hoelzl@39097
   460
  let "?p x" = "real (?d x)"
hoelzl@39097
   461
  (* Step 1: rewrite the entropy formula so that the minus sign is absorbed
     into the logarithm as log b (1 / ?p x). *)
  have "\<H>(X) = (\<Sum>x\<in>X`space M. ?p x * log b (1 / ?p x))"
hoelzl@40859
   462
    by (auto intro!: setsum_cong simp: entropy_eq[OF `simple_function X`] setsum_negf[symmetric] log_simps not_less)
hoelzl@39097
   463
  (* Step 2: apply the primed log_setsum lemma with weights ?p x and points
     1 / ?p x. *)
  also have "\<dots> \<le> log b (\<Sum>x\<in>X`space M. ?p x * (1 / ?p x))"
hoelzl@39097
   464
    apply (rule log_setsum')
hoelzl@40859
   465
    using not_empty b_gt_1 `simple_function X` sum_over_space_real_distribution
hoelzl@40859
   466
    by (auto simp: simple_function_def)
hoelzl@39097
   467
  (* Step 3: each product ?p x * (1 / ?p x) is 1 when the point mass is
     non-zero and 0 otherwise. *)
  also have "\<dots> = log b (\<Sum>x\<in>X`space M. if ?d x \<noteq> 0 then 1 else 0)"
hoelzl@40859
   468
    using distribution_finite[OF `simple_function X`[THEN simple_function_imp_random_variable], simplified]
hoelzl@41023
   469
    by (intro arg_cong[where f="\<lambda>X. log b X"] setsum_cong) (auto simp: real_of_pextreal_eq_0)
hoelzl@39097
   470
  (* Step 4: the sum of these indicators is the cardinality in the goal. *)
  finally show ?thesis
hoelzl@40859
   471
    using `simple_function X` by (auto simp: setsum_cases real_eq_of_nat simple_function_def)
hoelzl@39097
   472
qed
hoelzl@39097
   473
hoelzl@40859
   474
(* If all points in the image of a simple function X have the same point
   mass under distribution X, then the entropy of X equals log b of the
   cardinality of the image. *)
lemma (in information_space) entropy_uniform_max:
hoelzl@40859
   475
  assumes "simple_function X"
hoelzl@39097
   476
  assumes "\<And>x y. \<lbrakk> x \<in> X ` space M ; y \<in> X ` space M \<rbrakk> \<Longrightarrow> distribution X {x} = distribution X {y}"
hoelzl@39097
   477
  shows "\<H>(X) = log b (real (card (X ` space M)))"
hoelzl@39097
   478
proof -
hoelzl@40859
   479
  interpret X: finite_prob_space "\<lparr> space = X ` space M, sets = Pow (X ` space M) \<rparr>" "distribution X"
hoelzl@40859
   480
    using simple_function_imp_finite_random_variable[OF `simple_function X`]
hoelzl@40859
   481
    by (rule distribution_finite_prob_space)
hoelzl@39097
   482
  (* The image is finite and non-empty, so its cardinality is positive. *)
  have card_gt0: "0 < card (X ` space M)" unfolding card_gt_0_iff
hoelzl@40859
   483
    using `simple_function X` not_empty by (auto simp: simple_function_def)
hoelzl@39097
   484
  (* Under the uniformity assumption every point mass is 1 / card, by
     X.uniform_prob. *)
  { fix x assume "x \<in> X ` space M"
hoelzl@39097
   485
    hence "real (distribution X {x}) = 1 / real (card (X ` space M))"
hoelzl@40859
   486
    proof (rule X.uniform_prob[simplified])
hoelzl@39097
   487
      fix x y assume "x \<in> X`space M" "y \<in> X`space M"
hoelzl@40859
   488
      from assms(2)[OF this] show "real (distribution X {x}) = real (distribution X {y})" by simp
hoelzl@39097
   489
    qed }
hoelzl@39097
   490
  (* Substitute the uniform point mass into the entropy sum formula and
     simplify with log_simps. *)
  thus ?thesis
hoelzl@40859
   491
    using not_empty X.finite_space b_gt_1 card_gt0
hoelzl@40859
   492
    by (simp add: entropy_eq[OF `simple_function X`] real_eq_of_nat[symmetric] log_simps)
hoelzl@39097
   493
qed
hoelzl@39097
   494
hoelzl@40859
   495
(* Entropy is bounded by the logarithm of the size of the image:
   \<H>(X) \<le> log b (card (X ` space M)) for a simple function X.
   The proof splits on whether the set of image points with non-zero
   distribution is empty.  If it is empty, every point has distribution 0 and
   the claim is discharged with entropy_eq and log b 1 \<le> log b (card ...).
   Otherwise entropy_le_card_not_0 is combined with monotonicity of log
   (lemma log_le) along the subset inclusion. *)
lemma (in information_space) entropy_le_card:
hoelzl@40859
   496
  assumes "simple_function X"
hoelzl@40859
   497
  shows "\<H>(X) \<le> log b (real (card (X ` space M)))"
hoelzl@39097
   498
proof cases
hoelzl@39097
   499
  assume "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} = {}"
hoelzl@39097
   500
  then have "\<And>x. x\<in>X`space M \<Longrightarrow> distribution X {x} = 0" by auto
hoelzl@39097
   501
  moreover
hoelzl@39097
   502
  have "0 < card (X`space M)"
hoelzl@40859
   503
    using `simple_function X` not_empty
hoelzl@40859
   504
    by (auto simp: card_gt_0_iff simple_function_def)
hoelzl@39097
   505
  then have "log b 1 \<le> log b (real (card (X`space M)))"
hoelzl@39097
   506
    using b_gt_1 by (intro log_le) auto
hoelzl@40859
   507
  ultimately show ?thesis using assms by (simp add: entropy_eq)
hoelzl@39097
   508
next
hoelzl@39097
   509
  (* Note: "False" below is only the name given to this case assumption (the
     negation of the first case); it is not the logical constant False. *)
  assume False: "X ` space M \<inter> {x. distribution X {x} \<noteq> 0} \<noteq> {}"
hoelzl@39097
   510
  have "card (X ` space M \<inter> {x. distribution X {x} \<noteq> 0}) \<le> card (X ` space M)"
hoelzl@40859
   511
    (is "?A \<le> ?B") using assms not_empty by (auto intro!: card_mono simp: simple_function_def)
hoelzl@40859
   512
  note entropy_le_card_not_0[OF assms]
hoelzl@39097
   513
  also have "log b (real ?A) \<le> log b (real ?B)"
hoelzl@40859
   514
    using b_gt_1 False not_empty `?A \<le> ?B` assms
hoelzl@40859
   515
    by (auto intro!: log_le simp: card_gt_0_iff simp: simple_function_def)
hoelzl@39097
   516
  finally show ?thesis .
hoelzl@39097
   517
qed
hoelzl@39097
   518
hoelzl@40859
   519
(* Joint entropy is symmetric: the entropy of the pair (X, Y) equals the
   entropy of the pair (Y, X) for simple functions X and Y.
   The proof expresses the image of (Y, X) as the image of (X, Y) under the
   component swap, reindexes the entropy sum along this swap (which is
   injective on every set), and uses joint_distribution_commute. *)
lemma (in information_space) entropy_commute:
hoelzl@40859
   520
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   521
  shows "\<H>(\<lambda>x. (X x, Y x)) = \<H>(\<lambda>x. (Y x, X x))"
hoelzl@39097
   522
proof -
hoelzl@40859
   523
  have sf: "simple_function (\<lambda>x. (X x, Y x))" "simple_function (\<lambda>x. (Y x, X x))"
hoelzl@40859
   524
    using assms by (auto intro: simple_function_Pair)
hoelzl@39097
   525
  have *: "(\<lambda>x. (Y x, X x))`space M = (\<lambda>(a,b). (b,a))`(\<lambda>x. (X x, Y x))`space M"
hoelzl@39097
   526
    by auto
hoelzl@39097
   527
  have inj: "\<And>X. inj_on (\<lambda>(a,b). (b,a)) X"
hoelzl@39097
   528
    by (auto intro!: inj_onI)
hoelzl@39097
   529
  show ?thesis
hoelzl@40859
   530
    unfolding sf[THEN entropy_eq] unfolding * setsum_reindex[OF inj]
hoelzl@39097
   531
    by (simp add: joint_distribution_commute[of Y X] split_beta)
hoelzl@39097
   532
qed
hoelzl@39097
   533
hoelzl@40859
   534
(* Joint entropy as a double sum: for simple functions X and Y, the entropy
   of the pair (X, Y) is the negated sum, over x in X ` space M and y in
   Y ` space M, of p(x,y) * log b p(x,y), where p is the (real-valued) joint
   distribution.
   The key step shows that points outside the image of the pair function
   have joint distribution 0, so the sum over the image of the pair can be
   extended to the full cartesian product. *)
lemma (in information_space) entropy_eq_cartesian_product:
hoelzl@40859
   535
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   536
  shows "\<H>(\<lambda>x. (X x, Y x)) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
hoelzl@39097
   537
    real (joint_distribution X Y {(x,y)}) *
hoelzl@39097
   538
    log b (real (joint_distribution X Y {(x,y)})))"
hoelzl@39097
   539
proof -
hoelzl@40859
   540
  have sf: "simple_function (\<lambda>x. (X x, Y x))"
hoelzl@40859
   541
    using assms by (auto intro: simple_function_Pair)
hoelzl@39097
   542
  { fix x assume "x\<notin>(\<lambda>x. (X x, Y x))`space M"
hoelzl@39097
   543
    then have "(\<lambda>x. (X x, Y x)) -` {x} \<inter> space M = {}" by auto
hoelzl@39097
   544
    then have "joint_distribution X Y {x} = 0"
hoelzl@39097
   545
      unfolding distribution_def by auto }
hoelzl@40859
   546
  then show ?thesis using sf assms
hoelzl@40859
   547
    unfolding entropy_eq[OF sf] neg_equal_iff_equal setsum_cartesian_product
hoelzl@40859
   548
    by (auto intro!: setsum_mono_zero_cong_left simp: simple_function_def)
hoelzl@39097
   549
qed
hoelzl@39097
   550
hoelzl@39097
   551
subsection {* Conditional Mutual Information *}
hoelzl@39097
   552
hoelzl@36080
   553
(* Conditional mutual information of X and Y given Z (logarithm base b), for
   random variables taking values in the algebras M1, M2 and M3 respectively.
   It is defined as the mutual information between X and the pair (Y, Z),
   the pair living in sigma (pair_algebra M2 M3), minus the mutual
   information between X and Z. *)
definition (in prob_space)
hoelzl@38656
   554
  "conditional_mutual_information b M1 M2 M3 X Y Z \<equiv>
hoelzl@40859
   555
    mutual_information b M1 (sigma (pair_algebra M2 M3)) X (\<lambda>x. (Y x, Z x)) -
hoelzl@38656
   556
    mutual_information b M1 M3 X Z"
hoelzl@36080
   557
hoelzl@40859
   558
(* Notation \<I>(X ; Y | Z): conditional_mutual_information with the locale's
   base b, where each of X, Y, Z is equipped with the power-set algebra on
   its own image of space M. *)
abbreviation (in information_space)
hoelzl@40859
   559
  conditional_mutual_information_Pow ("\<I>'( _ ; _ | _ ')") where
hoelzl@36624
   560
  "\<I>(X ; Y | Z) \<equiv> conditional_mutual_information b
hoelzl@36080
   561
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
hoelzl@36080
   562
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr>
hoelzl@36080
   563
    \<lparr> space = Z`space M, sets = Pow (Z`space M) \<rparr>
hoelzl@36080
   564
    X Y Z"
hoelzl@36080
   565
hoelzl@38656
   566
hoelzl@40859
   567
lemma (in information_space) conditional_mutual_information_generic_eq:
hoelzl@40859
   568
  assumes MX: "finite_random_variable MX X"
hoelzl@40859
   569
    and MY: "finite_random_variable MY Y"
hoelzl@40859
   570
    and MZ: "finite_random_variable MZ Z"
hoelzl@40859
   571
  shows "conditional_mutual_information b MX MY MZ X Y Z = (\<Sum>(x, y, z) \<in> space MX \<times> space MY \<times> space MZ.
hoelzl@38656
   572
             real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
hoelzl@38656
   573
             log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
hoelzl@38656
   574
    (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
hoelzl@40859
   575
  (is "_ = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z)))")
hoelzl@40859
   576
proof -
hoelzl@40859
   577
  let ?YZ = "\<lambda>y z. real (joint_distribution Y Z {(y, z)})"
hoelzl@40859
   578
  let ?X = "\<lambda>x. real (distribution X {x})"
hoelzl@40859
   579
  let ?Z = "\<lambda>z. real (distribution Z {z})"
hoelzl@40859
   580
hoelzl@40859
   581
  txt {* This proof is actually quite easy, however we need to show that the
hoelzl@40859
   582
    distributions are finite and the joint distributions are zero when one of
hoelzl@40859
   583
    the variables' distributions is also zero. *}
hoelzl@40859
   584
hoelzl@40859
   585
  note finite_var = MX MY MZ
hoelzl@40859
   586
  note random_var = finite_var[THEN finite_random_variableD]
hoelzl@40859
   587
hoelzl@40859
   588
  note space_simps = space_pair_algebra space_sigma algebra.simps
hoelzl@40859
   589
hoelzl@40859
   590
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
hoelzl@40859
   591
  note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
hoelzl@40859
   592
  note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
hoelzl@40859
   593
  note YZX = finite_random_variable_pairI[OF finite_var(2) ZX]
hoelzl@40859
   594
  note order1 =
hoelzl@40859
   595
    finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
hoelzl@40859
   596
    finite_distribution_order(5,6)[OF finite_var(1,3), simplified space_simps]
hoelzl@40859
   597
hoelzl@40859
   598
  note finite = finite_var(1) YZ finite_var(3) XZ YZX
hoelzl@40859
   599
  note finite[THEN finite_distribution_finite, simplified space_simps, simp]
hoelzl@40859
   600
hoelzl@40859
   601
  have order2: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
hoelzl@40859
   602
          \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
hoelzl@40859
   603
    unfolding joint_distribution_commute_singleton[of X]
hoelzl@40859
   604
    unfolding joint_distribution_assoc_singleton[symmetric]
hoelzl@40859
   605
    using finite_distribution_order(6)[OF finite_var(2) ZX]
hoelzl@40859
   606
    by (auto simp: space_simps)
hoelzl@36624
   607
hoelzl@40859
   608
  have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?XZ x z * ?YZdZ y z))) =
hoelzl@40859
   609
    (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * (log b (?XYZ x y z / (?X x * ?YZ y z)) - log b (?XZ x z / (?X x * ?Z z))))"
hoelzl@40859
   610
    (is "(\<Sum>(x, y, z)\<in>?S. ?L x y z) = (\<Sum>(x, y, z)\<in>?S. ?R x y z)")
hoelzl@40859
   611
  proof (safe intro!: setsum_cong)
hoelzl@40859
   612
    fix x y z assume space: "x \<in> space MX" "y \<in> space MY" "z \<in> space MZ"
hoelzl@40859
   613
    then have *: "?XYZ x y z / (?XZ x z * ?YZdZ y z) =
hoelzl@40859
   614
      (?XYZ x y z / (?X x * ?YZ y z)) / (?XZ x z / (?X x * ?Z z))"
hoelzl@40859
   615
      using order1(3)
hoelzl@41023
   616
      by (auto simp: real_of_pextreal_mult[symmetric] real_of_pextreal_eq_0)
hoelzl@40859
   617
    show "?L x y z = ?R x y z"
hoelzl@40859
   618
    proof cases
hoelzl@40859
   619
      assume "?XYZ x y z \<noteq> 0"
hoelzl@40859
   620
      with space b_gt_1 order1 order2 show ?thesis unfolding *
hoelzl@40859
   621
        by (subst log_divide)
hoelzl@41023
   622
           (auto simp: zero_less_divide_iff zero_less_real_of_pextreal
hoelzl@41023
   623
                       real_of_pextreal_eq_0 zero_less_mult_iff)
hoelzl@40859
   624
    qed simp
hoelzl@40859
   625
  qed
hoelzl@40859
   626
  also have "\<dots> = (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
hoelzl@40859
   627
                  (\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z)))"
hoelzl@40859
   628
    by (auto simp add: setsum_subtractf[symmetric] field_simps intro!: setsum_cong)
hoelzl@40859
   629
  also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XZ x z / (?X x * ?Z z))) =
hoelzl@40859
   630
             (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z)))"
hoelzl@40859
   631
    unfolding setsum_cartesian_product[symmetric] setsum_commute[of _ _ "space MY"]
hoelzl@40859
   632
              setsum_left_distrib[symmetric]
hoelzl@40859
   633
    unfolding joint_distribution_commute_singleton[of X]
hoelzl@40859
   634
    unfolding joint_distribution_assoc_singleton[symmetric]
hoelzl@40859
   635
    using setsum_real_joint_distribution_singleton[OF finite_var(2) ZX, unfolded space_simps]
hoelzl@40859
   636
    by (intro setsum_cong refl) simp
hoelzl@40859
   637
  also have "(\<Sum>(x, y, z)\<in>?S. ?XYZ x y z * log b (?XYZ x y z / (?X x * ?YZ y z))) -
hoelzl@40859
   638
             (\<Sum>(x, z)\<in>space MX \<times> space MZ. ?XZ x z * log b (?XZ x z / (?X x * ?Z z))) =
hoelzl@40859
   639
             conditional_mutual_information b MX MY MZ X Y Z"
hoelzl@40859
   640
    unfolding conditional_mutual_information_def
hoelzl@40859
   641
    unfolding mutual_information_generic_eq[OF finite_var(1,3)]
hoelzl@40859
   642
    unfolding mutual_information_generic_eq[OF finite_var(1) YZ]
hoelzl@40859
   643
    by (simp add: space_sigma space_pair_algebra setsum_cartesian_product')
hoelzl@40859
   644
  finally show ?thesis by simp
hoelzl@40859
   645
qed
hoelzl@40859
   646
hoelzl@40859
   647
(* Explicit sum formula for \<I>(X;Y|Z) with simple functions X, Y, Z:
   the sum over (x, y, z) in the product of the three images of
     p(x,y,z) * log b (p(x,y,z) / (p(x,z) * real (p(y,z) / p(z)))).
   Note that the quotient joint_distribution Y Z / distribution Z is formed
   before the conversion by "real" is applied.
   This is the instance of conditional_mutual_information_generic_eq obtained
   via simple_function_imp_finite_random_variable. *)
lemma (in information_space) conditional_mutual_information_eq:
hoelzl@40859
   648
  assumes "simple_function X" "simple_function Y" "simple_function Z"
hoelzl@40859
   649
  shows "\<I>(X;Y|Z) = (\<Sum>(x, y, z) \<in> X`space M \<times> Y`space M \<times> Z`space M.
hoelzl@40859
   650
             real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) *
hoelzl@40859
   651
             log b (real (distribution (\<lambda>x. (X x, Y x, Z x)) {(x, y, z)}) /
hoelzl@40859
   652
    (real (joint_distribution X Z {(x, z)}) * real (joint_distribution Y Z {(y,z)} / distribution Z {z}))))"
hoelzl@40859
   653
  using conditional_mutual_information_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   654
  by simp
hoelzl@40859
   655
hoelzl@40859
   656
(* Mutual information is conditional mutual information given a constant:
   \<I>(X ; Y) equals \<I>(X ; Y | Z) where Z is the constant unit-valued
   function.  The proof unfolds both explicit sum formulas and simplifies,
   using that the image of the constant function is the singleton {()}. *)
lemma (in information_space) conditional_mutual_information_eq_mutual_information:
hoelzl@40859
   657
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   658
  shows "\<I>(X ; Y) = \<I>(X ; Y | (\<lambda>x. ()))"
hoelzl@36624
   659
proof -
hoelzl@36624
   660
  have [simp]: "(\<lambda>x. ()) ` space M = {()}" using not_empty by auto
hoelzl@40859
   661
  have C: "simple_function (\<lambda>x. ())" by auto
hoelzl@36624
   662
  show ?thesis
hoelzl@40859
   663
    unfolding conditional_mutual_information_eq[OF X Y C]
hoelzl@40859
   664
    unfolding mutual_information_eq[OF X Y]
hoelzl@36624
   665
    by (simp add: setsum_cartesian_product' distribution_remove_const)
hoelzl@36624
   666
qed
hoelzl@36624
   667
hoelzl@40859
   668
(* The constant unit-valued function takes the value () with distribution 1.
   Declared as a simp rule; proved from distribution_def and
   measure_space_1. *)
lemma (in prob_space) distribution_unit[simp]: "distribution (\<lambda>x. ()) {()} = 1"
hoelzl@40859
   669
  unfolding distribution_def using measure_space_1 by auto
hoelzl@40859
   670
hoelzl@40859
   671
(* Pairing X with the constant unit-valued function does not change the
   distribution: the distribution of (X, ()) at {(a, ())} equals the
   distribution of X at {a}.  Declared as a simp rule. *)
lemma (in prob_space) joint_distribution_unit[simp]: "distribution (\<lambda>x. (X x, ())) {(a, ())} = distribution X {a}"
hoelzl@40859
   672
  unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>])
hoelzl@40859
   673
hoelzl@40859
   674
(* The singleton distributions of a finite random variable sum to 1 over
   space MX.  Obtained from setsum_joint_distribution by pairing X with the
   constant unit-valued function on the power-set algebra of UNIV and taking
   the set {()}. *)
lemma (in prob_space) setsum_distribution:
hoelzl@40859
   675
  assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. distribution X {a}) = 1"
hoelzl@40859
   676
  using setsum_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
hoelzl@40859
   677
  using sigma_algebra_Pow[of "UNIV::unit set"] by simp
hoelzl@40859
   678
hoelzl@40859
   679
(* Real-valued variant of setsum_distribution: the values
   real (distribution X {a}) of a finite random variable sum to 1 over
   space MX.  Obtained from setsum_real_joint_distribution with the constant
   unit-valued function as second component. *)
lemma (in prob_space) setsum_real_distribution:
hoelzl@40859
   680
  assumes X: "finite_random_variable MX X" shows "(\<Sum>a\<in>space MX. real (distribution X {a})) = 1"
hoelzl@40859
   681
  using setsum_real_joint_distribution[OF assms, of "\<lparr> space = UNIV, sets = Pow UNIV \<rparr>" "\<lambda>x. ()" "{()}"]
hoelzl@40859
   682
  using sigma_algebra_Pow[of "UNIV::unit set"] by simp
hoelzl@40859
   683
hoelzl@40859
   684
lemma (in information_space) conditional_mutual_information_generic_positive:
hoelzl@40859
   685
  assumes "finite_random_variable MX X" and "finite_random_variable MY Y" and "finite_random_variable MZ Z"
hoelzl@40859
   686
  shows "0 \<le> conditional_mutual_information b MX MY MZ X Y Z"
hoelzl@40859
   687
proof (cases "space MX \<times> space MY \<times> space MZ = {}")
hoelzl@40859
   688
  case True show ?thesis
hoelzl@40859
   689
    unfolding conditional_mutual_information_generic_eq[OF assms] True
hoelzl@40859
   690
    by simp
hoelzl@40859
   691
next
hoelzl@40859
   692
  case False
hoelzl@38656
   693
  let "?dXYZ A" = "real (distribution (\<lambda>x. (X x, Y x, Z x)) A)"
hoelzl@38656
   694
  let "?dXZ A" = "real (joint_distribution X Z A)"
hoelzl@38656
   695
  let "?dYZ A" = "real (joint_distribution Y Z A)"
hoelzl@38656
   696
  let "?dX A" = "real (distribution X A)"
hoelzl@38656
   697
  let "?dZ A" = "real (distribution Z A)"
hoelzl@40859
   698
  let ?M = "space MX \<times> space MY \<times> space MZ"
hoelzl@36624
   699
nipkow@39302
   700
  have split_beta: "\<And>f. split f = (\<lambda>x. f (fst x) (snd x))" by (simp add: fun_eq_iff)
hoelzl@36080
   701
hoelzl@40859
   702
  note space_simps = space_pair_algebra space_sigma algebra.simps
hoelzl@40859
   703
hoelzl@40859
   704
  note finite_var = assms
hoelzl@40859
   705
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
hoelzl@40859
   706
  note XZ = finite_random_variable_pairI[OF finite_var(1,3)]
hoelzl@40859
   707
  note ZX = finite_random_variable_pairI[OF finite_var(3,1)]
hoelzl@40859
   708
  note YZ = finite_random_variable_pairI[OF finite_var(2,3)]
hoelzl@40859
   709
  note XYZ = finite_random_variable_pairI[OF finite_var(1) YZ]
hoelzl@40859
   710
  note finite = finite_var(3) YZ XZ XYZ
hoelzl@40859
   711
  note finite = finite[THEN finite_distribution_finite, simplified space_simps]
hoelzl@40859
   712
hoelzl@40859
   713
  have order: "\<And>x y z. \<lbrakk>x \<in> space MX; y \<in> space MY; z \<in> space MZ; joint_distribution X Z {(x, z)} = 0\<rbrakk>
hoelzl@40859
   714
          \<Longrightarrow> joint_distribution X (\<lambda>x. (Y x, Z x)) {(x, y, z)} = 0"
hoelzl@40859
   715
    unfolding joint_distribution_commute_singleton[of X]
hoelzl@40859
   716
    unfolding joint_distribution_assoc_singleton[symmetric]
hoelzl@40859
   717
    using finite_distribution_order(6)[OF finite_var(2) ZX]
hoelzl@40859
   718
    by (auto simp: space_simps)
hoelzl@40859
   719
hoelzl@40859
   720
  note order = order
hoelzl@40859
   721
    finite_distribution_order(5,6)[OF finite_var(1) YZ, simplified space_simps]
hoelzl@40859
   722
    finite_distribution_order(5,6)[OF finite_var(2,3), simplified space_simps]
hoelzl@40859
   723
hoelzl@40859
   724
  have "- conditional_mutual_information b MX MY MZ X Y Z = - (\<Sum>(x, y, z) \<in> ?M. ?dXYZ {(x, y, z)} *
hoelzl@40859
   725
    log b (?dXYZ {(x, y, z)} / (?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})))"
hoelzl@40859
   726
    unfolding conditional_mutual_information_generic_eq[OF assms] neg_equal_iff_equal
hoelzl@41023
   727
    by (intro setsum_cong) (auto intro!: arg_cong[where f="log b"] simp: real_of_pextreal_mult[symmetric])
hoelzl@40859
   728
  also have "\<dots> \<le> log b (\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z})"
hoelzl@36624
   729
    unfolding split_beta
hoelzl@36624
   730
  proof (rule log_setsum_divide)
hoelzl@40859
   731
    show "?M \<noteq> {}" using False by simp
hoelzl@36624
   732
    show "1 < b" using b_gt_1 .
hoelzl@36080
   733
hoelzl@40859
   734
    show "finite ?M" using assms
hoelzl@40859
   735
      unfolding finite_sigma_algebra_def finite_sigma_algebra_axioms_def by auto
hoelzl@40859
   736
hoelzl@40859
   737
    show "(\<Sum>x\<in>?M. ?dXYZ {(fst x, fst (snd x), snd (snd x))}) = 1"
hoelzl@40859
   738
      unfolding setsum_cartesian_product'
hoelzl@40859
   739
      unfolding setsum_commute[of _ "space MY"]
hoelzl@40859
   740
      unfolding setsum_commute[of _ "space MZ"]
hoelzl@40859
   741
      by (simp_all add: space_pair_algebra
hoelzl@40859
   742
        setsum_real_joint_distribution_singleton[OF `finite_random_variable MX X` YZ]
hoelzl@40859
   743
        setsum_real_joint_distribution_singleton[OF `finite_random_variable MY Y` finite_var(3)]
hoelzl@40859
   744
        setsum_real_distribution[OF `finite_random_variable MZ Z`])
hoelzl@40859
   745
hoelzl@36624
   746
    fix x assume "x \<in> ?M"
hoelzl@38656
   747
    let ?x = "(fst x, fst (snd x), snd (snd x))"
hoelzl@38656
   748
hoelzl@41023
   749
    show "0 \<le> ?dXYZ {?x}" using real_pextreal_nonneg .
hoelzl@36624
   750
    show "0 \<le> ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
hoelzl@41023
   751
     by (simp add: real_pextreal_nonneg mult_nonneg_nonneg divide_nonneg_nonneg)
hoelzl@36080
   752
hoelzl@38656
   753
    assume *: "0 < ?dXYZ {?x}"
hoelzl@40859
   754
    with `x \<in> ?M` show "0 < ?dXZ {(fst x, snd (snd x))} * ?dYZ {(fst (snd x), snd (snd x))} / ?dZ {snd (snd x)}"
hoelzl@40859
   755
      using finite order
hoelzl@40859
   756
      by (cases x)
hoelzl@41023
   757
         (auto simp add: zero_less_real_of_pextreal zero_less_mult_iff zero_less_divide_iff)
hoelzl@40859
   758
  qed
hoelzl@40859
   759
  also have "(\<Sum>(x, y, z) \<in> ?M. ?dXZ {(x, z)} * ?dYZ {(y,z)} / ?dZ {z}) = (\<Sum>z\<in>space MZ. ?dZ {z})"
hoelzl@36624
   760
    apply (simp add: setsum_cartesian_product')
hoelzl@36624
   761
    apply (subst setsum_commute)
hoelzl@36624
   762
    apply (subst (2) setsum_commute)
hoelzl@40859
   763
    by (auto simp: setsum_divide_distrib[symmetric] setsum_product[symmetric]
hoelzl@40859
   764
                   setsum_real_joint_distribution_singleton[OF finite_var(1,3)]
hoelzl@40859
   765
                   setsum_real_joint_distribution_singleton[OF finite_var(2,3)]
hoelzl@36624
   766
          intro!: setsum_cong)
hoelzl@40859
   767
  also have "log b (\<Sum>z\<in>space MZ. ?dZ {z}) = 0"
hoelzl@40859
   768
    unfolding setsum_real_distribution[OF finite_var(3)] by simp
hoelzl@40859
   769
  finally show ?thesis by simp
hoelzl@36080
   770
qed
hoelzl@36080
   771
hoelzl@40859
   772
(* Conditional mutual information of simple functions is non-negative:
   0 \<le> \<I>(X;Y|Z).  This is the instance of
   conditional_mutual_information_generic_positive obtained via
   simple_function_imp_finite_random_variable. *)
lemma (in information_space) conditional_mutual_information_positive:
hoelzl@40859
   773
  assumes "simple_function X" and "simple_function Y" and "simple_function Z"
hoelzl@40859
   774
  shows "0 \<le> \<I>(X;Y|Z)"
hoelzl@40859
   775
  using conditional_mutual_information_generic_positive[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   776
  by simp
hoelzl@40859
   777
hoelzl@39097
   778
subsection {* Conditional Entropy *}
hoelzl@39097
   779
hoelzl@36080
   780
(* Conditional entropy of X given Y (logarithm base b), with X valued in the
   algebra S and Y valued in T.  It is defined as the conditional mutual
   information of X with itself given Y. *)
definition (in prob_space)
hoelzl@36080
   781
  "conditional_entropy b S T X Y = conditional_mutual_information b S S T X X Y"
hoelzl@36080
   782
hoelzl@40859
   783
(* Notation \<H>(X | Y): conditional_entropy with the locale's base b, where
   X and Y are each equipped with the power-set algebra on their own image
   of space M. *)
abbreviation (in information_space)
hoelzl@40859
   784
  conditional_entropy_Pow ("\<H>'(_ | _')") where
hoelzl@36624
   785
  "\<H>(X | Y) \<equiv> conditional_entropy b
hoelzl@36080
   786
    \<lparr> space = X`space M, sets = Pow (X`space M) \<rparr>
hoelzl@36080
   787
    \<lparr> space = Y`space M, sets = Pow (Y`space M) \<rparr> X Y"
hoelzl@36080
   788
hoelzl@40859
   789
(* Conditional entropy of simple functions is non-negative.  Follows from
   conditional_mutual_information_positive after unfolding
   conditional_entropy_def. *)
lemma (in information_space) conditional_entropy_positive:
hoelzl@40859
   790
  "simple_function X \<Longrightarrow> simple_function Y \<Longrightarrow> 0 \<le> \<H>(X | Y)"
hoelzl@40859
   791
  unfolding conditional_entropy_def by (auto intro!: conditional_mutual_information_positive)
hoelzl@36080
   792
hoelzl@40859
   793
(* Introduction rule: a set that is equal to the empty set has measure 0. *)
lemma (in measure_space) empty_measureI: "A = {} \<Longrightarrow> \<mu> A = 0" by simp
hoelzl@40859
   794
hoelzl@40859
   795
lemma (in information_space) conditional_entropy_generic_eq:
hoelzl@40859
   796
  assumes MX: "finite_random_variable MX X"
hoelzl@40859
   797
  assumes MZ: "finite_random_variable MZ Z"
hoelzl@39097
   798
  shows "conditional_entropy b MX MZ X Z =
hoelzl@39097
   799
     - (\<Sum>(x, z)\<in>space MX \<times> space MZ.
hoelzl@39097
   800
         real (joint_distribution X Z {(x, z)}) *
hoelzl@39097
   801
         log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
hoelzl@40859
   802
proof -
hoelzl@40859
   803
  interpret MX: finite_sigma_algebra MX using MX by simp
hoelzl@40859
   804
  interpret MZ: finite_sigma_algebra MZ using MZ by simp
hoelzl@40859
   805
  let "?XXZ x y z" = "joint_distribution X (\<lambda>x. (X x, Z x)) {(x, y, z)}"
hoelzl@40859
   806
  let "?XZ x z" = "joint_distribution X Z {(x, z)}"
hoelzl@40859
   807
  let "?Z z" = "distribution Z {z}"
hoelzl@40859
   808
  let "?f x y z" = "log b (real (?XXZ x y z) / (real (?XZ x z) * real (?XZ y z / ?Z z)))"
hoelzl@40859
   809
  { fix x z have "?XXZ x x z = ?XZ x z"
hoelzl@40859
   810
      unfolding distribution_def by (auto intro!: arg_cong[where f=\<mu>]) }
hoelzl@40859
   811
  note this[simp]
hoelzl@40859
   812
  { fix x x' :: 'b and z assume "x' \<noteq> x"
hoelzl@40859
   813
    then have "?XXZ x x' z = 0"
hoelzl@40859
   814
      by (auto simp: distribution_def intro!: arg_cong[where f=\<mu>] empty_measureI) }
hoelzl@40859
   815
  note this[simp]
hoelzl@40859
   816
  { fix x x' z assume *: "x \<in> space MX" "z \<in> space MZ"
hoelzl@40859
   817
    then have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z)
hoelzl@40859
   818
      = (\<Sum>x'\<in>space MX. if x = x' then real (?XZ x z) * ?f x x z else 0)"
hoelzl@40859
   819
      by (auto intro!: setsum_cong)
hoelzl@40859
   820
    also have "\<dots> = real (?XZ x z) * ?f x x z"
hoelzl@40859
   821
      using `x \<in> space MX` by (simp add: setsum_cases[OF MX.finite_space])
hoelzl@40859
   822
    also have "\<dots> = real (?XZ x z) * log b (real (?Z z) / real (?XZ x z))"
hoelzl@41023
   823
      by (auto simp: real_of_pextreal_mult[symmetric])
hoelzl@40859
   824
    also have "\<dots> = - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))"
hoelzl@40859
   825
      using assms[THEN finite_distribution_finite]
hoelzl@40859
   826
      using finite_distribution_order(6)[OF MX MZ]
hoelzl@41023
   827
      by (auto simp: log_simps field_simps zero_less_mult_iff zero_less_real_of_pextreal real_of_pextreal_eq_0)
hoelzl@40859
   828
    finally have "(\<Sum>x'\<in>space MX. real (?XXZ x x' z) * ?f x x' z) =
hoelzl@40859
   829
      - real (?XZ x z) * log b (real (?XZ x z) / real (?Z z))" . }
hoelzl@40859
   830
  note * = this
hoelzl@40859
   831
hoelzl@40859
   832
  show ?thesis
hoelzl@40859
   833
    unfolding conditional_entropy_def
hoelzl@40859
   834
    unfolding conditional_mutual_information_generic_eq[OF MX MX MZ]
hoelzl@40859
   835
    by (auto simp: setsum_cartesian_product' setsum_negf[symmetric]
hoelzl@41023
   836
                   setsum_commute[of _ "space MZ"] *   simp del: divide_pextreal_def
hoelzl@40859
   837
             intro!: setsum_cong)
hoelzl@39097
   838
qed
hoelzl@39097
   839
hoelzl@40859
   840
(* Explicit sum formula for \<H>(X | Z) with simple functions X and Z:
   the negated sum over (x, z) in X ` space M \<times> Z ` space M of
     p(x,z) * log b (p(x,z) / p(z)),
   where p(x,z) is the real-valued joint distribution and p(z) the
   real-valued distribution of Z.  This is the instance of
   conditional_entropy_generic_eq obtained via
   simple_function_imp_finite_random_variable. *)
lemma (in information_space) conditional_entropy_eq:
hoelzl@40859
   841
  assumes "simple_function X" "simple_function Z"
hoelzl@40859
   842
  shows "\<H>(X | Z) =
hoelzl@36080
   843
     - (\<Sum>(x, z)\<in>X ` space M \<times> Z ` space M.
hoelzl@38656
   844
         real (joint_distribution X Z {(x, z)}) *
hoelzl@38656
   845
         log b (real (joint_distribution X Z {(x, z)}) / real (distribution Z {z})))"
hoelzl@40859
   846
  using conditional_entropy_generic_eq[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   847
  by simp
hoelzl@39097
   848
hoelzl@40859
   849
(* Conditional entropy written as a weighted sum over the values of Y:
   \<H>(X | Y) is the negated sum over y in Y ` space M of p(y) times the
   inner sum over x in X ` space M of
     (p(x,y) / p(y)) * log b (p(x,y) / p(y)).
   Derived from conditional_entropy_eq by rearranging the double sum; the
   finiteness and order facts about the distributions are supplied to the
   simplifier through the "using" clauses. *)
lemma (in information_space) conditional_entropy_eq_ce_with_hypothesis:
hoelzl@40859
   850
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   851
  shows "\<H>(X | Y) =
hoelzl@39097
   852
    -(\<Sum>y\<in>Y`space M. real (distribution Y {y}) *
hoelzl@39097
   853
      (\<Sum>x\<in>X`space M. real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}) *
hoelzl@39097
   854
              log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {(y)}))))"
hoelzl@40859
   855
  unfolding conditional_entropy_eq[OF assms]
hoelzl@40859
   856
  using finite_distribution_finite[OF finite_random_variable_pairI[OF assms[THEN simple_function_imp_finite_random_variable]]]
hoelzl@40859
   857
  using finite_distribution_order(5,6)[OF assms[THEN simple_function_imp_finite_random_variable]]
hoelzl@40859
   858
  using finite_distribution_finite[OF Y[THEN simple_function_imp_finite_random_variable]]
hoelzl@41023
   859
  by (auto simp: setsum_cartesian_product'  setsum_commute[of _ "Y`space M"] setsum_right_distrib real_of_pextreal_eq_0
hoelzl@40859
   860
           intro!: setsum_cong)
hoelzl@39097
   861
hoelzl@40859
   862
(* Conditional entropy as an iterated sum: the formula of
   conditional_entropy_eq with the sum over the cartesian product split into
   an outer sum over x in X ` space M and an inner sum over y in
   Y ` space M (via setsum_cartesian_product'). *)
lemma (in information_space) conditional_entropy_eq_cartesian_product:
hoelzl@40859
   863
  assumes "simple_function X" "simple_function Y"
hoelzl@40859
   864
  shows "\<H>(X | Y) = -(\<Sum>x\<in>X`space M. \<Sum>y\<in>Y`space M.
hoelzl@39097
   865
    real (joint_distribution X Y {(x,y)}) *
hoelzl@39097
   866
    log b (real (joint_distribution X Y {(x,y)}) / real (distribution Y {y})))"
hoelzl@40859
   867
  unfolding conditional_entropy_eq[OF assms]
hoelzl@40859
   868
  by (auto intro!: setsum_cong simp: setsum_cartesian_product')
hoelzl@36080
   869
hoelzl@39097
   870
subsection {* Equalities *}
hoelzl@39097
   871
hoelzl@40859
   872
(* Mutual information as a difference of entropies:
   \<I>(X ; Z) = \<H>(X) - \<H>(X | Z) for simple functions X and Z.
   The auxiliary fact * splits each summand
     p(x,z) * log b (p(x,z) / (p(x) * p(z)))
   into p(x,z) * log b (p(x,z) / p(z)) - p(x,z) * log b (p(x)); the final
   step unfolds the explicit sum formulas of the three quantities and sums
   the joint distribution over z. *)
lemma (in information_space) mutual_information_eq_entropy_conditional_entropy:
hoelzl@40859
   873
  assumes X: "simple_function X" and Z: "simple_function Z"
hoelzl@40859
   874
  shows  "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)"
hoelzl@40859
   875
proof -
hoelzl@40859
   876
  let "?XZ x z" = "real (joint_distribution X Z {(x, z)})"
hoelzl@40859
   877
  let "?Z z" = "real (distribution Z {z})"
hoelzl@40859
   878
  let "?X x" = "real (distribution X {x})"
hoelzl@40859
   879
  note fX = X[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   880
  note fZ = Z[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   881
  (* The next two notes add the finiteness and order facts about the
     distributions to the simpset for the remainder of this proof. *)
  note fX[THEN finite_distribution_finite, simp] and fZ[THEN finite_distribution_finite, simp]
hoelzl@40859
   882
  note finite_distribution_order[OF fX fZ, simp]
hoelzl@40859
   883
  { fix x z assume "x \<in> X`space M" "z \<in> Z`space M"
hoelzl@40859
   884
    have "?XZ x z * log b (?XZ x z / (?X x * ?Z z)) =
hoelzl@40859
   885
          ?XZ x z * log b (?XZ x z / ?Z z) - ?XZ x z * log b (?X x)"
hoelzl@41023
   886
      by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
hoelzl@41023
   887
                     zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
hoelzl@40859
   888
  note * = this
hoelzl@40859
   889
  show ?thesis
hoelzl@40859
   890
    unfolding entropy_eq[OF X] conditional_entropy_eq[OF X Z] mutual_information_eq[OF X Z]
hoelzl@40859
   891
    using setsum_real_joint_distribution_singleton[OF fZ fX, unfolded joint_distribution_commute_singleton[of Z X]]
hoelzl@40859
   892
    by (simp add: * setsum_cartesian_product' setsum_subtractf setsum_left_distrib[symmetric]
hoelzl@40859
   893
                     setsum_real_distribution)
hoelzl@40859
   894
qed
hoelzl@36080
   895
hoelzl@40859
   896
(* Conditioning does not increase entropy: \<H>(X | Z) \<le> \<H>(X) for simple
   functions X and Z.  Follows from the identity
   \<I>(X ; Z) = \<H>(X) - \<H>(X | Z) together with mutual_information_positive
   (entropy_positive is also passed to the final automation step). *)
lemma (in information_space) conditional_entropy_less_eq_entropy:
hoelzl@40859
   897
  assumes X: "simple_function X" and Z: "simple_function Z"
hoelzl@40859
   898
  shows "\<H>(X | Z) \<le> \<H>(X)"
hoelzl@36624
   899
proof -
hoelzl@40859
   900
  have "\<I>(X ; Z) = \<H>(X) - \<H>(X | Z)" using mutual_information_eq_entropy_conditional_entropy[OF assms] .
hoelzl@40859
   901
  with mutual_information_positive[OF X Z] entropy_positive[OF X]
hoelzl@36624
   902
  show ?thesis by auto
hoelzl@36080
   903
qed
hoelzl@36080
   904
hoelzl@40859
   905
(* Chain rule for entropy: the entropy of the pair (X, Y) equals
   \<H>(X) + \<H>(Y|X) for simple functions X and Y.
   The auxiliary fact * splits each summand
     p(x,y) * log b (p(x,y) / p(x))
   into p(x,y) * log b (p(x,y)) - p(x,y) * log b (p(x)); the final step
   unfolds the sum formulas for entropy, conditional entropy and joint
   entropy, commutes the joint distribution and the order of summation, and
   sums the joint distribution over y. *)
lemma (in information_space) entropy_chain_rule:
hoelzl@40859
   906
  assumes X: "simple_function X" and Y: "simple_function Y"
hoelzl@40859
   907
  shows  "\<H>(\<lambda>x. (X x, Y x)) = \<H>(X) + \<H>(Y|X)"
hoelzl@40859
   908
proof -
hoelzl@40859
   909
  let "?XY x y" = "real (joint_distribution X Y {(x, y)})"
hoelzl@40859
   910
  let "?Y y" = "real (distribution Y {y})"
hoelzl@40859
   911
  let "?X x" = "real (distribution X {x})"
hoelzl@40859
   912
  note fX = X[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   913
  note fY = Y[THEN simple_function_imp_finite_random_variable]
hoelzl@40859
   914
  (* The next two notes add the finiteness and order facts about the
     distributions to the simpset for the remainder of this proof. *)
  note fX[THEN finite_distribution_finite, simp] and fY[THEN finite_distribution_finite, simp]
hoelzl@40859
   915
  note finite_distribution_order[OF fX fY, simp]
hoelzl@40859
   916
  { fix x y assume "x \<in> X`space M" "y \<in> Y`space M"
hoelzl@40859
   917
    have "?XY x y * log b (?XY x y / ?X x) =
hoelzl@40859
   918
          ?XY x y * log b (?XY x y) - ?XY x y * log b (?X x)"
hoelzl@41023
   919
      by (auto simp: log_simps real_of_pextreal_mult[symmetric] zero_less_mult_iff
hoelzl@41023
   920
                     zero_less_real_of_pextreal field_simps real_of_pextreal_eq_0 abs_mult) }
hoelzl@40859
   921
  note * = this
hoelzl@40859
   922
  show ?thesis
hoelzl@40859
   923
    using setsum_real_joint_distribution_singleton[OF fY fX]
hoelzl@40859
   924
    unfolding entropy_eq[OF X] conditional_entropy_eq_cartesian_product[OF Y X] entropy_eq_cartesian_product[OF X Y]
hoelzl@40859
   925
    unfolding joint_distribution_commute_singleton[of Y X] setsum_commute[of _ "X`space M"]
hoelzl@40859
   926
    by (simp add: * setsum_subtractf setsum_left_distrib[symmetric])
hoelzl@40859
   927
qed
hoelzl@38656
   928
hoelzl@39097
   929
section {* Partitioning *}
hoelzl@36080
   930
hoelzl@36624
   931
(* subvimage A f g holds when, for every x in A, the part of A on which f
   takes the value f x is contained in the part of A on which g takes the
   value g x. *)
definition subvimage where
  "subvimage A f g \<longleftrightarrow> (\<forall>x \<in> A. f -` {f x} \<inter> A \<subseteq> g -` {g x} \<inter> A)"
hoelzl@36624
   932
hoelzl@36624
   933
(* Introduction rule: subvimage A f g follows if equal f-values on A force
   equal g-values. *)
lemma subvimageI:
  assumes same_g: "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  shows "subvimage A f g"
  using same_g unfolding subvimage_def by blast
hoelzl@36624
   937
hoelzl@36624
   938
(* Elimination rule: from subvimage A f g obtain the pointwise statement
   that equal f-values on A give equal g-values. *)
lemma subvimageE[consumes 1]:
  assumes svi: "subvimage A f g"
  obtains "\<And>x y. \<lbrakk> x \<in> A ; y \<in> A ; f x = f y \<rbrakk> \<Longrightarrow> g x = g y"
  using svi unfolding subvimage_def by blast
hoelzl@36624
   942
hoelzl@36624
   943
(* Destruction rule: if subvimage A f g holds and x, y in A have the same
   f-value, then they have the same g-value.
   The premises are stated with assumes so that the chained assms are the
   actual hypotheses; the resulting theorem has the same premises in the
   same order as before. *)
lemma subvimageD:
  assumes "subvimage A f g" "x \<in> A" "y \<in> A" "f x = f y"
  shows "g x = g y"
  using assms unfolding subvimage_def by blast
hoelzl@36624
   946
hoelzl@36624
   947
(* subvimage is inherited by subsets of the carrier set. *)
lemma subvimage_subset:
  assumes "subvimage B f g" "A \<subseteq> B"
  shows "subvimage A f g"
  using assms unfolding subvimage_def by auto
hoelzl@36624
   950
hoelzl@36624
   951
(* Reflexivity: every function is a subvimage of itself on any set.
   After applying subvimageI the remaining goal has its conclusion among
   its premises and is closed by assumption. *)
lemma subvimage_idem[intro]: "subvimage A g g"
  by (rule subvimageI)
hoelzl@36624
   953
hoelzl@36624
   954
(* Post-composing the second function with any f preserves subvimage. *)
lemma subvimage_comp_finer[intro]:
  assumes svi: "subvimage A g h"
  shows "subvimage A g (f \<circ> h)"
proof (rule subvimageI, simp)
  fix x y
  assume xA: "x \<in> A" and yA: "y \<in> A" and gxy: "g x = g y"
  (* svi turns equal g-values into equal h-values. *)
  have "h x = h y" by (rule subvimageD[OF svi xA yA gxy])
  then show "f (h x) = f (h y)" by simp
qed
hoelzl@36624
   962
hoelzl@36624
   963
(* Post-composing the first function with an f that is injective on the
   image g ` A preserves subvimage. *)
lemma subvimage_comp_gran:
  assumes svi: "subvimage A g h"
  assumes inj: "inj_on f (g ` A)"
  shows "subvimage A (f \<circ> g) h"
proof (rule subvimageI)
qed (auto intro!: subvimageD[OF svi] simp: inj_on_iff[OF inj])
hoelzl@36624
   968
hoelzl@36624
   969
(* subvimage on the image f ` A transfers to A after pre-composing both
   functions with f. *)
lemma subvimage_comp:
  assumes svi: "subvimage (f ` A) g h"
  shows "subvimage A (g \<circ> f) (h \<circ> f)"
proof (rule subvimageI)
qed (auto intro!: subvimageD[OF svi])
hoelzl@36624
   973
hoelzl@36624
   974
(* Transitivity of subvimage on a fixed set A. *)
lemma subvimage_trans:
  assumes fg: "subvimage A f g"
  assumes gh: "subvimage A g h"
  shows "subvimage A f h"
proof (rule subvimageI)
qed (auto intro!: subvimageD[OF fg] subvimageD[OF gh])
hoelzl@36624
   979
hoelzl@36624
   980
(* If subvimage A f g holds, then on A the function g factors through f:
   some h satisfies h (f x) = g x for all x in A.
   The witness maps a value v to the unique element of g ` (f -` {v} Int A),
   picked with THE. *)
lemma subvimage_translator:
  assumes svi: "subvimage A f g"
  shows "\<exists>h. \<forall>x \<in> A. h (f x) = g x"
proof (safe intro!: exI[of _ "\<lambda>x. (THE z. z \<in> (g ` (f -` {x} \<inter> A)))"])
  fix x assume xA: "x \<in> A"
  show "(THE x'. x' \<in> (g ` (f -` {f x} \<inter> A))) = g x"
  proof (rule theI2[of _ "g x"])
  qed (insert xA, auto intro!: subvimageD[OF svi])
qed
hoelzl@36624
   989
hoelzl@36624
   990
(* Image form of subvimage_translator: some h maps f ` A onto g ` A. *)
lemma subvimage_translator_image:
  assumes svi: "subvimage A f g"
  shows "\<exists>h. h ` f ` A = g ` A"
proof -
  from subvimage_translator[OF svi]
  obtain h where h_eq: "\<And>x. x \<in> A \<Longrightarrow> h (f x) = g x" by auto
  then show ?thesis
    by (auto intro!: exI[of _ h]
      simp: image_compose[symmetric] comp_def cong: image_cong)
qed
hoelzl@36624
  1000
hoelzl@36624
  1001
(* Under subvimage A f g, finiteness of f ` A carries over to g ` A,
   because g ` A is the image of f ` A under some function. *)
lemma subvimage_finite:
  assumes svi: "subvimage A f g" and fin: "finite (f`A)"
  shows "finite (g`A)"
proof -
  from subvimage_translator_image[OF svi]
  obtain h where img: "g`A = h`f`A" by fastsimp
  from fin img show "finite (g`A)" by simp
qed
hoelzl@36624
  1009
hoelzl@36624
  1010
(* Under subvimage A f g, the part of A where f equals x is either
   contained in the part of A where g equals y, or does not meet it. *)
lemma subvimage_disj:
  assumes svi: "subvimage A f g"
  shows "f -` {x} \<inter> A \<subseteq> g -` {y} \<inter> A \<or>
      f -` {x} \<inter> g -` {y} \<inter> A = {}" (is "?sub \<or> ?dist")
proof (rule disjCI)
  assume "\<not> ?dist"
  (* A common point z of the two preimages inside A. *)
  then obtain z where zA: "z \<in> A" and xz: "x = f z" and yz: "y = g z" by auto
  from zA xz yz svi show "?sub" unfolding subvimage_def by auto
qed
hoelzl@36624
  1019
hoelzl@36624
  1020
(* Under subvimage A f g with f ` A finite, a sum over f ` A can be split
   into an outer sum over g ` A and inner sums over the f-images of the
   corresponding g-preimages in A. *)
lemma setsum_image_split:
  assumes svi: "subvimage A f g" and fin: "finite (f ` A)"
  shows "(\<Sum>x\<in>f`A. h x) = (\<Sum>y\<in>g`A. \<Sum>x\<in>f`(g -` {y} \<inter> A). h x)"
    (is "?lhs = ?rhs")
proof -
  (* f ` A is the second projection of the dependent product ?SIGMA ... *)
  have img_eq: "f ` A =
      snd ` (SIGMA x : g ` A. f ` (g -` {x} \<inter> A))"
      (is "_ = snd ` ?SIGMA")
    unfolding image_split_eq_Sigma[symmetric]
    by (simp add: image_compose[symmetric] comp_def)
  (* ... and that projection is injective on ?SIGMA. *)
  have snd_inj: "inj_on snd ?SIGMA"
    unfolding image_split_eq_Sigma[symmetric]
    by (auto intro!: inj_onI subvimageD[OF svi])
  from img_eq snd_inj
  have "(\<Sum>x\<in>f`A. h x) = (\<Sum>(x,y)\<in>?SIGMA. h y)"
    by (auto simp: setsum_reindex intro: setsum_cong)
  also have "... = ?rhs"
    using subvimage_finite[OF svi fin] fin
    apply (subst setsum_Sigma[symmetric])
    by (auto intro!: finite_subset[of _ "f`A"])
  finally show ?thesis .
qed
hoelzl@36624
  1043
hoelzl@40859
  1044
(* If subvimage (space M) X P holds for simple functions X and P, the
   entropy of X is the entropy of P plus the conditional entropy of X
   given P. *)
lemma (in information_space) entropy_partition:
  assumes sf: "simple_function X" "simple_function P"
  assumes svi: "subvimage (space M) X P"
  shows "\<H>(X) = \<H>(P) + \<H>(X|P)"
proof -
  (* Abbreviations for the singleton distributions, read as reals. *)
  let "?XP x p" = "real (joint_distribution X P {(x, p)})"
  let "?X x" = "real (distribution X {x})"
  let "?P p" = "real (distribution P {p})"
  note fX = sf(1)[THEN simple_function_imp_finite_random_variable]
  note fP = sf(2)[THEN simple_function_imp_finite_random_variable]
  note fX[THEN finite_distribution_finite, simp] and fP[THEN finite_distribution_finite, simp]
  note finite_distribution_order[OF fX fP, simp]
  (* Rewrite the sum over X ` space M as a double sum over the joint
     distribution of X and P. *)
  have sum_split: "(\<Sum>x\<in>X ` space M. ?X x * log b (?X x)) =
    (\<Sum>y\<in>P `space M. \<Sum>x\<in>X ` space M. ?XP x y * log b (?XP x y))"
  proof (subst setsum_image_split[OF svi],
      safe intro!: setsum_mono_zero_cong_left imageI)
    show "finite (X ` space M)" "finite (X ` space M)" "finite (P ` space M)"
      using sf unfolding simple_function_def by auto
  next
    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
    assume "?XP (X x) (P p) * log b (?XP (X x) (P p)) \<noteq> 0"
    then have "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M \<noteq> {}" by (auto simp: distribution_def)
    from svi[unfolded subvimage_def, rule_format, OF in_space(2)] this
    show "x \<in> P -` {P p}" by auto
  next
    fix p x assume in_space: "p \<in> space M" "x \<in> space M"
    assume P_eq: "P x = P p"
    from P_eq[symmetric] svi[unfolded subvimage_def, rule_format, OF in_space(2)]
    have "X -` {X x} \<inter> space M \<subseteq> P -` {P p} \<inter> space M"
      by auto
    (* Hence the joint preimage coincides with the preimage under X alone. *)
    then have "(\<lambda>x. (X x, P x)) -` {(X x, P p)} \<inter> space M = X -` {X x} \<inter> space M"
      by auto
    then show "?X (X x) * log b (?X (X x)) =
          ?XP (X x) (P p) * log b (?XP (X x) (P p))"
      by (auto simp: distribution_def)
  qed
  (* Split the logarithm of the quotient into a difference of logarithms. *)
  have log_split: "\<And>x y. ?XP x y *
      log b (?XP x y / ?P y) =
      ?XP x y * log b (?XP x y) -
      ?XP x y * log b (?P y)"
    by (auto simp add: log_simps zero_less_mult_iff field_simps)
  from sum_split log_split show ?thesis
    unfolding sf[THEN entropy_eq] conditional_entropy_eq[OF sf]
    using setsum_real_joint_distribution_singleton[OF fX fP]
    by (simp add: setsum_cartesian_product' setsum_subtractf setsum_real_distribution
      setsum_left_distrib[symmetric] setsum_commute[where B="P`space M"])
qed
hoelzl@36624
  1093
hoelzl@40859
  1094
(* Applying a function f to a simple function X does not increase entropy.
   Obtained from entropy_partition with P instantiated to f o X, together
   with conditional_entropy_positive. *)
corollary (in information_space) entropy_data_processing:
  assumes X: "simple_function X" shows "\<H>(f \<circ> X) \<le> \<H>(X)"
proof -
  have fX: "simple_function (f \<circ> X)" using X by auto
  have finer: "subvimage (space M) X (f \<circ> X)" by auto
  from X fX finer
  have "\<H>(X) = \<H>(f\<circ>X) + \<H>(X|f\<circ>X)" by (rule entropy_partition)
  then show "\<H>(f \<circ> X) \<le> \<H>(X)"
    by (auto intro: conditional_entropy_positive[OF X fX])
qed
hoelzl@36624
  1104
hoelzl@40859
  1105
(* Composing X with a function that is injective on X ` space M leaves the
   entropy unchanged. Both inequalities come from entropy_data_processing:
   once for f, once for the inverse of f on X ` space M. *)
corollary (in information_space) entropy_of_inj:
  assumes X: "simple_function X" and inj: "inj_on f (X`space M)"
  shows "\<H>(f \<circ> X) = \<H>(X)"
proof (rule antisym)
  from entropy_data_processing[OF X]
  show "\<H>(f \<circ> X) \<le> \<H>(X)" .
next
  have sf: "simple_function (f \<circ> X)"
    using X by auto
  (* Undo f with the_inv_into, then apply data processing to f o X. *)
  have "\<H>(X) = \<H>(the_inv_into (X`space M) f \<circ> (f \<circ> X))"
    by (auto intro!: mutual_information_cong simp: entropy_def the_inv_into_f_f[OF inj])
  also have "... \<le> \<H>(f \<circ> X)"
    using entropy_data_processing[OF sf] .
  finally show "\<H>(X) \<le> \<H>(f \<circ> X)" .
qed
hoelzl@36624
  1119
hoelzl@36080
  1120
end