50222
|
1 |
# Title: HOL/Tools/Sledgehammer/MaSh/src/stats.py
|
|
2 |
# Author: Daniel Kuehlwein, ICIS, Radboud University Nijmegen
|
|
3 |
# Copyright 2012
|
|
4 |
#
|
|
5 |
# Statistics collector.
|
|
6 |
|
50220
|
7 |
'''
|
|
8 |
Created on Jul 9, 2012
|
|
9 |
|
|
10 |
@author: Daniel Kuehlwein
|
|
11 |
'''
|
|
12 |
|
|
13 |
import logging,string
|
|
14 |
from cPickle import load,dump
|
|
15 |
|
|
16 |
class Statistics(object):
|
|
17 |
'''
|
|
18 |
Class for all the statistics
|
|
19 |
'''
|
|
20 |
|
|
21 |
def __init__(self,cutOff=500):
|
|
22 |
'''
|
|
23 |
Constructor
|
|
24 |
'''
|
|
25 |
self.logger = logging.getLogger('Statistics')
|
|
26 |
self.avgAUC = 0.0
|
|
27 |
self.avgRecall100 = 0.0
|
|
28 |
self.avgAvailable = 0.0
|
|
29 |
self.avgDepNr = 0.0
|
|
30 |
self.problems = 0.0
|
|
31 |
self.cutOff = cutOff
|
|
32 |
self.recallData = [0]*cutOff
|
|
33 |
self.recall100Data = [0]*cutOff
|
|
34 |
self.aucData = []
|
50388
|
35 |
self.premiseOccurenceCounter = {}
|
|
36 |
self.firstDepAppearance = {}
|
|
37 |
self.depAppearances = []
|
|
38 |
|
|
39 |
def update(self,predictions,dependencies,statementCounter):
|
50220
|
40 |
"""
|
|
41 |
Evaluates AUC, dependencies, recall100 and number of available premises of a prediction.
|
|
42 |
"""
|
|
43 |
available = len(predictions)
|
|
44 |
predictions = predictions[:self.cutOff]
|
|
45 |
dependencies = set(dependencies)
|
50388
|
46 |
# No Stats for if no dependencies
|
|
47 |
if len(dependencies) == 0:
|
|
48 |
self.logger.debug('No Dependencies for statement %s' % statementCounter )
|
|
49 |
self.badPreds = []
|
|
50 |
return
|
|
51 |
if len(predictions) < self.cutOff:
|
|
52 |
for i in range(len(predictions),self.cutOff):
|
|
53 |
self.recall100Data[i] += 1
|
|
54 |
self.recallData[i] += 1
|
|
55 |
for d in dependencies:
|
|
56 |
if self.premiseOccurenceCounter.has_key(d):
|
|
57 |
self.premiseOccurenceCounter[d] += 1
|
|
58 |
else:
|
|
59 |
self.premiseOccurenceCounter[d] = 1
|
|
60 |
if self.firstDepAppearance.has_key(d):
|
|
61 |
self.depAppearances.append(statementCounter-self.firstDepAppearance[d])
|
|
62 |
else:
|
|
63 |
self.firstDepAppearance[d] = statementCounter
|
50220
|
64 |
depNr = len(dependencies)
|
50388
|
65 |
aucSum = 0.
|
|
66 |
posResults = 0.
|
50220
|
67 |
positives, negatives = 0, 0
|
|
68 |
recall100 = 0.0
|
|
69 |
badPreds = []
|
|
70 |
depsFound = []
|
|
71 |
for index,pId in enumerate(predictions):
|
|
72 |
if pId in dependencies: #positive
|
|
73 |
posResults+=1
|
|
74 |
positives+=1
|
|
75 |
recall100 = index+1
|
|
76 |
depsFound.append(pId)
|
|
77 |
if index > 200:
|
|
78 |
badPreds.append(pId)
|
50388
|
79 |
else:
|
50220
|
80 |
aucSum += posResults
|
|
81 |
negatives+=1
|
|
82 |
# Update Recall and Recall100 stats
|
|
83 |
if depNr == positives:
|
|
84 |
self.recall100Data[index] += 1
|
|
85 |
if depNr == 0:
|
|
86 |
self.recallData[index] += 1
|
|
87 |
else:
|
|
88 |
self.recallData[index] += float(positives)/depNr
|
50388
|
89 |
|
50220
|
90 |
if not depNr == positives:
|
|
91 |
depsFound = set(depsFound)
|
|
92 |
missing = []
|
|
93 |
for dep in dependencies:
|
|
94 |
if not dep in depsFound:
|
|
95 |
missing.append(dep)
|
|
96 |
badPreds.append(dep)
|
|
97 |
recall100 = len(predictions)+1
|
|
98 |
positives+=1
|
|
99 |
self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\
|
|
100 |
string.join([str(dep) for dep in missing],','))
|
50388
|
101 |
|
50220
|
102 |
if positives == 0 or negatives == 0:
|
|
103 |
auc = 1.0
|
50388
|
104 |
else:
|
50220
|
105 |
auc = aucSum/(negatives*positives)
|
50388
|
106 |
|
50220
|
107 |
self.aucData.append(auc)
|
|
108 |
self.avgAUC += auc
|
|
109 |
self.avgRecall100 += recall100
|
|
110 |
self.problems += 1
|
|
111 |
self.badPreds = badPreds
|
50388
|
112 |
self.avgAvailable += available
|
50220
|
113 |
self.avgDepNr += depNr
|
50388
|
114 |
self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
|
|
115 |
statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff)
|
|
116 |
|
50220
|
117 |
def printAvg(self):
|
|
118 |
self.logger.info('Average results:')
|
|
119 |
self.logger.info('avgAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t cutOff:%s', \
|
50388
|
120 |
round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),self.cutOff)
|
50220
|
121 |
|
50388
|
122 |
#try:
|
|
123 |
if True:
|
50220
|
124 |
from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
|
|
125 |
avgRecall = [float(x)/self.problems for x in self.recallData]
|
|
126 |
figure('Recall')
|
|
127 |
plot(range(self.cutOff),avgRecall)
|
|
128 |
ylabel('Average Recall')
|
|
129 |
xlabel('Highest ranked premises')
|
|
130 |
axis([0,self.cutOff,0.0,1.0])
|
|
131 |
figure('100%Recall')
|
|
132 |
plot(range(self.cutOff),self.recall100Data)
|
|
133 |
ylabel('100%Recall')
|
|
134 |
xlabel('Highest ranked premises')
|
|
135 |
axis([0,self.cutOff,0,self.problems])
|
|
136 |
figure('AUC Histogram')
|
|
137 |
hist(self.aucData,bins=100)
|
|
138 |
ylabel('Problems')
|
|
139 |
xlabel('AUC')
|
50388
|
140 |
maxCount = max(self.premiseOccurenceCounter.values())
|
|
141 |
minCount = min(self.premiseOccurenceCounter.values())
|
|
142 |
figure('Dependency Occurances')
|
|
143 |
hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left')
|
|
144 |
#ylabel('Occurences')
|
|
145 |
xlabel('Number of Times a Dependency Occurs')
|
|
146 |
figure('Dependency Appearance in Problems after Introduction.')
|
|
147 |
hist(self.depAppearances,bins=50)
|
|
148 |
figure('Dependency Appearance in Problems after Introduction in Percent.')
|
|
149 |
xAxis = range(max(self.depAppearances)+1)
|
|
150 |
yAxis = [0] * (max(self.depAppearances)+1)
|
|
151 |
for val in self.depAppearances:
|
|
152 |
yAxis[val] += 1
|
|
153 |
yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis]
|
|
154 |
plot(xAxis,yAxis)
|
50220
|
155 |
show()
|
50388
|
156 |
#except:
|
|
157 |
# self.logger.warning('Matplotlib module missing. Skipping graphs.')
|
|
158 |
|
|
159 |
def save(self,fileName):
|
50220
|
160 |
oStream = open(fileName, 'wb')
|
50388
|
161 |
dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter),oStream)
|
50220
|
162 |
oStream.close()
|
|
163 |
def load(self,fileName):
|
50388
|
164 |
iStream = open(fileName, 'rb')
|
|
165 |
self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter = load(iStream)
|
|
166 |
iStream.close()
|