isabelle: comparison src/HOL/Tools/Sledgehammer/MaSh/src/fullNaiveBayes.py

equal deleted inserted replaced

-:5c7780d21d24
+:1133b9e83f09
+'''
+Created on Jul 11, 2012
+@author: Daniel Kuehlwein
+'''
+from cPickle import dump,load
+from numpy import array,exp
+from math import log
class NBClassifier(object):
    """
    An updateable naive Bayes classifier.

    Two tables are maintained, both mapping a name to ``[n, {feature: m}]``:

    * ``counts[x]``    -- ``n`` is how often ``x`` was learned itself or was an
      accessible dependency of a learned data point; ``m`` is how often
      ``feature`` occurred on those occasions.
    * ``negCounts[x]`` -- the same bookkeeping for the occasions on which ``x``
      was accessible but was *not* a dependency.

    The ``dicts`` argument taken by several methods must provide the
    attributes ``accessibleDict``, ``dependenciesDict``, ``featureDict`` and
    ``expandedAccessibles`` (a dict used as a cache) plus the method
    ``expand_accessibles``.

    NOTE(review): this class was reconstructed from a flattened listing
    without indentation; statement nesting follows the only reading that is
    consistent with the control flow, but should be confirmed against the
    repository version.
    """

    # Fixed log-space amount used in `predict` wherever the observed feature
    # frequency is zero (so that log(0) is never evaluated).
    _penalty = 15

    # Size at which the expanded-accessibles cache is thrown away.
    _cacheLimit = 100

    def __init__(self):
        """
        Constructor. Starts with empty positive and negative count tables.
        """
        self.counts = {}
        self.negCounts = {}

    @staticmethod
    def _increment(table, key, features):
        """Add one occurrence of `key` with `features` to `table`."""
        entry = table.setdefault(key, [0, {}])
        entry[0] += 1
        featureCounts = entry[1]
        for f in features:
            featureCounts[f] = featureCounts.get(f, 0) + 1

    @staticmethod
    def _decrement(table, key, features):
        """Remove one occurrence of `key` with `features` from `table`.

        Raises KeyError if `key` or one of `features` was never counted.
        """
        entry = table[key]
        entry[0] -= 1
        featureCounts = entry[1]
        for f in features:
            featureCounts[f] -= 1

    def _accessibles(self, dataPoint, dicts):
        """Return the expanded accessibles of `dataPoint`, using the cache in `dicts`."""
        if dataPoint in dicts.expandedAccessibles:
            return dicts.expandedAccessibles[dataPoint]
        # The cache is simply reset once it reaches the size limit.
        if len(dicts.expandedAccessibles) >= self._cacheLimit:
            dicts.expandedAccessibles = {}
        expanded = dicts.expand_accessibles(dicts.accessibleDict[dataPoint])
        dicts.expandedAccessibles[dataPoint] = expanded
        return expanded

    def _learn(self, dataPoint, features, dependencies, dicts):
        """Count `dataPoint` as one training example in both tables."""
        accessibles = self._accessibles(dataPoint, dicts)
        deps = set(dependencies)
        self.negCounts.setdefault(dataPoint, [0, {}])
        # d proves d
        self._increment(self.counts, dataPoint, features)
        for acc in accessibles:
            # predict() reads both tables, so every name needs both entries.
            self.counts.setdefault(acc, [0, {}])
            self.negCounts.setdefault(acc, [0, {}])
            if acc in deps:
                self._increment(self.counts, acc, features)
            else:
                self._increment(self.negCounts, acc, features)

    def initializeModel(self, trainData, dicts):
        """
        Build basic model from training data.

        For every `d` in `trainData` both table entries of `d` are reset and
        `d` is then learned with the features and dependencies recorded for
        it in `dicts`.
        """
        for d in trainData:
            self.counts[d] = [0, {}]
            self.negCounts[d] = [0, {}]
            self._learn(d, dicts.featureDict[d], dicts.dependenciesDict[d], dicts)

    def update(self, dataPoint, features, dependencies, dicts):
        """
        Updates the Model with one additional data point.

        `features` and `dependencies` are the ones passed in (they are not
        re-read from `dicts`); `dicts` is only used to determine the
        accessibles of `dataPoint`.
        """
        self._learn(dataPoint, features, dependencies, dicts)

    def delete(self, dataPoint, features, dependencies, dicts=None):
        """
        Deletes a single datapoint from the model.

        Without `dicts` only the positive counts of `dependencies` are
        decremented. With `dicts` the call is the exact inverse of
        `update(dataPoint, features, dependencies, dicts)`: the count of
        `dataPoint` itself and the negative counts of the remaining
        accessibles are decremented as well.

        Raises KeyError if something is removed that was never counted.
        """
        if dicts is None:
            for dep in dependencies:
                self._decrement(self.counts, dep, features)
            return
        deps = set(dependencies)
        self._decrement(self.counts, dataPoint, features)
        for acc in self._accessibles(dataPoint, dicts):
            if acc in deps:
                self._decrement(self.counts, acc, features)
            else:
                self._decrement(self.negCounts, acc, features)

    def overwrite(self, problemId, newDependencies, dicts):
        """
        Deletes the old dependencies of problemId and replaces them with the new ones. Updates the model accordingly.

        The old dependencies and the features are read from `dicts`; `dicts`
        itself is not modified apart from its accessibles cache.
        """
        assert problemId in self.counts
        oldDeps = dicts.dependenciesDict[problemId]
        features = dicts.featureDict[problemId]
        self.delete(problemId, features, oldDeps, dicts)
        self.update(problemId, features, newDependencies, dicts)

    def predict(self, features, accessibles):
        """
        For each accessible, predicts the probability of it being useful given the features.
        Returns a ranking of the accessibles.

        The result is a pair of numpy arrays ``(names, scores)``, both ordered
        by descending score. Raises KeyError for an accessible that is missing
        from the count tables.
        """
        penalty = self._penalty
        predictions = []
        for a in accessibles:
            posA, fPosWeightsA = self.counts[a]
            negA, fNegWeightsA = self.negCounts[a]
            if negA == 0:
                resultA = 0
            elif posA == 0:
                # NOTE(review): debugging leftover kept as-is; this terminates
                # the whole process. Consider raising an exception instead.
                print(a)
                print('xx')
                import sys
                sys.exit(-1)
            else:
                resultA = log(posA) - log(negA)
                # NOTE(review): the feature loop is assumed to belong to this
                # branch; the flattened listing does not show its nesting.
                for f in features:
                    # P(f | a)
                    posWeight = fPosWeightsA.get(f, 0)
                    if posWeight == 0:
                        resultA -= penalty
                    else:
                        assert posWeight <= posA
                        resultA += log(float(posWeight) / posA)
                    # P(f | not a)
                    negWeight = fNegWeightsA.get(f, 0)
                    if negWeight == 0:
                        resultA += penalty
                    else:
                        assert negWeight <= negA
                        resultA -= log(float(negWeight) / negA)
            predictions.append(resultA)
        predictions = array(predictions)
        perm = (-predictions).argsort()
        return array(accessibles)[perm], predictions[perm]

    def save(self, fileName):
        """Pickle both count tables to `fileName`."""
        with open(fileName, 'wb') as OStream:
            dump((self.counts, self.negCounts), OStream)

    def load(self, fileName):
        """Replace both count tables by the ones pickled in `fileName`.

        Unpickling can execute arbitrary code; only load files you trust.
        """
        with open(fileName, 'rb') as IStream:
            self.counts, self.negCounts = load(IStream)
if __name__ == '__main__':
    # Small smoke test: train on two data points, add a third, rank, and
    # round-trip the model through a file named 'x' in the working directory.

    class _DemoDicts(object):
        """Minimal stand-in providing the attributes NBClassifier reads from `dicts`."""

        def __init__(self):
            self.featureDict = {0: [0, 1, 2], 1: [3, 2, 1], 2: [14, 1, 3]}
            self.dependenciesDict = {0: [0], 1: [0, 1], 2: [0, 2]}
            # Demo assumption: each data point can access all earlier ones.
            self.accessibleDict = {0: [], 1: [0], 2: [0, 1]}
            self.expandedAccessibles = {}

        def expand_accessibles(self, accessibles):
            # The demo accessibles are already fully expanded.
            return list(accessibles)

    libDicts = _DemoDicts()
    c = NBClassifier()
    c.initializeModel([0, 1], libDicts)
    c.update(2, [14, 1, 3], [0, 2], libDicts)
    print(c.counts)
    print(c.predict([0, 14], [0, 1, 2]))
    c.save('x')
    d = NBClassifier()
    d.load('x')
    print(c.counts)
    print(d.counts)
    print('Done')