# HG changeset patch
# User blanchet
# Date 1355267698 -3600
# Node ID d7be7ccf428b76a272bdbbedc08aa4aa66fa02db
# Parent  5d147d492792c7af7d9cb1ddf3890cdfd2aa2ce5
updated version of MaSh learner engine

diff -r 5d147d492792 -r d7be7ccf428b src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Wed Dec 12 00:14:58 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Wed Dec 12 00:14:58 2012 +0100
@@ -21,7 +21,8 @@
 from stats import Statistics
 from dictionaries import Dictionaries
 #from fullNaiveBayes import NBClassifier
-from naiveBayes import NBClassifier
+from sparseNaiveBayes import sparseNBClassifier
+#from naiveBayes import sparseNBClassifier
 from snow import SNoW
 from predefined import Predefined
 
@@ -83,7 +84,7 @@
     # Pick algorithm
     if args.nb:
         logger.info('Using sparse Naive Bayes for learning.')
-        model = NBClassifier()
+        model = sparseNBClassifier()
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     elif args.snow:
         logger.info('Using naive bayes (SNoW) for learning.')
@@ -97,7 +98,7 @@
         modelFile = os.path.join(args.outputDir,'mepo.pickle')
     else:
         logger.info('No algorithm specified. Using sparse Naive Bayes.')
-        model = NBClassifier()
+        model = sparseNBClassifier()
         modelFile = os.path.join(args.outputDir,'NB.pickle')
 
     dictsFile = os.path.join(args.outputDir,'dicts.pickle')
diff -r 5d147d492792 -r d7be7ccf428b src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py	Wed Dec 12 00:14:58 2012 +0100
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,139 +0,0 @@
-# Title:      HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py
-# Author:     Daniel Kuehlwein, ICIS, Radboud University Nijmegen
-# Copyright   2012
-#
-# An updatable naive Bayes classifier.
-
-'''
-Created on Jul 11, 2012
-
-@author: Daniel Kuehlwein
-'''
-
-from cPickle import dump,load
-from numpy import array,exp
-from math import log
-
-class NBClassifier(object):
-    '''
-    An updateable naive Bayes classifier.
-    '''
-
-    def __init__(self):
-        '''
-        Constructor
-        '''
-        self.counts = {}
-
-    def initializeModel(self,trainData,dicts):
-        """
-        Build basic model from training data.
-        """
-        for d in trainData:
-            dPosCount = 0
-            dFeatureCounts = {}
-            self.counts[d] = [dPosCount,dFeatureCounts]
-
-        for key in dicts.dependenciesDict.keys():
-            # Add p proves p
-            keyDeps = [key]+dicts.dependenciesDict[key]
-
-            for dep in keyDeps:
-                self.counts[dep][0] += 1
-                depFeatures = dicts.featureDict[key]
-                for f,_w in depFeatures:
-                    if self.counts[dep][1].has_key(f):
-                        self.counts[dep][1][f] += 1
-                    else:
-                        self.counts[dep][1][f] = 1
-
-
-    def update(self,dataPoint,features,dependencies):
-        """
-        Updates the Model.
-        """
-        if not self.counts.has_key(dataPoint):
-            dPosCount = 0
-            dFeatureCounts = {}
-            self.counts[dataPoint] = [dPosCount,dFeatureCounts]
-        for dep in dependencies:
-            self.counts[dep][0] += 1
-            for f,_w in features:
-                if self.counts[dep][1].has_key(f):
-                    self.counts[dep][1][f] += 1
-                else:
-                    self.counts[dep][1][f] = 1
-
-    def delete(self,dataPoint,features,dependencies):
-        """
-        Deletes a single datapoint from the model.
-        """
-        for dep in dependencies:
-            self.counts[dep][0] -= 1
-            for f in features:
-                self.counts[dep][1][f] -= 1
-
-
-    def overwrite(self,problemId,newDependencies,dicts):
-        """
-        Deletes the old dependencies of problemId and replaces them with the new ones. Updates the model accordingly.
- """ - assert self.counts.has_key(problemId) - oldDeps = dicts.dependenciesDict[problemId] - features = dicts.featureDict[problemId] - self.delete(problemId,features,oldDeps) - self.update(problemId,features,newDependencies) - - def predict(self,features,accessibles): - """ - For each accessible, predicts the probability of it being useful given the features. - Returns a ranking of the accessibles. - """ - predictions = [] - for a in accessibles: - posA = self.counts[a][0] - fA = set(self.counts[a][1].keys()) - fWeightsA = self.counts[a][1] - resultA = log(posA) - for f,w in features: - if f in fA: - if fWeightsA[f] == 0: - resultA -= w*15 - else: - assert fWeightsA[f] <= posA - resultA += w*log(float(fWeightsA[f])/posA) - else: - resultA -= w*15 - predictions.append(resultA) - #expPredictions = array([exp(x) for x in predictions]) - predictions = array(predictions) - perm = (-predictions).argsort() - #return array(accessibles)[perm],expPredictions[perm] - return array(accessibles)[perm],predictions[perm] - - def save(self,fileName): - OStream = open(fileName, 'wb') - dump(self.counts,OStream) - OStream.close() - - def load(self,fileName): - OStream = open(fileName, 'rb') - self.counts = load(OStream) - OStream.close() - - -if __name__ == '__main__': - featureDict = {0:[0,1,2],1:[3,2,1]} - dependenciesDict = {0:[0],1:[0,1]} - libDicts = (featureDict,dependenciesDict,{}) - c = NBClassifier() - c.initializeModel([0,1],libDicts) - c.update(2,[14,1,3],[0,2]) - print c.counts - print c.predict([0,14],[0,1,2]) - c.storeModel('x') - d = NBClassifier() - d.loadModel('x') - print c.counts - print d.counts - print 'Done' diff -r 5d147d492792 -r d7be7ccf428b src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Wed Dec 12 00:14:58 2012 +0100 @@ -0,0 +1,139 @@ +# Title: HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py +# Author: Daniel Kuehlwein, ICIS, Radboud University Nijmegen +# Copyright 2012 +# +# An updatable naive Bayes classifier. + +''' +Created on Jul 11, 2012 + +@author: Daniel Kuehlwein +''' + +from cPickle import dump,load +from numpy import array,exp +from math import log + +class sparseNBClassifier(object): + ''' + An updateable naive Bayes classifier. + ''' + + def __init__(self): + ''' + Constructor + ''' + self.counts = {} + + def initializeModel(self,trainData,dicts): + """ + Build basic model from training data. + """ + for d in trainData: + dPosCount = 0 + dFeatureCounts = {} + self.counts[d] = [dPosCount,dFeatureCounts] + + for key in dicts.dependenciesDict.keys(): + # Add p proves p + keyDeps = [key]+dicts.dependenciesDict[key] + + for dep in keyDeps: + self.counts[dep][0] += 1 + depFeatures = dicts.featureDict[key] + for f,_w in depFeatures: + if self.counts[dep][1].has_key(f): + self.counts[dep][1][f] += 1 + else: + self.counts[dep][1][f] = 1 + + + def update(self,dataPoint,features,dependencies): + """ + Updates the Model. + """ + if not self.counts.has_key(dataPoint): + dPosCount = 0 + dFeatureCounts = {} + self.counts[dataPoint] = [dPosCount,dFeatureCounts] + for dep in dependencies: + self.counts[dep][0] += 1 + for f,_w in features: + if self.counts[dep][1].has_key(f): + self.counts[dep][1][f] += 1 + else: + self.counts[dep][1][f] = 1 + + def delete(self,dataPoint,features,dependencies): + """ + Deletes a single datapoint from the model. 
+ """ + for dep in dependencies: + self.counts[dep][0] -= 1 + for f in features: + self.counts[dep][1][f] -= 1 + + + def overwrite(self,problemId,newDependencies,dicts): + """ + Deletes the old dependencies of problemId and replaces them with the new ones. Updates the model accordingly. + """ + assert self.counts.has_key(problemId) + oldDeps = dicts.dependenciesDict[problemId] + features = dicts.featureDict[problemId] + self.delete(problemId,features,oldDeps) + self.update(problemId,features,newDependencies) + + def predict(self,features,accessibles): + """ + For each accessible, predicts the probability of it being useful given the features. + Returns a ranking of the accessibles. + """ + predictions = [] + for a in accessibles: + posA = self.counts[a][0] + fA = set(self.counts[a][1].keys()) + fWeightsA = self.counts[a][1] + resultA = log(posA) + for f,w in features: + if f in fA: + if fWeightsA[f] == 0: + resultA -= w*15 + else: + assert fWeightsA[f] <= posA + resultA += w*log(float(fWeightsA[f])/posA) + else: + resultA -= w*15 + predictions.append(resultA) + #expPredictions = array([exp(x) for x in predictions]) + predictions = array(predictions) + perm = (-predictions).argsort() + #return array(accessibles)[perm],expPredictions[perm] + return array(accessibles)[perm],predictions[perm] + + def save(self,fileName): + OStream = open(fileName, 'wb') + dump(self.counts,OStream) + OStream.close() + + def load(self,fileName): + OStream = open(fileName, 'rb') + self.counts = load(OStream) + OStream.close() + + +if __name__ == '__main__': + featureDict = {0:[0,1,2],1:[3,2,1]} + dependenciesDict = {0:[0],1:[0,1]} + libDicts = (featureDict,dependenciesDict,{}) + c = sparseNBClassifier() + c.initializeModel([0,1],libDicts) + c.update(2,[14,1,3],[0,2]) + print c.counts + print c.predict([0,14],[0,1,2]) + c.storeModel('x') + d = sparseNBClassifier() + d.loadModel('x') + print c.counts + print d.counts + print 'Done'