# HG changeset patch
# User blanchet
# Date 1357918256 -3600
# Node ID aba769dc82e9f3c15fb7b2b29fc6b141cf8f2b34
# Parent  18ace05656cf2b3c009fb060bb4ea83d2c22c702
updated MaSh Python component

diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Fri Jan 11 16:30:56 2013 +0100
@@ -31,6 +31,7 @@
         self.featureDict = {}
         self.dependenciesDict = {}
         self.accessibleDict = {}
+        self.featureCountDict = {}
         self.expandedAccessibles = {}
         self.changed = True
 
@@ -38,8 +39,8 @@
     Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled!
     """
     def init_featureDict(self,featureFile):
-        self.featureDict,self.maxNameId,self.maxFeatureId = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
-                                                                                self.maxFeatureId,featureFile)
+        self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
+                                                                                self.maxFeatureId,self.featureCountDict,featureFile)
     def init_dependenciesDict(self,depFile):
         self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
     def init_accessibleDict(self,accFile):
@@ -73,9 +74,12 @@
     def add_feature(self,featureName):
         if not self.featureIdDict.has_key(featureName):
             self.featureIdDict[featureName] = self.maxFeatureId
+            self.featureCountDict[self.maxFeatureId] = 0
             self.maxFeatureId += 1
             self.changed = True
-        return self.featureIdDict[featureName]
+        fId = self.featureIdDict[featureName]
+        self.featureCountDict[fId] += 1
+        return fId
 
     def get_features(self,line):
         # Feature Ids
@@ -83,12 +87,12 @@
         features = []
         for fn in featureNames:
             tmp = fn.split('=')
+            weight = 1.0
             if len(tmp) == 2:
-                fId = self.add_feature(tmp[0])
-                features.append((fId,float(tmp[1])))
-            else:
-                fId = self.add_feature(fn)
-                features.append((fId,1.0))
+                fn = tmp[0]
+                weight = float(tmp[1])
+            fId = self.add_feature(tmp[0])
+            features.append((fId,weight))
         return features
 
     def expand_accessibles(self,acc):
@@ -150,7 +154,7 @@
 
     def parse_problem(self,line):
         """
-        Parses a problem and returns the features and the accessibles.
+        Parses a problem and returns the features, the accessibles, and any hints.
         """
         assert line.startswith('? ')
         line = line[2:]
@@ -176,19 +180,24 @@
             self.changed = True
         accessibles = self.expand_accessibles(unExpAcc)
         features = self.get_features(line)
+        # Get hints:
+        if len(line) == 3:
+            hints = [self.nameIdDict[d.strip()] for d in line[2].split()]
+        else:
+            hints = []
 
-        return name,features,accessibles
+        return name,features,accessibles,hints
 
     def save(self,fileName):
         if self.changed:
             dictsStream = open(fileName, 'wb')
             dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict),dictsStream)
+                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict),dictsStream)
             self.changed = False
             dictsStream.close()
     def load(self,fileName):
         dictsStream = open(fileName, 'rb')
         self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-            self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
+            self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict = load(dictsStream)
        self.changed = False
        dictsStream.close()
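
Note: the dictionaries.py hunks above make weighted features first-class. A token of the form name=weight now yields the pair (featureId, weight), a bare name defaults to weight 1.0, and the new featureCountDict records how often each feature id has been seen. A minimal standalone sketch of that parsing logic (function and variable names here are illustrative, not from the patch):

    def parse_features(tokens, featureIdDict, featureCountDict):
        # Mirrors add_feature/get_features above: 'nat=2.0' -> (id, 2.0),
        # bare 'nat' -> (id, 1.0); featureCountDict counts occurrences.
        features = []
        for token in tokens:
            name, weight = token, 1.0
            parts = token.split('=')
            if len(parts) == 2:
                name, weight = parts[0], float(parts[1])
            if name not in featureIdDict:       # fresh feature: next id
                featureIdDict[name] = len(featureIdDict)
                featureCountDict[featureIdDict[name]] = 0
            fId = featureIdDict[name]
            featureCountDict[fId] += 1          # global occurrence count
            features.append((fId, weight))
        return features

    # parse_features(['nat=2.0', 'list'], {}, {}) == [(0, 2.0), (1, 1.0)]
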
diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Fri Jan 11 16:30:56 2013 +0100
@@ -48,6 +48,8 @@
                     help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
 parser.add_argument('--saveModel',default=False,action='store_true',help="Stores the learned Model at the end of a prediction run. Default=False.")
 parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.")
+#DEBUG: Change sineprioir default to false
+parser.add_argument('--sinePrior',default=True,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
 
 parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.")
 
@@ -87,7 +89,7 @@
     # Pick algorithm
     if args.nb:
         logger.info('Using sparse Naive Bayes for learning.')
-        model = sparseNBClassifier()
+        model = sparseNBClassifier(args.sinePrior)
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     elif args.snow:
         logger.info('Using naive bayes (SNoW) for learning.')
@@ -101,7 +103,7 @@
         modelFile = os.path.join(args.outputDir,'mepo.pickle')
     else:
         logger.info('No algorithm specified. Using sparse Naive Bayes.')
-        model = sparseNBClassifier()
+        model = sparseNBClassifier(args.sinePrior)
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     dictsFile = os.path.join(args.outputDir,'dicts.pickle')
     theoryFile = os.path.join(args.outputDir,'theory.pickle')
@@ -182,7 +184,7 @@
             # Update Dependencies, p proves p
             dicts.dependenciesDict[problemId] = [problemId]+dicts.dependenciesDict[problemId]
             if args.learnTheories:
-                theoryModels.update(problemId,dicts)
+                theoryModels.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId],dicts)
             if args.snow:
                 model.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId],dicts)
             else:
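
Note on the new --sinePrior option above, already flagged by the in-diff DEBUG comment ("Change sineprioir default to false"): combining default=True with action='store_true' means the flag can never be switched off from the command line, so args.sinePrior is always True. A small standalone demonstration of the argparse behaviour (not part of the patch):

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('--sinePrior', default=True, action='store_true')
    # store_true can only set the value to True; with default=True the
    # option is True whether or not it is passed:
    assert parser.parse_args([]).sinePrior is True
    assert parser.parse_args(['--sinePrior']).sinePrior is True
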
@@ -200,16 +202,34 @@
             computeStats = True
             if args.predef:
                 continue
-            name,features,accessibles = dicts.parse_problem(line)
+            name,features,accessibles,hints = dicts.parse_problem(line)
             # Create predictions
             logger.info('Starting computation for problem on line %s',lineCounter)
+            # Update Models with hints
+            if not hints == []:
+                if args.learnTheories:
+                    accessibleTheories = set([(dicts.idNameDict[x]).split('.')[0] for x in accessibles])
+                    theoryModels.update_with_acc('hints',features,hints,dicts,accessibleTheories)
+                if args.snow:
+                    pass
+                else:
+                    model.update('hints',features,hints)
+
+            # Predict premises
             if args.learnTheories:
                 predictedTheories,accessibles = theoryModels.predict(features,accessibles,dicts)
-            if args.snow:
-                predictions,predictionValues = model.predict(features,accessibles,dicts)
-            else:
-                predictions,predictionValues = model.predict(features,accessibles)
+            predictions,predictionValues = model.predict(features,accessibles,dicts)
             assert len(predictions) == len(predictionValues)
+
+            # Delete hints
+            if not hints == []:
+                if args.learnTheories:
+                    theoryModels.delete('hints',features,hints,dicts)
+                if args.snow:
+                    pass
+                else:
+                    model.delete('hints',features,hints)
+
             logger.info('Done. %s seconds needed.',round(time()-startTime,2))
             # Output
             predictionNames = [str(dicts.idNameDict[p]) for p in predictions[:args.numberOfPredictions]]
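
Note: the hunk above threads one-off hints through the model. A temporary data point named 'hints' is learned before prediction and unlearned immediately afterwards, so the query benefits from the hint premises without permanently changing the model. Schematically (a sketch only; in the patch itself predict also takes dicts, and the update/delete calls are explicit rather than wrapped in try/finally):

    def predict_with_hints(model, features, accessibles, hints):
        # Temporarily learn a pseudo-fact 'hints' proved by the hint premises.
        if hints:
            model.update('hints', features, hints)
        try:
            return model.predict(features, accessibles)
        finally:
            # Undo the temporary update so the model state is unchanged.
            if hints:
                model.delete('hints', features, hints)
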
@@ -253,10 +273,19 @@
 
 if __name__ == '__main__':
     # Example:
+    # Auth
+    # ISAR Theories
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--learnTheories']
+    #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
+    # ISAR MePo
+    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--predef']
+    #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
+
+    # Jinja
     # ISAR Theories
-    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/','--learnTheories']
-    #args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Jinja/','--learnTheories']
+    #args = ['-i', '../data/20121227b/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
     # ISAR NB
     #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/']
     #args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500']
 
@@ -277,6 +306,9 @@
 
     # Probability
+    # ISAR Theories
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/','--learnTheories']
+    #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
     # ISAR NB
     #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/']
     #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/ProbIsarNB.stats','--cutOff','500']
 
diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Fri Jan 11 16:30:56 2013 +0100
@@ -14,7 +14,7 @@
 
 import sys,logging
 
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,inputFile):
+def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,inputFile):
     logger = logging.getLogger('create_feature_dict')
     featureDict = {}
     IS = open(inputFile,'r')
@@ -32,23 +32,24 @@
             maxNameId += 1
         # Feature Ids
         featureNames = [f.strip() for f in line[1].split()]
-        features = []
+        features = []
         for fn in featureNames:
+            weight = 1.0
             tmp = fn.split('=')
             if len(tmp) == 2:
-                if not featureIdDict.has_key(tmp[0]):
-                    featureIdDict[tmp[0]] = maxFeatureId
-                    maxFeatureId += 1
-                features.append((featureIdDict[tmp[0]],float(tmp[1])))
-            else:
-                if not featureIdDict.has_key(fn):
-                    featureIdDict[fn] = maxFeatureId
-                    maxFeatureId += 1
-                features.append((featureIdDict[fn],1.0))
+                fn = tmp[0]
+                weight = float(tmp[1])
+            if not featureIdDict.has_key(fn):
+                featureIdDict[fn] = maxFeatureId
+                featureCountDict[maxFeatureId] = 0
+                maxFeatureId += 1
+            fId = featureIdDict[fn]
+            features.append((fId,weight))
+            featureCountDict[fId] += 1
         # Store results
         featureDict[nameId] = features
     IS.close()
-    return featureDict,maxNameId,maxFeatureId
+    return featureDict,maxNameId,maxFeatureId,featureCountDict
 
 def create_dependencies_dict(nameIdDict,inputFile):
     logger = logging.getLogger('create_dependencies_dict')
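
Note: the commented-out argument lists in mash.py above double as usage examples. Since the options are ordinary argparse flags, the first Auth example corresponds to an invocation along these lines (assuming mash.py is run as a script from src/ with the parser reading sys.argv, and the data paths exist locally):

    python mash.py -l testNB.log -o ../tmp/ --statistics --init \
        --inputDir ../data/20121227b/Auth/ --learnTheories
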
diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
@@ -61,7 +61,7 @@
             self.pos -= 1
         else:
             self.neg -= 1
-        for f in features:
+        for f,_w in features:
             posCount,negCount = self.counts[f]
             if label:
                 posCount -= 1
@@ -79,7 +79,7 @@
 
     def predict_sparse(self,features):
         """
-        Returns 1 if the probability is greater than 50%.
+        Returns 1 if the probability of + being the correct label is greater than the probability that - is the correct label.
         """
         if self.neg == 0:
             return 1
 
diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
@@ -19,11 +19,15 @@
     An updateable naive Bayes classifier.
     '''
 
-    def __init__(self):
+    def __init__(self,useSinePrior = False):
         '''
         Constructor
         '''
         self.counts = {}
+        self.sinePrior = useSinePrior
+        self.defaultPriorWeight = 20
+        self.posWeight = 20
+        self.defVal = 15
 
     def initializeModel(self,trainData,dicts):
         """
@@ -32,6 +36,10 @@
         for d in trainData:
             dPosCount = 0
             dFeatureCounts = {}
+            # DEBUG: give p |- p a higher weight
+            dPosCount = self.defaultPriorWeight
+            for f,_w in dicts.featureDict[d]:
+                dFeatureCounts[f] = self.defaultPriorWeight
             self.counts[d] = [dPosCount,dFeatureCounts]
 
         for key in dicts.dependenciesDict.keys():
@@ -53,8 +61,12 @@
         """
         if not self.counts.has_key(dataPoint):
             dPosCount = 0
-            dFeatureCounts = {}
-            self.counts[dataPoint] = [dPosCount,dFeatureCounts]
+            dFeatureCounts = {}
+            # DEBUG: give p |- p a higher weight
+            dPosCount = self.defaultPriorWeight
+            for f,_w in features:
+                dFeatureCounts[f] = self.defaultPriorWeight
+            self.counts[dataPoint] = [dPosCount,dFeatureCounts]
         for dep in dependencies:
             self.counts[dep][0] += 1
         for f,_w in features:
@@ -69,7 +81,7 @@
         """
         for dep in dependencies:
             self.counts[dep][0] -= 1
-            for f in features:
+            for f,_w in features:
                 self.counts[dep][1][f] -= 1
 
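
Note: in sparseNaiveBayes.py above, a newly seen fact no longer starts from zero counts. It is initialized as if it had already proved itself defaultPriorWeight times, with each of its own features observed every time (the "p |- p" bias named in the DEBUG comments). A standalone sketch of that initialization (names illustrative):

    def fresh_counts(features, defaultPriorWeight=20):
        # A new data point starts with a positive count and per-feature
        # counts of defaultPriorWeight instead of zero.
        dPosCount = defaultPriorWeight
        dFeatureCounts = dict((f, defaultPriorWeight) for f, _w in features)
        return [dPosCount, dFeatureCounts]

    # fresh_counts([(0, 1.0), (3, 2.0)]) == [20, {0: 20, 3: 20}]
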
""" - posWeight = 20.0 - defVal = 15 predictions = [] for a in accessibles: posA = self.counts[a][0] @@ -101,12 +111,12 @@ #w = 1 if f in fA: if fWeightsA[f] == 0: - resultA -= w*defVal + resultA -= w*self.defVal else: assert fWeightsA[f] <= posA - resultA += w*log(float(posWeight*fWeightsA[f])/posA) + resultA += w*log(float(self.posWeight*fWeightsA[f])/posA) else: - resultA -= w*defVal + resultA -= w*self.defVal predictions.append(resultA) #expPredictions = array([exp(x) for x in predictions]) predictions = array(predictions) diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Fri Jan 11 16:30:56 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Fri Jan 11 16:30:56 2013 +0100 @@ -87,15 +87,13 @@ for a in self.accTheories: self.theoryModels[a].overwrite(features,a in oldTheories,a in newTheories) - def delete(self): - pass + def delete(self,problemId,features,dependencies,dicts): + tmp = [dicts.idNameDict[x] for x in dependencies] + usedTheories = set([x.split('.')[0] for x in tmp]) + for a in self.accessibleTheories: + self.theoryModels[a].delete(features,a in usedTheories) - def update(self,problemId,dicts): - features = dicts.featureDict[problemId] - - # Find the actually used theories - tmp = [dicts.idNameDict[x] for x in dicts.dependenciesDict[problemId]] - usedTheories = set([x.split('.')[0] for x in tmp]) + def update(self,problemId,features,dependencies,dicts): currentTheory = (dicts.idNameDict[problemId]).split('.')[0] # Create new theory model, if there is a new theory if not self.theoryDict.has_key(currentTheory): @@ -105,9 +103,15 @@ self.currentTheory = currentTheory theoryModel = singleNBClassifier() self.theoryModels[currentTheory] = theoryModel - self.accessibleTheories.add(self.currentTheory) - if not len(usedTheories) == 0: - for a in self.accessibleTheories: + self.accessibleTheories.add(self.currentTheory) + self.update_with_acc(problemId,features,dependencies,dicts,self.accessibleTheories) + + def update_with_acc(self,problemId,features,dependencies,dicts,accessibleTheories): + # Find the actually used theories + tmp = [dicts.idNameDict[x] for x in dependencies] + usedTheories = set([x.split('.')[0] for x in tmp]) + if not len(usedTheories) == 0: + for a in accessibleTheories: self.theoryModels[a].update(features,a in usedTheories) def predict(self,features,accessibles,dicts): diff -r 18ace05656cf -r aba769dc82e9 src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML --- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Fri Jan 11 16:30:56 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Fri Jan 11 16:30:56 2013 +0100 @@ -313,7 +313,7 @@ local -val version = "*** MaSh version 20130104a ***" +val version = "*** MaSh version 20130111a ***" exception Too_New of unit