# HG changeset patch
# User blanchet
# Date 1354789510 -3600
# Node ID a5b666e0c3c261eeb44a032fd0449b52f29d8c3d
# Parent  3d8863c41fe854ea85371074ac54dae8bbac6b64
added weights to MaSh (by Daniel Kuehlwein)

diff -r 3d8863c41fe8 -r a5b666e0c3c2 src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py	Thu Dec 06 11:25:10 2012 +0100
@@ -21,12 +21,12 @@
 -------- Example Usage ---------------\n\
 ./compareStats.py --statFiles ../tmp/natISANB.stats ../tmp/natATPNB.stats -b 30\n\n\
 Author: Daniel Kuehlwein, July 2012',formatter_class=RawDescriptionHelpFormatter)
-parser.add_argument('--statFiles', default=None, nargs='+',
+parser.add_argument('--statFiles', default=None, nargs='+',
                     help='The names of the saved statistic files.')
 parser.add_argument('-b','--bins',default=50,help="Number of bins for the AUC histogram. Default=50.",type=int)
 
 def main(argv = sys.argv[1:]):
-    args = parser.parse_args(argv)
+    args = parser.parse_args(argv)
     if args.statFiles == None:
         print 'Filenames missing.'
         sys.exit(-1)
@@ -56,11 +56,10 @@
     legend(loc='upper left')
     ylabel('Problems')
     xlabel('AUC')
-
+    show()
 
 if __name__ == '__main__':
     #args = ['--statFiles','../tmp/natISANB.stats','../tmp/natATPNB.stats','-b','30']
     #sys.exit(main(args))
     sys.exit(main())
-
\ No newline at end of file
diff -r 3d8863c41fe8 -r a5b666e0c3c2 src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Thu Dec 06 11:25:10 2012 +0100
@@ -33,18 +33,18 @@
         self.accessibleDict = {}
         self.expandedAccessibles = {}
         self.changed = True
-
+
     """
    Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict get filled!
-    """
+    """
     def init_featureDict(self,featureFile):
         self.featureDict,self.maxNameId,self.maxFeatureId = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
-                                                                                self.maxFeatureId,featureFile)
+                                                                                self.maxFeatureId,featureFile)
     def init_dependenciesDict(self,depFile):
         self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
     def init_accessibleDict(self,accFile):
         self.accessibleDict,self.maxNameId = create_accessible_dict(self.nameIdDict,self.idNameDict,self.maxNameId,accFile)
-
+
     def init_all(self,inputFolder,featureFileName = 'mash_features',depFileName='mash_dependencies',accFileName = 'mash_accessibility'):
         featureFile = join(inputFolder,featureFileName)
         depFile = join(inputFolder,depFileName)
@@ -54,7 +54,7 @@
         self.init_dependenciesDict(depFile)
         self.expandedAccessibles = {}
         self.changed = True
-
+
     def get_name_id(self,name):
         """
         Return the Id for a name.
@@ -66,7 +66,7 @@
             self.nameIdDict[name] = self.maxNameId
             self.idNameDict[self.maxNameId] = name
             nameId = self.maxNameId
-            self.maxNameId += 1
+            self.maxNameId += 1
             self.changed = True
         return nameId
 
@@ -74,8 +74,23 @@
         if not self.featureIdDict.has_key(featureName):
             self.featureIdDict[featureName] = self.maxFeatureId
             self.maxFeatureId += 1
-        self.changed = True
-
+        self.changed = True
+        return self.featureIdDict[featureName]
+
+    def get_features(self,line):
+        # Feature Ids
+        featureNames = [f.strip() for f in line[1].split()]
+        features = []
+        for fn in featureNames:
+            tmp = fn.split('=')
+            if len(tmp) == 2:
+                fId = self.add_feature(tmp[0])
+                features.append((fId,float(tmp[1])))
+            else:
+                fId = self.add_feature(fn)
+                features.append((fId,1.0))
+        return features
+
     def expand_accessibles(self,acc):
         accessibles = set(acc)
         unexpandedQueue = Queue()
@@ -86,71 +101,67 @@
                 unexpandedQueue.put(a)
         while not unexpandedQueue.empty():
             nextUnExp = unexpandedQueue.get()
-            nextUnExpAcc = self.accessibleDict[nextUnExp]
+            nextUnExpAcc = self.accessibleDict[nextUnExp]
             for a in nextUnExpAcc:
                 if not a in accessibles:
                     accessibles = accessibles.union([a])
                     if self.expandedAccessibles.has_key(a):
                         accessibles = accessibles.union(self.expandedAccessibles[a])
                     else:
-                        unexpandedQueue.put(a)
+                        unexpandedQueue.put(a)
         return list(accessibles)
-
+
     def parse_fact(self,line):
         """
         Parses a single line, extracting accessibles, features, and dependencies.
         """
         assert line.startswith('! ')
         line = line[2:]
-
+
         # line = name:accessibles;features;dependencies
         line = line.split(':')
         name = line[0].strip()
         nameId = self.get_name_id(name)
-
-        line = line[1].split(';')
+
+        line = line[1].split(';')
         # Accessible Ids
         unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()]
         self.accessibleDict[nameId] = unExpAcc
-        # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]
-        for fn in featureNames:
-            self.add_feature(fn)
-        self.featureDict[nameId] = [self.featureIdDict[fn] for fn in featureNames]
+        self.featureDict[nameId] = self.get_features(line)
         self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
         self.changed = True
         return nameId
-
+
     def parse_overwrite(self,line):
         """
         Parses a single line, extracts the problemId and the Ids of the dependencies.
         """
         assert line.startswith('p ')
         line = line[2:]
-
+
         # line = name:dependencies
         line = line.split(':')
         name = line[0].strip()
         nameId = self.get_name_id(name)
-
+
         dependencies = [self.nameIdDict[d.strip()] for d in line[1].split()]
         self.changed = True
         return nameId,dependencies
-
+
     def parse_problem(self,line):
         """
-        Parses a problem and returns the features and the accessibles.
+        Parses a problem and returns the features and the accessibles.
         """
         assert line.startswith('? ')
         line = line[2:]
         name = None
-
+
         # Check whether there is a problem name:
         tmp = line.split(':')
         if len(tmp) == 2:
             name = tmp[0].strip()
             line = tmp[1]
-
+
         # line = accessibles;features
         line = line.split(';')
         # Accessible Ids, expand and store the accessibles.
@@ -164,13 +175,14 @@
                 self.expandedAccessibles[accId] = self.expand_accessibles(accIdAcc)
                 self.changed = True
         accessibles = self.expand_accessibles(unExpAcc)
-        # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]
-        for fn in featureNames:
-            self.add_feature(fn)
-        features = [self.featureIdDict[fn] for fn in featureNames]
-        return name,features,accessibles
-
+#        # Feature Ids
+#        featureNames = [f.strip() for f in line[1].split()]
+#        for fn in featureNames:
+#            self.add_feature(fn)
+#        features = [self.featureIdDict[fn] for fn in featureNames]
+        features = self.get_features(line)
+        return name,features,accessibles
+
     def save(self,fileName):
         if self.changed:
             dictsStream = open(fileName, 'wb')
@@ -179,10 +191,8 @@
             self.changed = False
             dictsStream.close()
     def load(self,fileName):
-        dictsStream = open(fileName, 'rb')
+        dictsStream = open(fileName, 'rb')
         self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
             self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
         self.changed = False
         dictsStream.close()
-
-
\ No newline at end of file
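Note on the dictionaries.py change above: the new get_features helper makes the feature syntax explicit. Each token in a fact's feature field is either a bare feature name or a name=weight pair, and unweighted features default to a weight of 1.0. A minimal stand-alone sketch of that convention (parse_weighted_features is an illustrative name, not part of the patch; it keeps names instead of numeric ids):

    def parse_weighted_features(feature_field):
        # Turn 'fOne fTwo=3.0' into [('fOne', 1.0), ('fTwo', 3.0)],
        # mirroring Dictionaries.get_features.
        features = []
        for token in feature_field.split():
            parts = token.split('=')
            if len(parts) == 2:
                features.append((parts[0], float(parts[1])))   # explicit weight
            else:
                features.append((token, 1.0))                  # default weight
        return features

    assert parse_weighted_features('fOne fTwo=3.0') == [('fOne', 1.0), ('fTwo', 3.0)]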
Default=False.") -def main(argv = sys.argv[1:]): +def main(argv = sys.argv[1:]): # Initializing command-line arguments args = parser.parse_args(argv) - # Set up logging + # Set up logging logging.basicConfig(level=logging.DEBUG, format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s', datefmt='%d-%m %H:%M:%S', @@ -81,8 +81,8 @@ logger.info('Using the following settings: %s',args) # Pick algorithm if args.nb: - logger.info('Using Naive Bayes for learning.') - model = NBClassifier() + logger.info('Using Naive Bayes for learning.') + model = NBClassifier() modelFile = os.path.join(args.outputDir,'NB.pickle') elif args.snow: logger.info('Using naive bayes (SNoW) for learning.') @@ -90,37 +90,37 @@ modelFile = os.path.join(args.outputDir,'SNoW.pickle') elif args.predef: logger.info('Using predefined predictions.') - predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions') + predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions') model = Predefined(predictionFile) - modelFile = os.path.join(args.outputDir,'isabelle.pickle') + modelFile = os.path.join(args.outputDir,'isabelle.pickle') else: - logger.info('No algorithm specified. Using Naive Bayes.') - model = NBClassifier() - modelFile = os.path.join(args.outputDir,'NB.pickle') - dictsFile = os.path.join(args.outputDir,'dicts.pickle') - + logger.info('No algorithm specified. Using Naive Bayes.') + model = NBClassifier() + modelFile = os.path.join(args.outputDir,'NB.pickle') + dictsFile = os.path.join(args.outputDir,'dicts.pickle') + # Initializing model - if args.init: + if args.init: logger.info('Initializing Model.') startTime = time() - - # Load all data + + # Load all data dicts = Dictionaries() dicts.init_all(args.inputDir,depFileName=args.depFile) - + # Create Model trainData = dicts.featureDict.keys() if args.predef: dicts = model.initializeModel(trainData,dicts) else: model.initializeModel(trainData,dicts) - + model.save(modelFile) dicts.save(dictsFile) logger.info('All Done. 
%s seconds needed.',round(time()-startTime,2)) return 0 - # Create predictions and/or update model + # Create predictions and/or update model else: lineCounter = 0 dicts = Dictionaries() @@ -129,15 +129,15 @@ dicts.load(dictsFile) if os.path.isfile(modelFile): model.load(modelFile) - + # IO Streams OS = open(args.predictions,'a') IS = open(args.inputFile,'r') - + # Statistics if args.statistics: stats = Statistics(args.cutOff) - + predictions = None #Reading Input File for line in IS: @@ -151,11 +151,11 @@ if args.predef: predictions = model.predict[problemId] else: - predictions,_predictionsValues = model.predict(dicts.featureDict[problemId],dicts.expand_accessibles(acc)) + predictions,_predictionsValues = model.predict(dicts.featureDict[problemId],dicts.expand_accessibles(acc)) stats.update(predictions,dicts.dependenciesDict[problemId]) if not stats.badPreds == []: bp = string.join([str(dicts.idNameDict[x]) for x in stats.badPreds], ',') - logger.debug('Bad predictions: %s',bp) + logger.debug('Bad predictions: %s',bp) # Update Dependencies, p proves p dicts.dependenciesDict[problemId] = [problemId]+dicts.dependenciesDict[problemId] model.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId]) @@ -171,15 +171,15 @@ continue name,features,accessibles = dicts.parse_problem(line) # Create predictions - logger.info('Starting computation for problem on line %s',lineCounter) - predictions,predictionValues = model.predict(features,accessibles) + logger.info('Starting computation for problem on line %s',lineCounter) + predictions,predictionValues = model.predict(features,accessibles) assert len(predictions) == len(predictionValues) logger.info('Done. %s seconds needed.',round(time()-startTime,2)) - - # Output + + # Output predictionNames = [str(dicts.idNameDict[p]) for p in predictions[:args.numberOfPredictions]] - predictionValues = [str(x) for x in predictionValues[:args.numberOfPredictions]] - predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))] + predictionValues = [str(x) for x in predictionValues[:args.numberOfPredictions]] + predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))] predictionsString = string.join(predictionsStringList,' ') outString = '%s: %s' % (name,predictionsString) OS.write('%s\n' % outString) @@ -192,14 +192,14 @@ logger.warning('An error occurred on line %s .',line) lineCounter += 1 continue - """ + """ OS.close() IS.close() - + # Statistics if args.statistics: stats.printAvg() - + # Save if args.saveModel: model.save(modelFile) @@ -214,7 +214,7 @@ # Nat #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/','--predef'] #args = ['-i', '../data/Nat/mash_commands','-p','../tmp/testIsabelle.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/natATPMP.stats'] - #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/'] + #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/'] #args = ['-i', '../data/Nat/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/natATPNB.stats','--cutOff','500'] # BUG #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/List/','--isabelle'] @@ -223,6 +223,5 @@ #args = ['-i', 
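For reference, the prediction lines written by mash.py now pair each suggested premise with its score, space-separated: problemName: fact1=value1 fact2=value2 ... A short sketch of that output step under the same assumptions (the helper name and example data are illustrative, not part of the patch):

    def format_predictions(problem_name, names, values, number_of_predictions=200):
        # Keep only the top-ranked premises, as mash.py does via --numberOfPredictions,
        # and render each one as name=value.
        pairs = ['%s=%s' % (n, v) for n, v in
                 zip(names[:number_of_predictions], values[:number_of_predictions])]
        return '%s: %s' % (problem_name, ' '.join(pairs))

    # format_predictions('someProblem', ['factA', 'factB'], [0.9, 0.3])
    # == 'someProblem: factA=0.9 factB=0.3'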
diff -r 3d8863c41fe8 -r a5b666e0c3c2 src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py	Thu Dec 06 11:25:10 2012 +0100
@@ -24,46 +24,46 @@
         Constructor
         '''
         self.counts = {}
-
+
     def initializeModel(self,trainData,dicts):
         """
         Build basic model from training data.
-        """
+        """
         for d in trainData:
             dPosCount = 0
-            dFeatureCounts = {}
+            dFeatureCounts = {}
             self.counts[d] = [dPosCount,dFeatureCounts]
-
+
         for key in dicts.dependenciesDict.keys():
             # Add p proves p
             keyDeps = [key]+dicts.dependenciesDict[key]
-
+
             for dep in keyDeps:
                 self.counts[dep][0] += 1
                 depFeatures = dicts.featureDict[key]
-                for f in depFeatures:
+                for f,_w in depFeatures:
                     if self.counts[dep][1].has_key(f):
                         self.counts[dep][1][f] += 1
                     else:
                         self.counts[dep][1][f] = 1
-
+
     def update(self,dataPoint,features,dependencies):
         """
         Updates the Model.
         """
         if not self.counts.has_key(dataPoint):
             dPosCount = 0
-            dFeatureCounts = {}
+            dFeatureCounts = {}
             self.counts[dataPoint] = [dPosCount,dFeatureCounts]
         for dep in dependencies:
             self.counts[dep][0] += 1
-            for f in features:
+            for f,_w in features:
                 if self.counts[dep][1].has_key(f):
                     self.counts[dep][1][f] += 1
                 else:
                     self.counts[dep][1][f] = 1
-
+
     def delete(self,dataPoint,features,dependencies):
         """
         Deletes a single datapoint from the model.
@@ -73,7 +73,7 @@
             for f in features:
                 self.counts[dep][1][f] -= 1
-
+
     def overwrite(self,problemId,newDependencies,dicts):
         """
         Deletes the old dependencies of problemId and replaces them with the new ones. Updates the model accordingly.
@@ -83,45 +83,45 @@
         features = dicts.featureDict[problemId]
         self.delete(problemId,features,oldDeps)
         self.update(problemId,features,newDependencies)
-
+
     def predict(self,features,accessibles):
         """
         For each accessible, predicts the probability of it being useful given the features. Returns a ranking of the accessibles.
         """
         predictions = []
-        for a in accessibles:
+        for a in accessibles:
             posA = self.counts[a][0]
             fA = set(self.counts[a][1].keys())
             fWeightsA = self.counts[a][1]
-            resultA = log(posA)
-            for f in features:
+            resultA = log(posA)
+            for f,w in features:
                 if f in fA:
                     if fWeightsA[f] == 0:
-                        resultA -= 15
+                        resultA -= w*15
                     else:
                         assert fWeightsA[f] <= posA
-                        resultA += log(float(fWeightsA[f])/posA)
+                        resultA += w*log(float(fWeightsA[f])/posA)
                 else:
-                    resultA -= 15
+                    resultA -= w*15
             predictions.append(resultA)
         #expPredictions = array([exp(x) for x in predictions])
         predictions = array(predictions)
-        perm = (-predictions).argsort()
-        #return array(accessibles)[perm],expPredictions[perm]
+        perm = (-predictions).argsort()
+        #return array(accessibles)[perm],expPredictions[perm]
         return array(accessibles)[perm],predictions[perm]
-
+
     def save(self,fileName):
         OStream = open(fileName, 'wb')
-        dump(self.counts,OStream)
+        dump(self.counts,OStream)
         OStream.close()
-
+
     def load(self,fileName):
         OStream = open(fileName, 'rb')
-        self.counts = load(OStream)
+        self.counts = load(OStream)
         OStream.close()
-
+
 if __name__ == '__main__':
     featureDict = {0:[0,1,2],1:[3,2,1]}
     dependenciesDict = {0:[0],1:[0,1]}
@@ -136,4 +136,4 @@
     d.loadModel('x')
     print c.counts
     print d.counts
-    print 'Done'
\ No newline at end of file
+    print 'Done'
diff -r 3d8863c41fe8 -r a5b666e0c3c2 src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py	Thu Dec 06 11:25:10 2012 +0100
@@ -23,11 +23,11 @@
         Constructor
         '''
         self.predictionFile = mpPredictionFile
-
+
     def initializeModel(self,_trainData,dicts):
         """
         Load predictions.
-        """
+        """
         self.predictions = {}
         IS = open(self.predictionFile,'r')
         for line in IS:
@@ -39,28 +39,27 @@
             self.predictions[predId] = preds
         IS.close()
         return dicts
-
+
     def update(self,dataPoint,features,dependencies):
         """
         Updates the Model.
         """
         # No Update needed since we assume that we got all predictions
         pass
-
-
+
+
     def predict(self,problemId):
         """
         Return the saved predictions.
         """
-        return self.predictions[problemId]
-
+        return self.predictions[problemId]
+
     def save(self,fileName):
         OStream = open(fileName, 'wb')
-        dump((self.predictionFile,self.predictions),OStream)
+        dump((self.predictionFile,self.predictions),OStream)
         OStream.close()
-
+
     def load(self,fileName):
         OStream = open(fileName, 'rb')
-        self.predictionFile,self.predictions = load(OStream)
+        self.predictionFile,self.predictions = load(OStream)
         OStream.close()
-
diff -r 3d8863c41fe8 -r a5b666e0c3c2 src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Thu Dec 06 11:25:10 2012 +0100
@@ -15,12 +15,12 @@
 import sys,logging
 
 def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,inputFile):
-    logger = logging.getLogger('create_feature_dict')
+    logger = logging.getLogger('create_feature_dict')
     featureDict = {}
-    IS = open(inputFile,'r')
+    IS = open(inputFile,'r')
     for line in IS:
         line = line.split(':')
-        name = line[0]
+        name = line[0]
         # Name Id
         if nameIdDict.has_key(name):
             logger.warning('%s appears twice in the feature file. Aborting.',name)
@@ -29,33 +29,41 @@
             nameIdDict[name] = maxNameId
             idNameDict[maxNameId] = name
             nameId = maxNameId
-            maxNameId += 1
+            maxNameId += 1
         # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]
+        featureNames = [f.strip() for f in line[1].split()]
+        features = []
         for fn in featureNames:
-            if not featureIdDict.has_key(fn):
-                featureIdDict[fn] = maxFeatureId
-                maxFeatureId += 1
-        featureIds = [featureIdDict[fn] for fn in featureNames]
+            tmp = fn.split('=')
+            if len(tmp) == 2:
+                if not featureIdDict.has_key(tmp[0]):
+                    featureIdDict[tmp[0]] = maxFeatureId
+                    maxFeatureId += 1
+                features.append((featureIdDict[tmp[0]],float(tmp[1])))
+            else:
+                if not featureIdDict.has_key(fn):
+                    featureIdDict[fn] = maxFeatureId
+                    maxFeatureId += 1
+                features.append((featureIdDict[fn],1.0))
         # Store results
-        featureDict[nameId] = featureIds
+        featureDict[nameId] = features
     IS.close()
     return featureDict,maxNameId,maxFeatureId
-
+
 def create_dependencies_dict(nameIdDict,inputFile):
     logger = logging.getLogger('create_dependencies_dict')
     dependenciesDict = {}
     IS = open(inputFile,'r')
     for line in IS:
         line = line.split(':')
-        name = line[0]
+        name = line[0]
         # Name Id
         if not nameIdDict.has_key(name):
             logger.warning('%s is missing in nameIdDict. Aborting.',name)
             sys.exit(-1)
         nameId = nameIdDict[name]
-        dependenciesIds = [nameIdDict[f.strip()] for f in line[1].split()]
+        dependenciesIds = [nameIdDict[f.strip()] for f in line[1].split()]
         # Store results, add p proves p
         dependenciesDict[nameId] = [nameId] + dependenciesIds
     IS.close()
@@ -67,19 +75,17 @@
     IS = open(inputFile,'r')
     for line in IS:
         line = line.split(':')
-        name = line[0]
+        name = line[0]
         # Name Id
         if not nameIdDict.has_key(name):
             logger.warning('%s is missing in nameIdDict. Adding it as theory.',name)
             nameIdDict[name] = maxNameId
             idNameDict[maxNameId] = name
             nameId = maxNameId
-            maxNameId += 1
+            maxNameId += 1
         else:
             nameId = nameIdDict[name]
         accessibleStrings = line[1].split()
         accessibleDict[nameId] = [nameIdDict[a.strip()] for a in accessibleStrings]
     IS.close()
     return accessibleDict,maxNameId
-
-
\ No newline at end of file
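With the readData.py change above, the mash_features input file may mix weighted and unweighted feature tokens on a single line; create_feature_dict then stores (featureId, weight) pairs per fact, exactly like Dictionaries.get_features. A hypothetical input line (fact and feature names invented for illustration):

    factA:f1 f2=3.0 f3=0.5

would be stored as featureDict[id(factA)] = [(id(f1), 1.0), (id(f2), 3.0), (id(f3), 0.5)].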
- """ + """ # Prepare input files self.logger.debug('Creating IO Files') OS = open(self.SNoWTrainFile,'w') @@ -44,60 +44,60 @@ dependencies = dicts.dependenciesDict[nameId] dependencies = map(str,dependencies) dependenciesString = string.join(dependencies,',') - snowString = string.join([featureString,dependenciesString],',')+':\n' + snowString = string.join([featureString,dependenciesString],',')+':\n' OS.write(snowString) OS.close() # Build Model self.logger.debug('Building Model START.') - snowTrainCommand = '../bin/snow -train -M+ -I %s -F %s -g- -B :0-%s' % (self.SNoWTrainFile,self.SNoWNetFile,dicts.maxNameId-1) + snowTrainCommand = '../bin/snow -train -M+ -I %s -F %s -g- -B :0-%s' % (self.SNoWTrainFile,self.SNoWNetFile,dicts.maxNameId-1) args = shlex.split(snowTrainCommand) - p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) p.wait() - self.logger.debug('Building Model END.') + self.logger.debug('Building Model END.') - + def update(self,dataPoint,features,dependencies,dicts): """ Updates the Model. THIS IS NOT WORKING ATM< BUT WE DONT CARE - """ - self.logger.debug('Updating Model START') - trainData = dicts.featureDict.keys() + """ + self.logger.debug('Updating Model START') + trainData = dicts.featureDict.keys() self.initializeModel(trainData,dicts) self.logger.debug('Updating Model END') - - + + def predict(self,features,accessibles,dicts): logger = logging.getLogger('predict_SNoW') - + OS = open(self.SNoWTestFile,'w') features = map(str,features) featureString = string.join(features, ',') snowString = featureString+':' OS.write(snowString) - OS.close() - + OS.close() + snowTestCommand = '../bin/snow -test -I %s -F %s -o allboth' % (self.SNoWTestFile,self.SNoWNetFile) args = shlex.split(snowTestCommand) - p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) + p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT) (lines, _stderrdata) = p.communicate() logger.debug('SNoW finished.') - lines = lines.split('\n') + lines = lines.split('\n') assert lines[9].startswith('Example ') assert lines[-4] == '' - predictionsCon = [] + predictionsCon = [] for line in lines[10:-4]: premiseId = int(line.split()[0][:-1]) predictionsCon.append(premiseId) return predictionsCon - + def save(self,fileName): OStream = open(fileName, 'wb') - dump(self.counts,OStream) + dump(self.counts,OStream) OStream.close() - + def load(self,fileName): OStream = open(fileName, 'rb') - self.counts = load(OStream) - OStream.close() \ No newline at end of file + self.counts = load(OStream) + OStream.close() diff -r 3d8863c41fe8 -r a5b666e0c3c2 src/HOL/Tools/Sledgehammer/MaSh/src/stats.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Wed Dec 05 15:59:08 2012 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Thu Dec 06 11:25:10 2012 +0100 @@ -32,18 +32,38 @@ self.recallData = [0]*cutOff self.recall100Data = [0]*cutOff self.aucData = [] - - def update(self,predictions,dependencies): + self.premiseOccurenceCounter = {} + self.firstDepAppearance = {} + self.depAppearances = [] + + def update(self,predictions,dependencies,statementCounter): """ Evaluates AUC, dependencies, recall100 and number of available premises of a prediction. 
""" - available = len(predictions) predictions = predictions[:self.cutOff] dependencies = set(dependencies) + # No Stats for if no dependencies + if len(dependencies) == 0: + self.logger.debug('No Dependencies for statement %s' % statementCounter ) + self.badPreds = [] + return + if len(predictions) < self.cutOff: + for i in range(len(predictions),self.cutOff): + self.recall100Data[i] += 1 + self.recallData[i] += 1 + for d in dependencies: + if self.premiseOccurenceCounter.has_key(d): + self.premiseOccurenceCounter[d] += 1 + else: + self.premiseOccurenceCounter[d] = 1 + if self.firstDepAppearance.has_key(d): + self.depAppearances.append(statementCounter-self.firstDepAppearance[d]) + else: + self.firstDepAppearance[d] = statementCounter depNr = len(dependencies) - aucSum = 0. - posResults = 0. + aucSum = 0. + posResults = 0. positives, negatives = 0, 0 recall100 = 0.0 badPreds = [] @@ -56,7 +76,7 @@ depsFound.append(pId) if index > 200: badPreds.append(pId) - else: + else: aucSum += posResults negatives+=1 # Update Recall and Recall100 stats @@ -66,7 +86,7 @@ self.recallData[index] += 1 else: self.recallData[index] += float(positives)/depNr - + if not depNr == positives: depsFound = set(depsFound) missing = [] @@ -78,28 +98,29 @@ positives+=1 self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\ string.join([str(dep) for dep in missing],',')) - + if positives == 0 or negatives == 0: auc = 1.0 - else: + else: auc = aucSum/(negatives*positives) - + self.aucData.append(auc) self.avgAUC += auc self.avgRecall100 += recall100 self.problems += 1 self.badPreds = badPreds - self.avgAvailable += available + self.avgAvailable += available self.avgDepNr += depNr - self.logger.info('AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\ - round(100*auc,2),depNr,recall100,available,self.cutOff) - + self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\ + statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff) + def printAvg(self): self.logger.info('Average results:') self.logger.info('avgAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t cutOff:%s', \ - round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),self.avgRecall100/self.problems,self.cutOff) + round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),self.cutOff) - try: + #try: + if True: from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist avgRecall = [float(x)/self.problems for x in self.recallData] figure('Recall') @@ -116,15 +137,30 @@ hist(self.aucData,bins=100) ylabel('Problems') xlabel('AUC') + maxCount = max(self.premiseOccurenceCounter.values()) + minCount = min(self.premiseOccurenceCounter.values()) + figure('Dependency Occurances') + hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left') + #ylabel('Occurences') + xlabel('Number of Times a Dependency Occurs') + figure('Dependency Appearance in Problems after Introduction.') + hist(self.depAppearances,bins=50) + figure('Dependency Appearance in Problems after Introduction in Percent.') + xAxis = range(max(self.depAppearances)+1) + yAxis = [0] * (max(self.depAppearances)+1) + for val in self.depAppearances: + yAxis[val] += 1 + yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis] + plot(xAxis,yAxis) show() - except: - self.logger.warning('Matplotlib module missing. 
Skipping graphs.') - - def save(self,fileName): + #except: + # self.logger.warning('Matplotlib module missing. Skipping graphs.') + + def save(self,fileName): oStream = open(fileName, 'wb') - dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData),oStream) + dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter),oStream) oStream.close() def load(self,fileName): - iStream = open(fileName, 'rb') - self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData = load(iStream) - iStream.close() \ No newline at end of file + iStream = open(fileName, 'rb') + self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter = load(iStream) + iStream.close()
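Finally, the Statistics class in stats.py now records, for every dependency, how often it is used and how many statements after its first appearance it is used again; update therefore takes the extra statementCounter argument, so callers have to pass the running problem counter. A minimal stand-alone sketch of that bookkeeping, with simplified names (an illustration of the update logic, not the class itself):

    def track_dependency_usage(dependencies, statement_counter,
                               occurrence_counter, first_appearance, appearance_gaps):
        # occurrence_counter: dep -> number of problems in which it was a dependency
        # first_appearance:   dep -> statement counter of its first use
        # appearance_gaps:    distances between later uses and the first use
        for d in dependencies:
            occurrence_counter[d] = occurrence_counter.get(d, 0) + 1
            if d in first_appearance:
                appearance_gaps.append(statement_counter - first_appearance[d])
            else:
                first_appearance[d] = statement_counter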