changeset   50388:a5b666e0c3c2
author      blanchet
date        Thu, 06 Dec 2012 11:25:10 +0100
parent      50387:3d8863c41fe8
child       50389:ad0ac9112d2c
added weights to MaSh (by Daniel Kuehlwein)
src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py
src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
src/HOL/Tools/Sledgehammer/MaSh/src/snow.py
src/HOL/Tools/Sledgehammer/MaSh/src/stats.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py	Thu Dec 06 11:25:10 2012 +0100
@@ -21,12 +21,12 @@
 -------- Example Usage ---------------\n\
 ./compareStats.py --statFiles ../tmp/natISANB.stats ../tmp/natATPNB.stats -b 30\n\n\
 Author: Daniel Kuehlwein, July 2012',formatter_class=RawDescriptionHelpFormatter)
-parser.add_argument('--statFiles', default=None, nargs='+', 
+parser.add_argument('--statFiles', default=None, nargs='+',
                     help='The names of the saved statistic files.')
 parser.add_argument('-b','--bins',default=50,help="Number of bins for the AUC histogram. Default=50.",type=int)
 
 def main(argv = sys.argv[1:]):
-    args = parser.parse_args(argv)  
+    args = parser.parse_args(argv)
     if args.statFiles == None:
         print 'Filenames missing.'
         sys.exit(-1)
@@ -56,11 +56,10 @@
     legend(loc='upper left')
     ylabel('Problems')
     xlabel('AUC')
-        
+
     show()
 
 if __name__ == '__main__':
     #args = ['--statFiles','../tmp/natISANB.stats','../tmp/natATPNB.stats','-b','30']
     #sys.exit(main(args))
     sys.exit(main())
-    
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Thu Dec 06 11:25:10 2012 +0100
@@ -33,18 +33,18 @@
         self.accessibleDict = {}
         self.expandedAccessibles = {}
         self.changed = True
-    
+
     """
     Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict get filled!
-    """    
+    """
     def init_featureDict(self,featureFile):
         self.featureDict,self.maxNameId,self.maxFeatureId = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
-                                                                                self.maxFeatureId,featureFile)        
+                                                                                self.maxFeatureId,featureFile)
     def init_dependenciesDict(self,depFile):
         self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
     def init_accessibleDict(self,accFile):
         self.accessibleDict,self.maxNameId = create_accessible_dict(self.nameIdDict,self.idNameDict,self.maxNameId,accFile)
-    
+
     def init_all(self,inputFolder,featureFileName = 'mash_features',depFileName='mash_dependencies',accFileName = 'mash_accessibility'):
         featureFile = join(inputFolder,featureFileName)
         depFile = join(inputFolder,depFileName)
@@ -54,7 +54,7 @@
         self.init_dependenciesDict(depFile)
         self.expandedAccessibles = {}
         self.changed = True
-        
+
     def get_name_id(self,name):
         """
         Return the Id for a name.
@@ -66,7 +66,7 @@
             self.nameIdDict[name] = self.maxNameId
             self.idNameDict[self.maxNameId] = name
             nameId = self.maxNameId
-            self.maxNameId += 1 
+            self.maxNameId += 1
             self.changed = True
         return nameId
 
@@ -74,8 +74,23 @@
         if not self.featureIdDict.has_key(featureName):
             self.featureIdDict[featureName] = self.maxFeatureId
             self.maxFeatureId += 1
-            self.changed = True 
-            
+            self.changed = True
+        return self.featureIdDict[featureName]
+
+    def get_features(self,line):
+        # Feature Ids
+        featureNames = [f.strip() for f in line[1].split()]
+        features = []
+        for fn in featureNames:
+            tmp = fn.split('=')
+            if len(tmp) == 2:
+                fId = self.add_feature(tmp[0])
+                features.append((fId,float(tmp[1])))
+            else:
+                fId = self.add_feature(fn)
+                features.append((fId,1.0))
+        return features
+
     def expand_accessibles(self,acc):
         accessibles = set(acc)
         unexpandedQueue = Queue()
@@ -86,71 +101,67 @@
                 unexpandedQueue.put(a)
         while not unexpandedQueue.empty():
             nextUnExp = unexpandedQueue.get()
-            nextUnExpAcc = self.accessibleDict[nextUnExp]            
+            nextUnExpAcc = self.accessibleDict[nextUnExp]
             for a in nextUnExpAcc:
                 if not a in accessibles:
                     accessibles = accessibles.union([a])
                     if self.expandedAccessibles.has_key(a):
                         accessibles = accessibles.union(self.expandedAccessibles[a])
                     else:
-                        unexpandedQueue.put(a)                    
+                        unexpandedQueue.put(a)
         return list(accessibles)
-            
+
     def parse_fact(self,line):
         """
         Parses a single line, extracting accessibles, features, and dependencies.
         """
         assert line.startswith('! ')
         line = line[2:]
-        
+
         # line = name:accessibles;features;dependencies
         line = line.split(':')
         name = line[0].strip()
         nameId = self.get_name_id(name)
-    
-        line = line[1].split(';')       
+
+        line = line[1].split(';')
         # Accessible Ids
         unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()]
         self.accessibleDict[nameId] = unExpAcc
-        # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]
-        for fn in featureNames:
-            self.add_feature(fn)
-        self.featureDict[nameId] = [self.featureIdDict[fn] for fn in featureNames]
+        self.featureDict[nameId] = self.get_features(line)
         self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
         self.changed = True
         return nameId
-    
+
     def parse_overwrite(self,line):
         """
         Parses a single line, extracts the problemId and the Ids of the dependencies.
         """
         assert line.startswith('p ')
         line = line[2:]
-        
+
         # line = name:dependencies
         line = line.split(':')
         name = line[0].strip()
         nameId = self.get_name_id(name)
-    
+
         dependencies = [self.nameIdDict[d.strip()] for d in line[1].split()]
         self.changed = True
         return nameId,dependencies
-    
+
     def parse_problem(self,line):
         """
-        Parses a problem and returns the features and the accessibles. 
+        Parses a problem and returns the features and the accessibles.
         """
         assert line.startswith('? ')
         line = line[2:]
         name = None
-        
+
         # Check whether there is a problem name:
         tmp = line.split(':')
         if len(tmp) == 2:
             name = tmp[0].strip()
             line = tmp[1]
-        
+
         # line = accessibles;features
         line = line.split(';')
         # Accessible Ids, expand and store the accessibles.
@@ -164,13 +175,14 @@
                 self.expandedAccessibles[accId] = self.expand_accessibles(accIdAcc)
                 self.changed = True
         accessibles = self.expand_accessibles(unExpAcc)
-        # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]
-        for fn in featureNames:
-            self.add_feature(fn)
-        features = [self.featureIdDict[fn] for fn in featureNames]
-        return name,features,accessibles    
-    
+#        # Feature Ids
+#        featureNames = [f.strip() for f in line[1].split()]
+#        for fn in featureNames:
+#            self.add_feature(fn)
+#        features = [self.featureIdDict[fn] for fn in featureNames]
+        features = self.get_features(line)
+        return name,features,accessibles
+
     def save(self,fileName):
         if self.changed:
             dictsStream = open(fileName, 'wb')
@@ -179,10 +191,8 @@
             self.changed = False
             dictsStream.close()
     def load(self,fileName):
-        dictsStream = open(fileName, 'rb')        
+        dictsStream = open(fileName, 'rb')
         self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
               self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
         self.changed = False
         dictsStream.close()
-    
-            
\ No newline at end of file
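
Note: the get_features helper introduced above changes the in-memory feature representation from a plain list of feature ids to a list of (featureId, weight) pairs. A token of the form name=weight carries an explicit weight; a bare token defaults to 1.0. A minimal standalone sketch of that parsing rule (the function name and sample tokens below are illustrative, not part of the patch):

    def parse_weighted_tokens(tokens):
        # Each token is either "name" or "name=weight"; bare names default to weight 1.0.
        features = []
        for tok in tokens:
            parts = tok.split('=')
            if len(parts) == 2:
                features.append((parts[0], float(parts[1])))
            else:
                features.append((tok, 1.0))
        return features

    # parse_weighted_tokens(['nat', 'add=2.5']) == [('nat', 1.0), ('add', 2.5)]
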
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Thu Dec 06 11:25:10 2012 +0100
@@ -34,7 +34,7 @@
 Author: Daniel Kuehlwein, July 2012',formatter_class=RawDescriptionHelpFormatter)
 parser.add_argument('-i','--inputFile',help='File containing all problems to be solved.')
 parser.add_argument('-o','--outputDir', default='../tmp/',help='Directory where all created files are stored. Default=../tmp/.')
-parser.add_argument('-p','--predictions',default='../tmp/%s.predictions' % datetime.datetime.now(), 
+parser.add_argument('-p','--predictions',default='../tmp/%s.predictions' % datetime.datetime.now(),
                     help='File where the predictions stored. Default=../tmp/dateTime.predictions.')
 parser.add_argument('--numberOfPredictions',default=200,help="Number of premises to write in the output. Default=200.",type=int)
 
@@ -56,11 +56,11 @@
 parser.add_argument('-l','--log', default='../tmp/%s.log' % datetime.datetime.now(), help='Log file name. Default=../tmp/dateTime.log')
 parser.add_argument('-q','--quiet',default=False,action='store_true',help="If enabled, only print warnings. Default=False.")
 
-def main(argv = sys.argv[1:]):        
+def main(argv = sys.argv[1:]):
     # Initializing command-line arguments
     args = parser.parse_args(argv)
 
-    # Set up logging 
+    # Set up logging
     logging.basicConfig(level=logging.DEBUG,
                         format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
                         datefmt='%d-%m %H:%M:%S',
@@ -81,8 +81,8 @@
     logger.info('Using the following settings: %s',args)
     # Pick algorithm
     if args.nb:
-        logger.info('Using Naive Bayes for learning.')        
-        model = NBClassifier() 
+        logger.info('Using Naive Bayes for learning.')
+        model = NBClassifier()
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     elif args.snow:
         logger.info('Using naive bayes (SNoW) for learning.')
@@ -90,37 +90,37 @@
         modelFile = os.path.join(args.outputDir,'SNoW.pickle')
     elif args.predef:
         logger.info('Using predefined predictions.')
-        predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions') 
+        predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions')
         model = Predefined(predictionFile)
-        modelFile = os.path.join(args.outputDir,'isabelle.pickle')        
+        modelFile = os.path.join(args.outputDir,'isabelle.pickle')
     else:
-        logger.info('No algorithm specified. Using Naive Bayes.')        
-        model = NBClassifier() 
-        modelFile = os.path.join(args.outputDir,'NB.pickle')    
-    dictsFile = os.path.join(args.outputDir,'dicts.pickle')    
-    
+        logger.info('No algorithm specified. Using Naive Bayes.')
+        model = NBClassifier()
+        modelFile = os.path.join(args.outputDir,'NB.pickle')
+    dictsFile = os.path.join(args.outputDir,'dicts.pickle')
+
     # Initializing model
-    if args.init:        
+    if args.init:
         logger.info('Initializing Model.')
         startTime = time()
-        
-        # Load all data        
+
+        # Load all data
         dicts = Dictionaries()
         dicts.init_all(args.inputDir,depFileName=args.depFile)
-        
+
         # Create Model
         trainData = dicts.featureDict.keys()
         if args.predef:
             dicts = model.initializeModel(trainData,dicts)
         else:
             model.initializeModel(trainData,dicts)
-        
+
         model.save(modelFile)
         dicts.save(dictsFile)
 
         logger.info('All Done. %s seconds needed.',round(time()-startTime,2))
         return 0
-    # Create predictions and/or update model       
+    # Create predictions and/or update model
     else:
         lineCounter = 0
         dicts = Dictionaries()
@@ -129,15 +129,15 @@
             dicts.load(dictsFile)
         if os.path.isfile(modelFile):
             model.load(modelFile)
-        
+
         # IO Streams
         OS = open(args.predictions,'a')
         IS = open(args.inputFile,'r')
-        
+
         # Statistics
         if args.statistics:
             stats = Statistics(args.cutOff)
-        
+
         predictions = None
         #Reading Input File
         for line in IS:
@@ -151,11 +151,11 @@
                         if args.predef:
                             predictions = model.predict[problemId]
                         else:
-                            predictions,_predictionsValues = model.predict(dicts.featureDict[problemId],dicts.expand_accessibles(acc))        
+                            predictions,_predictionsValues = model.predict(dicts.featureDict[problemId],dicts.expand_accessibles(acc))
                         stats.update(predictions,dicts.dependenciesDict[problemId])
                         if not stats.badPreds == []:
                             bp = string.join([str(dicts.idNameDict[x]) for x in stats.badPreds], ',')
-                            logger.debug('Bad predictions: %s',bp)    
+                            logger.debug('Bad predictions: %s',bp)
                     # Update Dependencies, p proves p
                     dicts.dependenciesDict[problemId] = [problemId]+dicts.dependenciesDict[problemId]
                     model.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId])
@@ -171,15 +171,15 @@
                         continue
                     name,features,accessibles = dicts.parse_problem(line)
                     # Create predictions
-                    logger.info('Starting computation for problem on line %s',lineCounter)                
-                    predictions,predictionValues = model.predict(features,accessibles)        
+                    logger.info('Starting computation for problem on line %s',lineCounter)
+                    predictions,predictionValues = model.predict(features,accessibles)
                     assert len(predictions) == len(predictionValues)
                     logger.info('Done. %s seconds needed.',round(time()-startTime,2))
-                    
-                    # Output        
+
+                    # Output
                     predictionNames = [str(dicts.idNameDict[p]) for p in predictions[:args.numberOfPredictions]]
-                    predictionValues = [str(x) for x in predictionValues[:args.numberOfPredictions]]                    
-                    predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]                
+                    predictionValues = [str(x) for x in predictionValues[:args.numberOfPredictions]]
+                    predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]
                     predictionsString = string.join(predictionsStringList,' ')
                     outString = '%s: %s' % (name,predictionsString)
                     OS.write('%s\n' % outString)
@@ -192,14 +192,14 @@
                 logger.warning('An error occurred on line %s .',line)
                 lineCounter += 1
                 continue
-            """    
+            """
         OS.close()
         IS.close()
-        
+
         # Statistics
         if args.statistics:
             stats.printAvg()
-        
+
         # Save
         if args.saveModel:
             model.save(modelFile)
@@ -214,7 +214,7 @@
     # Nat
     #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/','--predef']
     #args = ['-i', '../data/Nat/mash_commands','-p','../tmp/testIsabelle.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/natATPMP.stats']
-    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/']    
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/']
     #args = ['-i', '../data/Nat/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/natATPNB.stats','--cutOff','500']
     # BUG
     #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/List/','--isabelle']
@@ -223,6 +223,5 @@
     #args = ['-i', '../bug/adds/mash_commands','-p','../tmp/testNB.pred','-l','testNB.log','--nb','-o','../tmp/']
     #startTime = time()
     #sys.exit(main(args))
-    #print 'New ' + str(round(time()-startTime,2))    
+    #print 'New ' + str(round(time()-startTime,2))
     sys.exit(main())
-    
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py	Thu Dec 06 11:25:10 2012 +0100
@@ -24,46 +24,46 @@
         Constructor
         '''
         self.counts = {}
-    
+
     def initializeModel(self,trainData,dicts):
         """
         Build basic model from training data.
-        """        
+        """
         for d in trainData:
             dPosCount = 0
-            dFeatureCounts = {}            
+            dFeatureCounts = {}
             self.counts[d] = [dPosCount,dFeatureCounts]
-        
+
         for key in dicts.dependenciesDict.keys():
             # Add p proves p
             keyDeps = [key]+dicts.dependenciesDict[key]
-            
+
             for dep in keyDeps:
                 self.counts[dep][0] += 1
                 depFeatures = dicts.featureDict[key]
-                for f in depFeatures:
+                for f,_w in depFeatures:
                     if self.counts[dep][1].has_key(f):
                         self.counts[dep][1][f] += 1
                     else:
                         self.counts[dep][1][f] = 1
 
-    
+
     def update(self,dataPoint,features,dependencies):
         """
         Updates the Model.
         """
         if not self.counts.has_key(dataPoint):
             dPosCount = 0
-            dFeatureCounts = {}            
+            dFeatureCounts = {}
             self.counts[dataPoint] = [dPosCount,dFeatureCounts]
         for dep in dependencies:
             self.counts[dep][0] += 1
-            for f in features:
+            for f,_w in features:
                 if self.counts[dep][1].has_key(f):
                     self.counts[dep][1][f] += 1
                 else:
                     self.counts[dep][1][f] = 1
- 
+
     def delete(self,dataPoint,features,dependencies):
         """
         Deletes a single datapoint from the model.
@@ -73,7 +73,7 @@
             for f in features:
                 self.counts[dep][1][f] -= 1
 
-            
+
     def overwrite(self,problemId,newDependencies,dicts):
         """
         Deletes the old dependencies of problemId and replaces them with the new ones. Updates the model accordingly.
@@ -83,45 +83,45 @@
         features = dicts.featureDict[problemId]
         self.delete(problemId,features,oldDeps)
         self.update(problemId,features,newDependencies)
-    
+
     def predict(self,features,accessibles):
         """
         For each accessible, predicts the probability of it being useful given the features.
         Returns a ranking of the accessibles.
         """
         predictions = []
-        for a in accessibles:            
+        for a in accessibles:
             posA = self.counts[a][0]
             fA = set(self.counts[a][1].keys())
             fWeightsA = self.counts[a][1]
-            resultA = log(posA) 
-            for f in features:
+            resultA = log(posA)
+            for f,w in features:
                 if f in fA:
                     if fWeightsA[f] == 0:
-                        resultA -= 15
+                        resultA -= w*15
                     else:
                         assert fWeightsA[f] <= posA
-                        resultA += log(float(fWeightsA[f])/posA)
+                        resultA += w*log(float(fWeightsA[f])/posA)
                 else:
-                    resultA -= 15
+                    resultA -= w*15
             predictions.append(resultA)
         #expPredictions = array([exp(x) for x in predictions])
         predictions = array(predictions)
-        perm = (-predictions).argsort()        
-        #return array(accessibles)[perm],expPredictions[perm] 
+        perm = (-predictions).argsort()
+        #return array(accessibles)[perm],expPredictions[perm]
         return array(accessibles)[perm],predictions[perm]
-    
+
     def save(self,fileName):
         OStream = open(fileName, 'wb')
-        dump(self.counts,OStream)        
+        dump(self.counts,OStream)
         OStream.close()
-        
+
     def load(self,fileName):
         OStream = open(fileName, 'rb')
-        self.counts = load(OStream)      
+        self.counts = load(OStream)
         OStream.close()
 
-    
+
 if __name__ == '__main__':
     featureDict = {0:[0,1,2],1:[3,2,1]}
     dependenciesDict = {0:[0],1:[0,1]}
@@ -136,4 +136,4 @@
     d.loadModel('x')
     print c.counts
     print d.counts
-    print 'Done'
\ No newline at end of file
+    print 'Done'
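
Note: with weighted features, the predict method above scores an accessible fact a as log(posA) plus, for every query feature f with weight w, w*log(count_a(f)/posA) if f has been seen together with a, and -15*w otherwise (the constant -15 acts as a log-probability floor for unseen features). A minimal sketch of that weighted scoring with plain dictionaries instead of the classifier's self.counts (names and sample values are illustrative):

    from math import log

    def nb_score(posCount, featureCounts, weightedFeatures):
        # posCount: how often the fact was used as a dependency.
        # featureCounts: feature id -> co-occurrence count with this fact.
        result = log(posCount)
        for f, w in weightedFeatures:
            count = featureCounts.get(f, 0)
            if count > 0:
                result += w * log(float(count) / posCount)
            else:
                result -= w * 15  # penalty for a feature never seen with this fact
        return result

    # nb_score(4, {7: 2}, [(7, 1.0), (9, 0.5)]) -- facts whose features match the query score higher.
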
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py	Thu Dec 06 11:25:10 2012 +0100
@@ -23,11 +23,11 @@
         Constructor
         '''
         self.predictionFile = mpPredictionFile
-    
+
     def initializeModel(self,_trainData,dicts):
         """
         Load predictions.
-        """        
+        """
         self.predictions = {}
         IS = open(self.predictionFile,'r')
         for line in IS:
@@ -39,28 +39,27 @@
             self.predictions[predId] = preds
         IS.close()
         return dicts
-    
+
     def update(self,dataPoint,features,dependencies):
         """
         Updates the Model.
         """
         # No Update needed since we assume that we got all predictions
         pass
-            
-    
+
+
     def predict(self,problemId):
         """
         Return the saved predictions.
         """
-        return self.predictions[problemId] 
-    
+        return self.predictions[problemId]
+
     def save(self,fileName):
         OStream = open(fileName, 'wb')
-        dump((self.predictionFile,self.predictions),OStream)        
+        dump((self.predictionFile,self.predictions),OStream)
         OStream.close()
-        
+
     def load(self,fileName):
         OStream = open(fileName, 'rb')
-        self.predictionFile,self.predictions = load(OStream)      
+        self.predictionFile,self.predictions = load(OStream)
         OStream.close()
-
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Thu Dec 06 11:25:10 2012 +0100
@@ -15,12 +15,12 @@
 import sys,logging
 
 def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,inputFile):
-    logger = logging.getLogger('create_feature_dict')    
+    logger = logging.getLogger('create_feature_dict')
     featureDict = {}
-    IS = open(inputFile,'r') 
+    IS = open(inputFile,'r')
     for line in IS:
         line = line.split(':')
-        name = line[0]        
+        name = line[0]
         # Name Id
         if nameIdDict.has_key(name):
             logger.warning('%s appears twice in the feature file. Aborting.',name)
@@ -29,33 +29,41 @@
             nameIdDict[name] = maxNameId
             idNameDict[maxNameId] = name
             nameId = maxNameId
-            maxNameId += 1            
+            maxNameId += 1
         # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]             
+        featureNames = [f.strip() for f in line[1].split()]
+        features = []
         for fn in featureNames:
-            if not featureIdDict.has_key(fn):
-                featureIdDict[fn] = maxFeatureId
-                maxFeatureId += 1
-        featureIds = [featureIdDict[fn] for fn in featureNames]
+            tmp = fn.split('=')
+            if len(tmp) == 2:
+                if not featureIdDict.has_key(tmp[0]):
+                    featureIdDict[tmp[0]] = maxFeatureId
+                    maxFeatureId += 1
+                features.append((featureIdDict[tmp[0]],float(tmp[1])))
+            else:
+                if not featureIdDict.has_key(fn):
+                    featureIdDict[fn] = maxFeatureId
+                    maxFeatureId += 1
+                features.append((featureIdDict[fn],1.0))
         # Store results
-        featureDict[nameId] = featureIds
+        featureDict[nameId] = features
     IS.close()
     return featureDict,maxNameId,maxFeatureId
-     
+
 def create_dependencies_dict(nameIdDict,inputFile):
     logger = logging.getLogger('create_dependencies_dict')
     dependenciesDict = {}
     IS = open(inputFile,'r')
     for line in IS:
         line = line.split(':')
-        name = line[0]        
+        name = line[0]
         # Name Id
         if not nameIdDict.has_key(name):
             logger.warning('%s is missing in nameIdDict. Aborting.',name)
             sys.exit(-1)
 
         nameId = nameIdDict[name]
-        dependenciesIds = [nameIdDict[f.strip()] for f in line[1].split()]             
+        dependenciesIds = [nameIdDict[f.strip()] for f in line[1].split()]
         # Store results, add p proves p
         dependenciesDict[nameId] = [nameId] + dependenciesIds
     IS.close()
@@ -67,19 +75,17 @@
     IS = open(inputFile,'r')
     for line in IS:
         line = line.split(':')
-        name = line[0]     
+        name = line[0]
         # Name Id
         if not nameIdDict.has_key(name):
             logger.warning('%s is missing in nameIdDict. Adding it as theory.',name)
             nameIdDict[name] = maxNameId
             idNameDict[maxNameId] = name
             nameId = maxNameId
-            maxNameId += 1  
+            maxNameId += 1
         else:
             nameId = nameIdDict[name]
         accessibleStrings = line[1].split()
         accessibleDict[nameId] = [nameIdDict[a.strip()] for a in accessibleStrings]
     IS.close()
     return accessibleDict,maxNameId
-        
-    
\ No newline at end of file
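
Note: create_feature_dict now accepts the same weighted syntax in the mash_features input file: each line has the shape "name:feat1 feat2=3.0 ...", and a bare feature again defaults to weight 1.0. A small sketch of how one such line is interpreted (the line content is invented for illustration, not taken from the repository):

    # Hypothetical mash_features line:
    line = 'Nat.add_0: nat add=2.0 comm'
    name, featurePart = line.split(':')
    features = []
    for tok in featurePart.split():
        parts = tok.split('=')
        if len(parts) == 2:
            features.append((parts[0], float(parts[1])))
        else:
            features.append((tok, 1.0))
    print name.strip(), features   # Nat.add_0 [('nat', 1.0), ('add', 2.0), ('comm', 1.0)]
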
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/snow.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/snow.py	Thu Dec 06 11:25:10 2012 +0100
@@ -27,13 +27,13 @@
         '''
         self.logger = logging.getLogger('SNoW')
         self.SNoWTrainFile = '../tmp/snow.train'
-        self.SNoWTestFile = '../snow.test'  
-        self.SNoWNetFile = '../tmp/snow.net' 
-    
+        self.SNoWTestFile = '../snow.test'
+        self.SNoWNetFile = '../tmp/snow.net'
+
     def initializeModel(self,trainData,dicts):
         """
         Build basic model from training data.
-        """     
+        """
         # Prepare input files
         self.logger.debug('Creating IO Files')
         OS = open(self.SNoWTrainFile,'w')
@@ -44,60 +44,60 @@
             dependencies = dicts.dependenciesDict[nameId]
             dependencies = map(str,dependencies)
             dependenciesString = string.join(dependencies,',')
-            snowString = string.join([featureString,dependenciesString],',')+':\n' 
+            snowString = string.join([featureString,dependenciesString],',')+':\n'
             OS.write(snowString)
         OS.close()
 
         # Build Model
         self.logger.debug('Building Model START.')
-        snowTrainCommand = '../bin/snow -train -M+ -I %s -F %s -g- -B :0-%s' % (self.SNoWTrainFile,self.SNoWNetFile,dicts.maxNameId-1)    
+        snowTrainCommand = '../bin/snow -train -M+ -I %s -F %s -g- -B :0-%s' % (self.SNoWTrainFile,self.SNoWNetFile,dicts.maxNameId-1)
         args = shlex.split(snowTrainCommand)
-        p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)    
+        p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
         p.wait()
-        self.logger.debug('Building Model END.')   
+        self.logger.debug('Building Model END.')
 
-    
+
     def update(self,dataPoint,features,dependencies,dicts):
         """
         Updates the Model.
         THIS IS NOT WORKING ATM< BUT WE DONT CARE
-        """        
-        self.logger.debug('Updating Model START') 
-        trainData = dicts.featureDict.keys()       
+        """
+        self.logger.debug('Updating Model START')
+        trainData = dicts.featureDict.keys()
         self.initializeModel(trainData,dicts)
         self.logger.debug('Updating Model END')
-            
-    
+
+
     def predict(self,features,accessibles,dicts):
         logger = logging.getLogger('predict_SNoW')
-         
+
         OS = open(self.SNoWTestFile,'w')
         features = map(str,features)
         featureString = string.join(features, ',')
         snowString = featureString+':'
         OS.write(snowString)
-        OS.close() 
-        
+        OS.close()
+
         snowTestCommand = '../bin/snow -test -I %s -F %s -o allboth' % (self.SNoWTestFile,self.SNoWNetFile)
         args = shlex.split(snowTestCommand)
-        p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)    
+        p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
         (lines, _stderrdata) = p.communicate()
         logger.debug('SNoW finished.')
-        lines = lines.split('\n')    
+        lines = lines.split('\n')
         assert lines[9].startswith('Example ')
         assert lines[-4] == ''
-        predictionsCon = []    
+        predictionsCon = []
         for line in lines[10:-4]:
             premiseId = int(line.split()[0][:-1])
             predictionsCon.append(premiseId)
         return predictionsCon
-    
+
     def save(self,fileName):
         OStream = open(fileName, 'wb')
-        dump(self.counts,OStream)        
+        dump(self.counts,OStream)
         OStream.close()
-        
+
     def load(self,fileName):
         OStream = open(fileName, 'rb')
-        self.counts = load(OStream)      
-        OStream.close()
\ No newline at end of file
+        self.counts = load(OStream)
+        OStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py	Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py	Thu Dec 06 11:25:10 2012 +0100
@@ -32,18 +32,38 @@
         self.recallData = [0]*cutOff
         self.recall100Data = [0]*cutOff
         self.aucData = []
-        
-    def update(self,predictions,dependencies):
+        self.premiseOccurenceCounter = {}
+        self.firstDepAppearance = {}
+        self.depAppearances = []
+
+    def update(self,predictions,dependencies,statementCounter):
         """
         Evaluates AUC, dependencies, recall100 and number of available premises of a prediction.
         """
-
         available = len(predictions)
         predictions = predictions[:self.cutOff]
         dependencies = set(dependencies)
+        # No Stats for if no dependencies
+        if len(dependencies) == 0:
+            self.logger.debug('No Dependencies for statement %s' % statementCounter )
+            self.badPreds = []
+            return
+        if len(predictions) < self.cutOff:
+            for i in range(len(predictions),self.cutOff):
+                self.recall100Data[i] += 1
+                self.recallData[i] += 1
+        for d in dependencies:
+            if self.premiseOccurenceCounter.has_key(d):
+                self.premiseOccurenceCounter[d] += 1
+            else:
+                self.premiseOccurenceCounter[d] = 1
+            if self.firstDepAppearance.has_key(d):
+                self.depAppearances.append(statementCounter-self.firstDepAppearance[d])
+            else:
+                self.firstDepAppearance[d] = statementCounter
         depNr = len(dependencies)
-        aucSum = 0.    
-        posResults = 0.        
+        aucSum = 0.
+        posResults = 0.
         positives, negatives = 0, 0
         recall100 = 0.0
         badPreds = []
@@ -56,7 +76,7 @@
                 depsFound.append(pId)
                 if index > 200:
                     badPreds.append(pId)
-            else:            
+            else:
                 aucSum += posResults
                 negatives+=1
             # Update Recall and Recall100 stats
@@ -66,7 +86,7 @@
                 self.recallData[index] += 1
             else:
                 self.recallData[index] += float(positives)/depNr
-    
+
         if not depNr == positives:
             depsFound = set(depsFound)
             missing = []
@@ -78,28 +98,29 @@
                     positives+=1
             self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\
                               string.join([str(dep) for dep in missing],','))
-    
+
         if positives == 0 or negatives == 0:
             auc = 1.0
-        else:            
+        else:
             auc = aucSum/(negatives*positives)
-        
+
         self.aucData.append(auc)
         self.avgAUC += auc
         self.avgRecall100 += recall100
         self.problems += 1
         self.badPreds = badPreds
-        self.avgAvailable += available 
+        self.avgAvailable += available
         self.avgDepNr += depNr
-        self.logger.info('AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
-                          round(100*auc,2),depNr,recall100,available,self.cutOff)        
-        
+        self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
+                          statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff)
+
     def printAvg(self):
         self.logger.info('Average results:')
         self.logger.info('avgAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t cutOff:%s', \
-                         round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),self.avgRecall100/self.problems,self.cutOff)
+                         round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),self.cutOff)
 
-        try:
+        #try:
+        if True:
             from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
             avgRecall = [float(x)/self.problems for x in self.recallData]
             figure('Recall')
@@ -116,15 +137,30 @@
             hist(self.aucData,bins=100)
             ylabel('Problems')
             xlabel('AUC')
+            maxCount = max(self.premiseOccurenceCounter.values())
+            minCount = min(self.premiseOccurenceCounter.values())
+            figure('Dependency Occurances')
+            hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left')
+            #ylabel('Occurences')
+            xlabel('Number of Times a Dependency Occurs')
+            figure('Dependency Appearance in Problems after Introduction.')
+            hist(self.depAppearances,bins=50)
+            figure('Dependency Appearance in Problems after Introduction in Percent.')
+            xAxis = range(max(self.depAppearances)+1)
+            yAxis = [0] * (max(self.depAppearances)+1)
+            for val in self.depAppearances:
+                yAxis[val] += 1
+            yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis]
+            plot(xAxis,yAxis)
             show()
-        except:
-            self.logger.warning('Matplotlib module missing. Skipping graphs.')
-    
-    def save(self,fileName):       
+        #except:
+        #    self.logger.warning('Matplotlib module missing. Skipping graphs.')
+
+    def save(self,fileName):
         oStream = open(fileName, 'wb')
-        dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData),oStream)
+        dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter),oStream)
         oStream.close()
     def load(self,fileName):
-        iStream = open(fileName, 'rb')        
-        self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData = load(iStream)
-        iStream.close()
\ No newline at end of file
+        iStream = open(fileName, 'rb')
+        self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter = load(iStream)
+        iStream.close()
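
Note: Statistics.update now takes a statementCounter argument and additionally records, for every dependency, how often it occurs overall (premiseOccurenceCounter) and how many statements after its first use as a dependency it is needed again (depAppearances); printAvg turns both into extra histograms. A minimal sketch of that bookkeeping detached from the class (the function name and the sample calls are invented for illustration):

    premiseOccurenceCounter = {}
    firstDepAppearance = {}
    depAppearances = []

    def record_dependencies(statementCounter, dependencies):
        # Count each dependency and measure its distance (in statements) from its first use.
        for d in dependencies:
            premiseOccurenceCounter[d] = premiseOccurenceCounter.get(d, 0) + 1
            if d in firstDepAppearance:
                depAppearances.append(statementCounter - firstDepAppearance[d])
            else:
                firstDepAppearance[d] = statementCounter

    # Statement 5 depends on facts 1 and 2; statement 9 reuses fact 1.
    record_dependencies(5, [1, 2])
    record_dependencies(9, [1])
    # premiseOccurenceCounter == {1: 2, 2: 1}; depAppearances == [4]
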