updated MaSh Python component
authorblanchet
Fri, 11 Jan 2013 16:30:56 +0100
changeset 50827 aba769dc82e9
parent 50826 18ace05656cf
child 50828 91e6836bb68b
updated MaSh Python component
src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py
src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Fri Jan 11 16:30:56 2013 +0100
@@ -31,6 +31,7 @@
         self.featureDict = {}
         self.dependenciesDict = {}
         self.accessibleDict = {}
+        self.featureCountDict = {}
         self.expandedAccessibles = {}
         self.changed = True
 
@@ -38,8 +39,8 @@
     Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled!
     """
     def init_featureDict(self,featureFile):
-        self.featureDict,self.maxNameId,self.maxFeatureId = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
-                                                                                self.maxFeatureId,featureFile)
+        self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
+                                                                                                       self.maxFeatureId,self.featureCountDict,featureFile)
     def init_dependenciesDict(self,depFile):
         self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
     def init_accessibleDict(self,accFile):
@@ -73,9 +74,12 @@
     def add_feature(self,featureName):
         if not self.featureIdDict.has_key(featureName):
             self.featureIdDict[featureName] = self.maxFeatureId
+            self.featureCountDict[self.maxFeatureId] = 0
             self.maxFeatureId += 1
             self.changed = True
-        return self.featureIdDict[featureName]
+        fId = self.featureIdDict[featureName]
+        self.featureCountDict[fId] += 1
+        return fId
 
     def get_features(self,line):
         # Feature Ids
@@ -83,12 +87,12 @@
         features = []
         for fn in featureNames:
             tmp = fn.split('=')
+            weight = 1.0
             if len(tmp) == 2:
-                fId = self.add_feature(tmp[0])
-                features.append((fId,float(tmp[1])))
-            else:
-                fId = self.add_feature(fn)
-                features.append((fId,1.0))
+                fn = tmp[0]
+                weight = float(tmp[1])
+            fId = self.add_feature(tmp[0])
+            features.append((fId,weight))
         return features
 
     def expand_accessibles(self,acc):
@@ -150,7 +154,7 @@
 
     def parse_problem(self,line):
         """
-        Parses a problem and returns the features and the accessibles.
+        Parses a problem and returns the features, the accessibles, and any hints.
         """
         assert line.startswith('? ')
         line = line[2:]
@@ -176,19 +180,24 @@
                 self.changed = True
         accessibles = self.expand_accessibles(unExpAcc)
         features = self.get_features(line)
+        # Get hints:
+        if len(line) == 3:
+            hints = [self.nameIdDict[d.strip()] for d in line[2].split()]
+        else:
+            hints = []
 
-        return name,features,accessibles
+        return name,features,accessibles,hints
 
     def save(self,fileName):
         if self.changed:
             dictsStream = open(fileName, 'wb')
             dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict),dictsStream)
+                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict),dictsStream)
             self.changed = False
             dictsStream.close()
     def load(self,fileName):
         dictsStream = open(fileName, 'rb')
         self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
+              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict = load(dictsStream)
         self.changed = False
         dictsStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Fri Jan 11 16:30:56 2013 +0100
@@ -48,6 +48,8 @@
                     help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
 parser.add_argument('--saveModel',default=False,action='store_true',help="Stores the learned Model at the end of a prediction run. Default=False.")
 parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.")
+#DEBUG: Change sineprioir default to false
+parser.add_argument('--sinePrior',default=True,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
 
 
 parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.")
@@ -87,7 +89,7 @@
     # Pick algorithm
     if args.nb:
         logger.info('Using sparse Naive Bayes for learning.')
-        model = sparseNBClassifier()
+        model = sparseNBClassifier(args.sinePrior)
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     elif args.snow:
         logger.info('Using naive bayes (SNoW) for learning.')
@@ -101,7 +103,7 @@
         modelFile = os.path.join(args.outputDir,'mepo.pickle')        
     else:
         logger.info('No algorithm specified. Using sparse Naive Bayes.')
-        model = sparseNBClassifier()
+        model = sparseNBClassifier(args.sinePrior)
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     dictsFile = os.path.join(args.outputDir,'dicts.pickle')
     theoryFile = os.path.join(args.outputDir,'theory.pickle')
@@ -182,7 +184,7 @@
                     # Update Dependencies, p proves p
                     dicts.dependenciesDict[problemId] = [problemId]+dicts.dependenciesDict[problemId]
                     if args.learnTheories:
-                        theoryModels.update(problemId,dicts)
+                        theoryModels.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId],dicts)
                     if args.snow:
                         model.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId],dicts)
                     else:
@@ -200,16 +202,34 @@
                     computeStats = True
                     if args.predef:
                         continue
-                    name,features,accessibles = dicts.parse_problem(line)    
+                    name,features,accessibles,hints = dicts.parse_problem(line)    
                     # Create predictions
                     logger.info('Starting computation for problem on line %s',lineCounter)
+                    # Update Models with hints
+                    if not hints == []:
+                        if args.learnTheories:
+                            accessibleTheories = set([(dicts.idNameDict[x]).split('.')[0] for x in accessibles])
+                            theoryModels.update_with_acc('hints',features,hints,dicts,accessibleTheories)
+                        if args.snow:
+                            pass
+                        else:
+                            model.update('hints',features,hints)
+                    
+                    # Predict premises
                     if args.learnTheories:
                         predictedTheories,accessibles = theoryModels.predict(features,accessibles,dicts)
-                    if args.snow:
-                        predictions,predictionValues = model.predict(features,accessibles,dicts)
-                    else:
-                        predictions,predictionValues = model.predict(features,accessibles)
+                    predictions,predictionValues = model.predict(features,accessibles,dicts)
                     assert len(predictions) == len(predictionValues)
+                    
+                    # Delete hints
+                    if not hints == []:
+                        if args.learnTheories:
+                            theoryModels.delete('hints',features,hints,dicts)
+                        if args.snow:
+                            pass
+                        else:
+                            model.delete('hints',features,hints)
+
                     logger.info('Done. %s seconds needed.',round(time()-startTime,2))
                     # Output        
                     predictionNames = [str(dicts.idNameDict[p]) for p in predictions[:args.numberOfPredictions]]
@@ -253,10 +273,19 @@
 
 if __name__ == '__main__':
     # Example:
+    # Auth
+    # ISAR Theories
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--learnTheories']    
+    #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
+    # ISAR MePo 
+    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--predef']
+    #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
+
+    
     # Jinja
     # ISAR Theories
-    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/','--learnTheories']    
-    #args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Jinja/','--learnTheories']    
+    #args = ['-i', '../data/20121227b/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
     # ISAR NB
     #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/']    
     #args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500']
@@ -277,6 +306,9 @@
 
 
     # Probability
+    # ISAR Theories
+    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/','--learnTheories']    
+    #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
     # ISAR NB
     #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/']    
     #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/ProbIsarNB.stats','--cutOff','500']
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Fri Jan 11 16:30:56 2013 +0100
@@ -14,7 +14,7 @@
 
 import sys,logging
 
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,inputFile):
+def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,inputFile):
     logger = logging.getLogger('create_feature_dict')
     featureDict = {}
     IS = open(inputFile,'r')
@@ -32,23 +32,24 @@
             maxNameId += 1
         # Feature Ids
         featureNames = [f.strip() for f in line[1].split()]
-        features = []
+        features = []        
         for fn in featureNames:
+            weight = 1.0
             tmp = fn.split('=')
             if len(tmp) == 2:
-                if not featureIdDict.has_key(tmp[0]):
-                    featureIdDict[tmp[0]] = maxFeatureId
-                    maxFeatureId += 1
-                features.append((featureIdDict[tmp[0]],float(tmp[1])))
-            else:
-                if not featureIdDict.has_key(fn):
-                    featureIdDict[fn] = maxFeatureId
-                    maxFeatureId += 1
-                features.append((featureIdDict[fn],1.0))
+                fn = tmp[0]
+                weight = float(tmp[1])
+            if not featureIdDict.has_key(fn):
+                featureIdDict[fn] = maxFeatureId
+                featureCountDict[maxFeatureId] = 0
+                maxFeatureId += 1
+            fId = featureIdDict[fn]
+            features.append((fId,weight))
+            featureCountDict[fId] += 1
         # Store results
         featureDict[nameId] = features
     IS.close()
-    return featureDict,maxNameId,maxFeatureId
+    return featureDict,maxNameId,maxFeatureId,featureCountDict
 
 def create_dependencies_dict(nameIdDict,inputFile):
     logger = logging.getLogger('create_dependencies_dict')
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
@@ -61,7 +61,7 @@
             self.pos -= 1
         else:
             self.neg -= 1
-        for f in features:
+        for f,_w in features:
             posCount,negCount = self.counts[f]
             if label:
                 posCount -= 1
@@ -79,7 +79,7 @@
     
     def predict_sparse(self,features):
         """
-        Returns 1 if the probability is greater than 50%.
+        Returns 1 if the probability of + being the correct label is greater than the probability that - is the correct label.
         """
         if self.neg == 0:
             return 1
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Fri Jan 11 16:30:56 2013 +0100
@@ -19,11 +19,15 @@
     An updateable naive Bayes classifier.
     '''
 
-    def __init__(self):
+    def __init__(self,useSinePrior = False):
         '''
         Constructor
         '''
         self.counts = {}
+        self.sinePrior = useSinePrior
+        self.defaultPriorWeight = 20
+        self.posWeight = 20 
+        self.defVal = 15
 
     def initializeModel(self,trainData,dicts):
         """
@@ -32,6 +36,10 @@
         for d in trainData:
             dPosCount = 0
             dFeatureCounts = {}
+            # DEBUG: give p |- p a higher weight
+            dPosCount = self.defaultPriorWeight
+            for f,_w in dicts.featureDict[d]:
+                dFeatureCounts[f] = self.defaultPriorWeight
             self.counts[d] = [dPosCount,dFeatureCounts]
 
         for key in dicts.dependenciesDict.keys():
@@ -53,8 +61,12 @@
         """
         if not self.counts.has_key(dataPoint):
             dPosCount = 0
-            dFeatureCounts = {}
-            self.counts[dataPoint] = [dPosCount,dFeatureCounts]
+            dFeatureCounts = {}            
+            # DEBUG: give p |- p a higher weight
+            dPosCount = self.defaultPriorWeight
+            for f,_w in features:
+                dFeatureCounts[f] = self.defaultPriorWeight
+            self.counts[dataPoint] = [dPosCount,dFeatureCounts]            
         for dep in dependencies:
             self.counts[dep][0] += 1
             for f,_w in features:
@@ -69,7 +81,7 @@
         """
         for dep in dependencies:
             self.counts[dep][0] -= 1
-            for f in features:
+            for f,_w in features:
                 self.counts[dep][1][f] -= 1
 
 
@@ -83,13 +95,11 @@
         self.delete(problemId,features,oldDeps)
         self.update(problemId,features,newDependencies)
 
-    def predict(self,features,accessibles):
+    def predict(self,features,accessibles,dicts):
         """
         For each accessible, predicts the probability of it being useful given the features.
         Returns a ranking of the accessibles.
         """
-        posWeight = 20.0
-        defVal = 15
         predictions = []
         for a in accessibles:
             posA = self.counts[a][0]
@@ -101,12 +111,12 @@
                 #w = 1
                 if f in fA:
                     if fWeightsA[f] == 0:
-                        resultA -= w*defVal
+                        resultA -= w*self.defVal
                     else:
                         assert fWeightsA[f] <= posA
-                        resultA += w*log(float(posWeight*fWeightsA[f])/posA)
+                        resultA += w*log(float(self.posWeight*fWeightsA[f])/posA)
                 else:
-                    resultA -= w*defVal
+                    resultA -= w*self.defVal
             predictions.append(resultA)
         #expPredictions = array([exp(x) for x in predictions])
         predictions = array(predictions)
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py	Fri Jan 11 16:30:56 2013 +0100
@@ -87,15 +87,13 @@
         for a in self.accTheories:                
             self.theoryModels[a].overwrite(features,a in oldTheories,a in newTheories) 
     
-    def delete(self):
-        pass
+    def delete(self,problemId,features,dependencies,dicts):
+        tmp = [dicts.idNameDict[x] for x in dependencies]
+        usedTheories = set([x.split('.')[0] for x in tmp])   
+        for a in self.accessibleTheories:        
+            self.theoryModels[a].delete(features,a in usedTheories)  
     
-    def update(self,problemId,dicts):        
-        features = dicts.featureDict[problemId]
-        
-        # Find the actually used theories
-        tmp = [dicts.idNameDict[x] for x in dicts.dependenciesDict[problemId]]
-        usedTheories = set([x.split('.')[0] for x in tmp]) 
+    def update(self,problemId,features,dependencies,dicts): 
         currentTheory = (dicts.idNameDict[problemId]).split('.')[0]       
         # Create new theory model, if there is a new theory 
         if not self.theoryDict.has_key(currentTheory):
@@ -105,9 +103,15 @@
                 self.currentTheory = currentTheory
                 theoryModel = singleNBClassifier()
                 self.theoryModels[currentTheory] = theoryModel
-                self.accessibleTheories.add(self.currentTheory)          
-        if not len(usedTheories) == 0:
-            for a in self.accessibleTheories:                
+                self.accessibleTheories.add(self.currentTheory) 
+        self.update_with_acc(problemId,features,dependencies,dicts,self.accessibleTheories)  
+    
+    def update_with_acc(self,problemId,features,dependencies,dicts,accessibleTheories):        
+        # Find the actually used theories
+        tmp = [dicts.idNameDict[x] for x in dependencies]
+        usedTheories = set([x.split('.')[0] for x in tmp]) 
+        if not len(usedTheories) == 0: 
+            for a in accessibleTheories:                
                 self.theoryModels[a].update(features,a in usedTheories)   
     
     def predict(self,features,accessibles,dicts):
--- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML	Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML	Fri Jan 11 16:30:56 2013 +0100
@@ -313,7 +313,7 @@
 
 local
 
-val version = "*** MaSh version 20130104a ***"
+val version = "*** MaSh version 20130111a ***"
 
 exception Too_New of unit