new version of MaSh Python component
author blanchet
Sat, 12 Jan 2013 16:49:39 +0100
changeset 50840 a5cc092156da
parent 50839 9cc70b273e90
child 50841 087e3c531e86
new version of MaSh Python component
src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py
src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Sat Jan 12 16:49:39 2013 +0100
@@ -31,16 +31,18 @@
         self.featureDict = {}
         self.dependenciesDict = {}
         self.accessibleDict = {}
+        self.expandedAccessibles = {}
+        # For SInE-like prior
         self.featureCountDict = {}
-        self.expandedAccessibles = {}
+        self.triggerFeatures = {}
         self.changed = True
 
     """
     Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled!
     """
     def init_featureDict(self,featureFile):
-        self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
-                                                                                                       self.maxFeatureId,self.featureCountDict,featureFile)
+        self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict,self.triggerFeatures = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
+                                                                                                       self.maxFeatureId,self.featureCountDict,self.triggerFeatures,featureFile)
     def init_dependenciesDict(self,depFile):
         self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
     def init_accessibleDict(self,accFile):
@@ -131,8 +133,11 @@
         # Accessible Ids
         unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()]
         self.accessibleDict[nameId] = unExpAcc
-        self.featureDict[nameId] = self.get_features(line)
-        self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
+        features = self.get_features(line)
+        self.featureDict[nameId] = features
+        minVal = min([self.featureCountDict[f] for f,_w in features])
+        self.triggerFeatures[nameId] = [f for f,_w in features if self.featureCountDict[f] == minVal]
+        self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]        
         self.changed = True
         return nameId
 
@@ -192,12 +197,12 @@
         if self.changed:
             dictsStream = open(fileName, 'wb')
             dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict),dictsStream)
+                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures),dictsStream)
             self.changed = False
             dictsStream.close()
     def load(self,fileName):
         dictsStream = open(fileName, 'rb')
         self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict = load(dictsStream)
+              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures = load(dictsStream)
         self.changed = False
         dictsStream.close()
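
The new triggerFeatures table maps each fact to the features whose global occurrence count is minimal among that fact's features, i.e. its rarest features; these later serve as triggers for the SInE-like prior in the sparse naive Bayes ranker. A minimal sketch of the computation added in the second hunk above (names follow the diff; the helper function itself is hypothetical):

# Sketch only: rarest features of one fact, mirroring the hunk above.
# features is a list of (featureId, weight) pairs; featureCountDict maps a
# featureId to its global occurrence count.
def trigger_features(features, featureCountDict):
    minVal = min(featureCountDict[f] for f, _w in features)
    return [f for f, _w in features if featureCountDict[f] == minVal]

The intuition, as in SInE, is that a fact's rarest features identify it most specifically, so only they are allowed to trigger it.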
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Sat Jan 12 16:49:39 2013 +0100
@@ -47,12 +47,21 @@
 parser.add_argument('--depFile', default='mash_dependencies',
                     help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
 parser.add_argument('--saveModel',default=False,action='store_true',help="Stores the learned Model at the end of a prediction run. Default=False.")
+
 parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.")
-#DEBUG: Change sineprioir default to false
-parser.add_argument('--sinePrior',default=True,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
-
+# Theory Parameters
+parser.add_argument('--theoryDefValPos',default=-7.5,help="Default value for positive unknown features. Default=-7.5.",type=float)
+parser.add_argument('--theoryDefValNeg',default=-15.0,help="Default value for negative unknown features. Default=-15.0.",type=float)
+parser.add_argument('--theoryPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
 
 parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.")
+# NB Parameters
+parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float)
+parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float)
+parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
+parser.add_argument('--NBSinePrior',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
+parser.add_argument('--NBSineWeight',default=100.0,help="How much the SInE prior is weighted. Default=100.0.",type=float)
+
 parser.add_argument('--snow',default=False,action='store_true',help="Use SNoW's naive bayes instead of Naive Bayes for learning.")
 parser.add_argument('--predef',default=False,action='store_true',\
                     help="Use predefined predictions. Used only for comparison with the actual learning. Expects mash_mepo_suggestions in inputDir.")
@@ -89,7 +98,7 @@
     # Pick algorithm
     if args.nb:
         logger.info('Using sparse Naive Bayes for learning.')
-        model = sparseNBClassifier(args.sinePrior)
+        model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight)
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     elif args.snow:
         logger.info('Using naive bayes (SNoW) for learning.')
@@ -103,7 +112,7 @@
         modelFile = os.path.join(args.outputDir,'mepo.pickle')        
     else:
         logger.info('No algorithm specified. Using sparse Naive Bayes.')
-        model = sparseNBClassifier(args.sinePrior)
+        model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight)
         modelFile = os.path.join(args.outputDir,'NB.pickle')
     dictsFile = os.path.join(args.outputDir,'dicts.pickle')
     theoryFile = os.path.join(args.outputDir,'theory.pickle')
@@ -123,7 +132,7 @@
 
         if args.learnTheories:
             depFile = os.path.join(args.inputDir,args.depFile)
-            theoryModels = TheoryModels()
+            theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight)
             theoryModels.init(depFile,dicts)
             theoryModels.save(theoryFile)
             
@@ -138,7 +147,7 @@
         statementCounter = 1
         computeStats = False
         dicts = Dictionaries()
-        theoryModels = TheoryModels()
+        theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight)
         # Load Files
         if os.path.isfile(dictsFile):
             dicts.load(dictsFile)
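
mash.py now exposes the previously hard-coded model constants as command-line options and threads them through to the constructors; the old --sinePrior flag becomes --NBSinePrior, with its default restored to False. A minimal sketch of the resulting constructor calls (keyword names are taken from the diff, the values are the documented CLI defaults, and the import lines are assumptions based on the file names):

# Sketch only: how the new options map onto the constructors.
from sparseNaiveBayes import sparseNBClassifier
from theoryModels import TheoryModels

model = sparseNBClassifier(defaultPriorWeight=20.0, posWeight=10.0, defVal=-15.0,
                           useSinePrior=False, sineWeight=100.0)
theoryModels = TheoryModels(defValPos=-7.5, defValNeg=-15.0, posWeight=10.0)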
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Sat Jan 12 16:49:39 2013 +0100
@@ -14,7 +14,7 @@
 
 import sys,logging
 
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,inputFile):
+def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,triggerFeatures,inputFile):
     logger = logging.getLogger('create_feature_dict')
     featureDict = {}
     IS = open(inputFile,'r')
@@ -32,7 +32,8 @@
             maxNameId += 1
         # Feature Ids
         featureNames = [f.strip() for f in line[1].split()]
-        features = []        
+        features = []     
+        minFeatureCount = 0   
         for fn in featureNames:
             weight = 1.0
             tmp = fn.split('=')
@@ -46,10 +47,12 @@
             fId = featureIdDict[fn]
             features.append((fId,weight))
             featureCountDict[fId] += 1
+            minFeatureCount = min(triggerFeatures,featureCountDict[fId])
         # Store results
         featureDict[nameId] = features
+        triggerFeatures[nameId] = [f for f,_w in features if featureCountDict[f] == minFeatureCount]
     IS.close()
-    return featureDict,maxNameId,maxFeatureId,featureCountDict
+    return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeatures
 
 def create_dependencies_dict(nameIdDict,inputFile):
     logger = logging.getLogger('create_dependencies_dict')
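
readData.py builds the same trigger features while streaming the feature file, but the committed line applies min to the triggerFeatures dictionary itself (min(triggerFeatures, featureCountDict[fId])), which under Python 2's mixed-type comparison effectively yields the current feature's count. Judging from the dictionaries.py hunk above, a running minimum over the counts appears to be the intent, roughly as follows (hypothetical helper, inferred rather than taken from this file):

# Sketch only; the intent is inferred from the dictionaries.py version above.
def running_min_count(features, featureCountDict):
    minFeatureCount = None
    for fId, _w in features:
        if minFeatureCount is None or featureCountDict[fId] < minFeatureCount:
            minFeatureCount = featureCountDict[fId]
    return minFeatureCount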
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Sat Jan 12 16:49:39 2013 +0100
@@ -19,13 +19,16 @@
     An updateable naive Bayes classifier.
     '''
 
-    def __init__(self):
+    def __init__(self,defValPos = -7.5,defValNeg = -15.0,posWeight = 10.0):
         '''
         Constructor
         '''
         self.neg = 0.0
         self.pos = 0.0
         self.counts = {} # Counts is the tuple poscounts,negcounts
+        self.defValPos = defValPos       
+        self.defValNeg = defValNeg
+        self.posWeight = posWeight        
     
     def update(self,features,label):
         """
@@ -85,10 +88,6 @@
             return 1
         elif self.pos ==0:
             return 0
-        defValPos = -7.5       
-        defValNeg = -15.0
-        posWeight = 10.0
-        
         logneg = log(self.neg)
         logpos = log(self.pos)
         prob = logpos - logneg
@@ -97,13 +96,13 @@
             if self.counts.has_key(f):
                 posCount,negCount = self.counts[f]
                 if posCount > 0:
-                    prob += (log(posWeight * posCount) - logpos)
+                    prob += (log(self.posWeight * posCount) - logpos)
                 else:
-                    prob += defValPos
+                    prob += self.defValPos
                 if negCount > 0:
                     prob -= (log(negCount) - logneg)
                 else:
-                    prob -= defValNeg 
+                    prob -= self.defValNeg 
         if prob >= 0 : 
             return 1
         else:
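
In singleNaiveBayes.py the inlined scoring constants become constructor parameters, so theoryModels.py can configure each per-theory classifier from the new command-line options. A minimal sketch of the resulting per-feature scoring as a standalone function (not the method itself; counts maps a feature to its (posCount, negCount) pair, and the defaults shown are the new CLI defaults):

# Sketch only: parameterized naive Bayes log-odds scoring.
from math import log

def nb_score(features, counts, pos, neg,
             defValPos=-7.5, defValNeg=-15.0, posWeight=10.0):
    logpos, logneg = log(pos), log(neg)
    prob = logpos - logneg
    for f in features:
        if f in counts:
            posCount, negCount = counts[f]
            if posCount > 0:
                prob += log(posWeight * posCount) - logpos
            else:
                prob += defValPos
            if negCount > 0:
                prob -= log(negCount) - logneg
            else:
                prob -= defValNeg
    return prob  # predict the positive class when prob >= 0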
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Sat Jan 12 16:49:39 2013 +0100
@@ -11,7 +11,7 @@
 '''
 
 from cPickle import dump,load
-from numpy import array,exp
+from numpy import array
 from math import log
 
 class sparseNBClassifier(object):
@@ -19,28 +19,28 @@
     An updateable naive Bayes classifier.
     '''
 
-    def __init__(self,useSinePrior = False):
+    def __init__(self,defaultPriorWeight = 20.0,posWeight = 20.0,defVal = -15.0,useSinePrior = False,sineWeight = 100.0):
         '''
         Constructor
         '''
         self.counts = {}
         self.sinePrior = useSinePrior
-        self.defaultPriorWeight = 20
-        self.posWeight = 20 
-        self.defVal = 15
+        self.sineWeight = sineWeight
+        self.defaultPriorWeight = defaultPriorWeight
+        self.posWeight = posWeight
+        self.defVal = defVal
 
     def initializeModel(self,trainData,dicts):
         """
         Build basic model from training data.
         """
-        for d in trainData:
-            dPosCount = 0
+        for d in trainData:            
             dFeatureCounts = {}
-            # DEBUG: give p |- p a higher weight
-            dPosCount = self.defaultPriorWeight
-            for f,_w in dicts.featureDict[d]:
-                dFeatureCounts[f] = self.defaultPriorWeight
-            self.counts[d] = [dPosCount,dFeatureCounts]
+            # Give p |- p a higher weight
+            if not self.defaultPriorWeight == 0:            
+                for f,_w in dicts.featureDict[d]:
+                    dFeatureCounts[f] = self.defaultPriorWeight
+            self.counts[d] = [self.defaultPriorWeight,dFeatureCounts]
 
         for key in dicts.dependenciesDict.keys():
             # Add p proves p
@@ -60,13 +60,12 @@
         Updates the Model.
         """
         if not self.counts.has_key(dataPoint):
-            dPosCount = 0
             dFeatureCounts = {}            
-            # DEBUG: give p |- p a higher weight
-            dPosCount = self.defaultPriorWeight
-            for f,_w in features:
-                dFeatureCounts[f] = self.defaultPriorWeight
-            self.counts[dataPoint] = [dPosCount,dFeatureCounts]            
+            # Give p |- p a higher weight
+            if not self.defaultPriorWeight == 0:               
+                for f,_w in features:
+                    dFeatureCounts[f] = self.defaultPriorWeight
+            self.counts[dataPoint] = [self.defaultPriorWeight,dFeatureCounts]            
         for dep in dependencies:
             self.counts[dep][0] += 1
             for f,_w in features:
@@ -101,37 +100,43 @@
         Returns a ranking of the accessibles.
         """
         predictions = []
+        fSet = set([f for f,_w in features])
         for a in accessibles:
             posA = self.counts[a][0]
             fA = set(self.counts[a][1].keys())
             fWeightsA = self.counts[a][1]
-            resultA = log(posA)
+            prior = posA
+            if self.sinePrior:
+                triggerFeatures = dicts.triggerFeatures[a]
+                triggeredFeatures = fSet.intersection(triggerFeatures)
+                for f in triggeredFeatures:
+                    posW = dicts.featureCountDict[f]
+                    prior += self.sineWeight /  posW 
+            resultA = log(prior)
             for f,w in features:
                 # DEBUG
                 #w = 1
                 if f in fA:
                     if fWeightsA[f] == 0:
-                        resultA -= w*self.defVal
+                        resultA += w*self.defVal
                     else:
                         assert fWeightsA[f] <= posA
                         resultA += w*log(float(self.posWeight*fWeightsA[f])/posA)
                 else:
-                    resultA -= w*self.defVal
+                    resultA += w*self.defVal
             predictions.append(resultA)
-        #expPredictions = array([exp(x) for x in predictions])
         predictions = array(predictions)
         perm = (-predictions).argsort()
-        #return array(accessibles)[perm],expPredictions[perm]
         return array(accessibles)[perm],predictions[perm]
 
     def save(self,fileName):
         OStream = open(fileName, 'wb')
-        dump(self.counts,OStream)
+        dump((self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight),OStream)
         OStream.close()
 
     def load(self,fileName):
         OStream = open(fileName, 'rb')
-        self.counts = load(OStream)
+        self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight = load(OStream)
         OStream.close()
 
 
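The sparse classifier now takes all of its weights as constructor arguments, pickles them together with the counts, and, when the SInE prior is enabled, boosts a fact's prior count by sineWeight / featureCountDict[f] for every query feature f that is among the fact's trigger features. A minimal sketch of the per-fact score (a standalone rendering of the predict loop, not the method itself; triggerFeatures and featureCountDict come from the dictionaries, as in the diff):

# Sketch only: ranking score of one accessible fact with the SInE prior enabled.
from math import log

def fact_score(posA, fWeightsA, features, triggerFeatures, featureCountDict,
               posWeight, defVal=-15.0, sineWeight=100.0):
    # posA: prior count of the fact; fWeightsA: featureId -> count for the fact;
    # features: (featureId, weight) pairs of the query.
    fSet = set(f for f, _w in features)
    prior = posA
    for f in fSet.intersection(triggerFeatures):
        prior += sineWeight / featureCountDict[f]   # rare trigger features boost most
    result = log(prior)
    for f, w in features:
        if f in fWeightsA and fWeightsA[f] != 0:
            result += w * log(float(posWeight * fWeightsA[f]) / posA)
        else:
            result += w * defVal                    # defVal is negative: a penalty
    return result
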
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py	Sat Jan 12 16:49:39 2013 +0100
@@ -20,11 +20,15 @@
     '''
 
 
-    def __init__(self):
+    def __init__(self,defValPos = -7.5,defValNeg = -15.0,posWeight = 10.0):
         '''
         Constructor
         '''
         self.theoryModels = {}
+        # Model Params
+        self.defValPos = defValPos       
+        self.defValNeg = defValNeg
+        self.posWeight = posWeight
         self.theoryDict = {}
         self.accessibleTheories = set([])
         self.currentTheory = None
@@ -49,7 +53,7 @@
                     self.accessibleTheories.add(self.currentTheory)
                 self.currentTheory = theory
                 self.theoryDict[theory] = set([nameId])
-                theoryModel = singleNBClassifier()
+                theoryModel = singleNBClassifier(self.defValPos,self.defValNeg,self.posWeight)
                 self.theoryModels[theory] = theoryModel 
             else:
                 self.theoryDict[theory] = self.theoryDict[theory].union([nameId])               
@@ -94,6 +98,7 @@
             self.theoryModels[a].delete(features,a in usedTheories)  
     
     def update(self,problemId,features,dependencies,dicts): 
+        # TODO: Implicit assumption that self.accessibleTheories contains all accessible theories!
         currentTheory = (dicts.idNameDict[problemId]).split('.')[0]       
         # Create new theory model, if there is a new theory 
         if not self.theoryDict.has_key(currentTheory):
@@ -101,7 +106,7 @@
             if not currentTheory == None:
                 self.theoryDict[currentTheory] = []
                 self.currentTheory = currentTheory
-                theoryModel = singleNBClassifier()
+                theoryModel = singleNBClassifier(self.defValPos,self.defValNeg,self.posWeight)
                 self.theoryModels[currentTheory] = theoryModel
                 self.accessibleTheories.add(self.currentTheory) 
         self.update_with_acc(problemId,features,dependencies,dicts,self.accessibleTheories)  
@@ -118,12 +123,7 @@
         """
         Predicts the relevant theories. Returns the predicted theories and a list of all accessible premises in these theories.
         """         
-        # TODO: This can be made a lot faster!    
-        self.accessibleTheories = []
-        for x in accessibles:
-            xArt = (dicts.idNameDict[x]).split('.')[0]
-            self.accessibleTheories.append(xArt)
-        self.accessibleTheories = set(self.accessibleTheories)
+        self.accessibleTheories = set([(dicts.idNameDict[x]).split('.')[0] for x in accessibles])
         
         # Predict Theories
         predictedTheories = [self.currentTheory]
@@ -143,9 +143,9 @@
         
     def save(self,fileName):
         outStream = open(fileName, 'wb')
-        dump((self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict),outStream)
+        dump((self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict,self.defValPos,self.defValNeg,self.posWeight),outStream)
         outStream.close()
     def load(self,fileName):
         inStream = open(fileName, 'rb')
-        self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict = load(inStream)
+        self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict,self.defValPos,self.defValNeg,self.posWeight = load(inStream)
         inStream.close()
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML	Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML	Sat Jan 12 16:49:39 2013 +0100
@@ -313,7 +313,7 @@
 
 local
 
-val version = "*** MaSh version 20130111a ***"
+val version = "*** MaSh version 20130112a ***"
 
 exception Too_New of unit