new version of MaSh
authorblanchet
Thu, 12 Sep 2013 09:59:45 +0200
changeset 53555 12251bc889f1
parent 53554 78fe0002024d
child 53556 347f743e8336
new version of MaSh
src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py
src/HOL/Tools/Sledgehammer/MaSh/src/KNN.py
src/HOL/Tools/Sledgehammer/MaSh/src/KNNs.py
src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
src/HOL/Tools/Sledgehammer/MaSh/src/mashTest.py
src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py
src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
src/HOL/Tools/Sledgehammer/MaSh/src/server.py
src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/stats.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py	Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,162 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from math import log
+from gensim import corpora, models, similarities
+
+class ExpandFeatures(object):
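+    """
+    Experimental learning-based feature expansion (cf. the --expandFeatures
+    option): keeps a gensim LSI model over the fact corpus and, on expand(),
+    maps a sparse feature vector into LSI topic space.
+    """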
+
+    def __init__(self,dicts):
+        self.dicts = dicts
+        self.featureMap = {}
+        self.alpha = 0.1
+        self.featureCounts = {}
+        self.counter = 0        
+        self.corpus = []
+        self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
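+        # The LSI model is created over the still-empty corpus here; documents
+        # are folded in incrementally via add_documents in update() below.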
+
+    def initialize(self,dicts):
+        self.dicts = dicts
+        IS = open(dicts.accFile,'r')
+        for line in IS:
+            line = line.split(':')
+            name = line[0]
+            #print 'name',name
+            nameId = dicts.nameIdDict[name]    
+            features = dicts.featureDict[nameId]
+            dependencies = dicts.dependenciesDict[nameId]   
+            self.update(features, dependencies)
+        IS.close()
+        #self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+        
+    def update(self,features,dependencies):
+        self.counter += 1
+        self.corpus.append([(x,1) for x in features.keys()])
+        self.LSIModel.add_documents([[(x,1) for x in features.keys()]])
+        """
+        for f in features.iterkeys():
+            try:
+                self.featureCounts[f] += 1
+            except:
+                self.featureCounts[f] = 1
+            if self.featureCounts[f] > 100:
+                continue
+            try:
+                self.featureMap[f] = self.featureMap[f].intersection(features.keys())
+            except:
+                self.featureMap[f] = set(features.keys())
+            #print 'fOld',len(fMap),self.featureCounts[f],len(dependencies)
+
+            for d in dependencies[1:]:
+                #print 'dep',self.dicts.idNameDict[d]
+                dFeatures = self.dicts.featureDict[d]
+                for df in dFeatures.iterkeys():
+                    if self.featureCounts.has_key(df):
+                        if self.featureCounts[df] > 20:
+                            continue
+                    else:
+                        print df
+                    try:
+                        fMap[df] += self.alpha * (1.0 - fMap[df])
+                    except:
+                        fMap[df] = self.alpha
+            """
+            #print 'fNew',len(fMap)
+            
+    def expand(self,features):
+        #print self.corpus[:50]        
+        #print corpus
+        #tfidfmodel = models.TfidfModel(self.corpus, normalize=True)        
+        #print features.keys()        
+        #tfidfcorpus = [tfidfmodel[x] for x in self.corpus]
+        #newFeatures = LSI[[(x,1) for x in features.keys()]]
+        newFeatures = self.LSIModel[[(x,1) for x in features.keys()]]
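+        # Gensim returns the projection as a list of (topic id, weight) pairs.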
+        
+        """
+        newFeatures = dict(features)
+        for f in features.keys():
+            try:
+                fC = self.featureCounts[f]
+            except:
+                fC = 0.5
+            newFeatures[f] = log(float(8+self.counter) / fC)
+        #nrOfFeatures = float(len(features))
+        addedCount = 0
+        alpha = 0.2
+        #"""
+        
+        """
+        consideredFeatures = []
+        while len(newFeatures) < 30:
+            #alpha = alpha * 0.5
+            minF = None
+            minFrequence = 1000000
+            for f in newFeatures.iterkeys():
+                if f in consideredFeatures:
+                    continue
+                try:
+                    if self.featureCounts[f] < minFrequence:
+                        minFrequence = self.featureCounts[f]
+                        minF = f
+                except:
+                    pass
+            if minF == None:
+                break
+            # Expand minimal feature
+            consideredFeatures.append(minF)
+            for expF in self.featureMap[minF]:
+                if not newFeatures.has_key(expF):
+                    fC = self.featureCounts[minF]
+                    newFeatures[expF] = alpha*log(float(8+self.counter) / fC)
+        #print features, newFeatures
+        #"""
+        """
+        for f in features.iterkeys():
+            try:
+                self.featureCounts[f] += 1
+            except:
+                self.featureCounts[f] = 0            
+            if self.featureCounts[f] > 10:
+                continue            
+            addedCount += 1
+            try:
+                fmap = self.featureMap[f]
+            except:
+                self.featureMap[f] = {}
+                fmap = {}
+            for nf,nv in fmap.iteritems():
+                try:
+                    newFeatures[nf] += nv
+                except:
+                    newFeatures[nf] = nv
+        if addedCount > 0: 
+            for f,w in newFeatures.iteritems():
+                newFeatures[f] = float(w)/addedCount
+        #"""                    
+        """
+        deleteF = []
+        for f,w in newFeatures.iteritems():
+            if w < 0.1:
+                deleteF.append(f)
+        for f in deleteF:
+            del newFeatures[f]
+        """
+        #print 'fold',len(features)
+        #print 'fnew',len(newFeatures)
+        return dict(newFeatures)
+
+if __name__ == "__main__":
+    pass
+    
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/KNN.py	Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,99 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from cPickle import dump,load
+from numpy import array
+from math import sqrt,log
+
+def cosine(f1,f2):
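+    # Cosine distance between sparse feature dicts: 1 - <f1,f2>/(|f1||f2|),
+    # so smaller values mean more similar points.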
+    f1Norm = 0.0
+    for f in f1.keys():
+        f1Norm += f1[f] * f1[f]
+    #assert f1Norm == sum(map(lambda x,y: x*y,f1.itervalues(),f1.itervalues()))
+    f1Norm = sqrt(f1Norm) 
+    
+    f2Norm = 0.0
+    for f in f2.keys():
+        f2Norm += f2[f] * f2[f]
+    f2Norm = sqrt(f2Norm)         
+   
+    dotProduct = 0.0
+    featureIntersection = set(f1.keys()) & set(f2.keys())
+    for f in featureIntersection:
+        dotProduct += f1[f] * f2[f]
+    cosineSim = dotProduct / (f1Norm * f2Norm)
+    return 1.0 - cosineSim
+
+def euclidean(f1,f2):
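+    # Squared euclidean distance between sparse feature dicts; a feature
+    # missing from one of the dicts counts as 0.0 there.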
+    diffSum = 0.0        
+    featureUnion = set(f1.keys()) | set(f2.keys())
+    for f in featureUnion:
+        try:
+            f1Val = f1[f]
+        except:
+            f1Val = 0.0
+        try:
+            f2Val = f2[f]
+        except:
+            f2Val = 0.0
+        diff = f1Val - f2Val
+        diffSum += diff * diff
+        #if f in f1.keys():
+        #    diffSum += log(2+self.pointCount/self.featureCounts[f]) * diff * diff
+        #else:
+        #    diffSum += diff * diff            
+    #print diffSum,f1,f2
+    return diffSum
+
+class KNN(object):
+    '''
+    A basic KNN ranker.
+    '''
+
+    def __init__(self,dicts,metric=cosine):
+        '''
+        Constructor
+        '''
+        self.points = dicts.featureDict
+        self.metric = metric
+
+    def initializeModel(self,_trainData,_dicts):  
+        """
+        Build basic model from training data.
+        """
+        pass
+    
+    def update(self,dataPoint,features,dependencies):
+        assert self.points[dataPoint] == features
+        
+    def overwrite(self,problemId,newDependencies,dicts):
+        # Taken care of by dicts
+        pass
+    
+    def delete(self,dataPoint,features,dependencies):
+        # Taken care of by dicts
+        pass      
+    
+    def predict(self,features,accessibles,dicts):
+        predictions = map(lambda x: self.metric(features,self.points[x]),accessibles)
+        predictions = array(predictions)
+        perm = predictions.argsort()
+        return array(accessibles)[perm],predictions[perm]
+    
+    def save(self,fileName):
+        OStream = open(fileName, 'wb')
+        dump((self.points,self.metric),OStream)
+        OStream.close()
+
+    def load(self,fileName):
+        IStream = open(fileName, 'rb')
+        self.points,self.metric = load(IStream)
+        IStream.close()
+
+if __name__ == '__main__':
+    pass    
+        
+        
\ No newline at end of file
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/KNNs.py	Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,105 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from math import log
+from KNN import KNN,cosine
+from numpy import array
+
+class KNNAdaptPointFeatures(KNN):
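+    """
+    KNN variant that adapts the stored points: on each update, the feature
+    vectors of the new fact's dependencies are moved towards the fact's own
+    features by a step of size alpha.
+    """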
+    
+    def __init__(self,dicts,metric=cosine,alpha = 0.05):
+        self.points = dicts.featureDict
+        self.metric = self.euclidean # the metric argument is ignored in favour of the adaptive euclidean below
+        self.alpha = alpha
+        self.count = 0
+        self.featureCount = {}
+
+    def initializeModel(self,trainData,dicts):  
+        """
+        Build basic model from training data.
+        """
+        IS = open(dicts.accFile,'r')
+        for line in IS:
+            line = line.split(':')
+            name = line[0]
+            nameId = dicts.nameIdDict[name]
+            features = dicts.featureDict[nameId]
+            dependencies = dicts.dependenciesDict[nameId] 
+            self.update(nameId, features, dependencies)
+        IS.close()
+        
+    def update(self,dataPoint,features,dependencies):
+        self.count += 1
+        for f in features.iterkeys():
+            try:
+                self.featureCount[f] += 1
+            except:
+                self.featureCount[f] = 1
+        for d in dependencies:
+            dFeatures = self.points[d]
+            featureUnion = set(dFeatures.keys()) | set(features.keys())
+            for f in featureUnion:
+                try:
+                    pVal = features[f]
+                except:
+                    pVal = 0.0
+                try:
+                    dVal = dFeatures[f]
+                except:
+                    dVal = 0.0
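+                # Exponential moving average: pull the dependency's feature
+                # value a step of size alpha towards the query's value.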
+                newDVal = dVal + self.alpha * (pVal - dVal)                
+                dFeatures[f] = newDVal           
+        
+    def euclidean(self,f1,f2):
+        diffSum = 0.0        
+        f1Set = set(f1.keys())
+        featureUnion = f1Set | set(f2.keys())
+        for f in featureUnion:
+            if not self.featureCount.has_key(f):
+                continue
+            if self.featureCount[f] == 1:
+                continue
+            try:
+                f1Val = f1[f]
+            except:
+                f1Val = 0.0
+            try:
+                f2Val = f2[f]
+            except:
+                f2Val = 0.0
+            diff = f1Val - f2Val
+            if f in f1Set:
+                diffSum += log(2+self.count/self.featureCount[f]) * diff * diff
+            else:
+                diffSum += diff * diff
+        #print diffSum,f1,f2
+        return diffSum 
+
+class KNNUrban(KNN):
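+    # KNN where a fact's distance is also propagated to its dependencies:
+    # every dependency of a near neighbour gets its score lowered, i.e. is
+    # ranked as more relevant. (The name presumably refers to Josef Urban's
+    # k-NN premise-selection ranker.)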
+    def __init__(self,dicts,metric=cosine,nrOfNeighbours = 40):
+        self.points = dicts.featureDict
+        self.metric = metric    
+        self.nrOfNeighbours = nrOfNeighbours # Ignored at the moment
+    
+    def predict(self,features,accessibles,dicts):
+        predictions = map(lambda x: self.metric(features,self.points[x]),accessibles)
+        pDict = dict(zip(accessibles,predictions))
+        for a,p in zip(accessibles,predictions):
+            aDeps = dicts.dependenciesDict[a]
+            for d in aDeps:
+                pDict[d] -= p 
+        predictions = []
+        names = []
+        for n,p in pDict.items():
+            predictions.append(p)
+            names.append(n)        
+        predictions = array(predictions)
+        perm = predictions.argsort()
+        return array(names)[perm],predictions[perm]
+    
+    
+         
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py	Thu Sep 12 09:59:45 2013 +0200
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python
 #     Title:      HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
 #     Author:     Daniel Kuehlwein, ICIS, Radboud University Nijmegen
 #     Copyright   2012
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Thu Sep 12 09:59:45 2013 +0200
@@ -1,18 +1,13 @@
 #     Title:      HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
 #     Author:     Daniel Kuehlwein, ICIS, Radboud University Nijmegen
-#     Copyright   2012
+#     Copyright   2012-2013
 #
 # Persistent dictionaries: accessibility, dependencies, and features.
 
-'''
-Created on Jul 12, 2012
-
-@author: daniel
-'''
-
+import logging,sys
 from os.path import join
 from Queue import Queue
-from readData import create_accessible_dict,create_dependencies_dict,create_feature_dict
+from readData import create_accessible_dict,create_dependencies_dict
 from cPickle import load,dump
 
 class Dictionaries(object):
@@ -32,20 +27,17 @@
         self.dependenciesDict = {}
         self.accessibleDict = {}
         self.expandedAccessibles = {}
-        # For SInE features
-        self.useSine = False
-        self.featureCountDict = {} 
-        self.triggerFeaturesDict = {} 
-        self.featureTriggeredFormulasDict = {}
+        self.accFile =  ''
         self.changed = True
 
     """
     Init functions. nameIdDict, idNameDict, featureIdDict, articleDict get filled!
     """
-    def init_featureDict(self,featureFile,sineFeatures):
-        self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
-         create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
-                             self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
+    def init_featureDict(self,featureFile):
+        self.create_feature_dict(featureFile)
+        #self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
+        # create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
+        #                     self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
     def init_dependenciesDict(self,depFile):
         self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
     def init_accessibleDict(self,accFile):
@@ -54,16 +46,37 @@
     def init_all(self,args):
         self.featureFileName = 'mash_features'
         self.accFileName = 'mash_accessibility'
-        self.useSine = args.sineFeatures
         featureFile = join(args.inputDir,self.featureFileName)
         depFile = join(args.inputDir,args.depFile)
-        accFile = join(args.inputDir,self.accFileName)
-        self.init_featureDict(featureFile,self.useSine)
-        self.init_accessibleDict(accFile)
+        self.accFile = join(args.inputDir,self.accFileName)
+        self.init_featureDict(featureFile)
+        self.init_accessibleDict(self.accFile)
         self.init_dependenciesDict(depFile)
         self.expandedAccessibles = {}
         self.changed = True
 
+    def create_feature_dict(self,inputFile):
+        logger = logging.getLogger('create_feature_dict')
+        self.featureDict = {}
+        IS = open(inputFile,'r')
+        for line in IS:
+            line = line.split(':')
+            name = line[0]
+            # Name Id
+            if self.nameIdDict.has_key(name):
+                logger.warning('%s appears twice in the feature file. Aborting.',name)
+                sys.exit(-1)
+            else:
+                self.nameIdDict[name] = self.maxNameId
+                self.idNameDict[self.maxNameId] = name
+                nameId = self.maxNameId
+                self.maxNameId += 1
+            features = self.get_features(line)
+            # Store results
+            self.featureDict[nameId] = features
+        IS.close()
+        return
+
     def get_name_id(self,name):
         """
         Return the Id for a name.
@@ -82,27 +95,23 @@
     def add_feature(self,featureName):
         if not self.featureIdDict.has_key(featureName):
             self.featureIdDict[featureName] = self.maxFeatureId
-            if self.useSine:
-                self.featureCountDict[self.maxFeatureId] = 0
             self.maxFeatureId += 1
             self.changed = True
         fId = self.featureIdDict[featureName]
-        if self.useSine:
-            self.featureCountDict[fId] += 1
         return fId
 
     def get_features(self,line):
-        # Feature Ids
         featureNames = [f.strip() for f in line[1].split()]
-        features = []
+        features = {}
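+        # maps feature id -> weight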
         for fn in featureNames:
             tmp = fn.split('=')
-            weight = 1.0
+            weight = 1.0 
             if len(tmp) == 2:
                 fn = tmp[0]
                 weight = float(tmp[1])
             fId = self.add_feature(tmp[0])
-            features.append((fId,weight))
+            features[fId] = weight
+            #features[fId] = 1.0 ###
         return features
 
     def expand_accessibles(self,acc):
@@ -142,16 +151,6 @@
         self.accessibleDict[nameId] = unExpAcc
         features = self.get_features(line)
         self.featureDict[nameId] = features
-        if self.useSine:
-            # SInE Features
-            minFeatureCount = min([self.featureCountDict[f] for f,_w in features])
-            triggerFeatures = [f for f,_w in features if self.featureCountDict[f] == minFeatureCount]
-            self.triggerFeaturesDict[nameId] = triggerFeatures
-            for f in triggerFeatures:
-                if self.featureTriggeredFormulasDict.has_key(f): 
-                    self.featureTriggeredFormulasDict[f].append(nameId)
-                else:
-                    self.featureTriggeredFormulasDict[f] = [nameId]        
         self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]        
         self.changed = True
         return nameId
@@ -219,14 +218,12 @@
         if self.changed:
             dictsStream = open(fileName, 'wb')
             dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
-                self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine),dictsStream)
+                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict),dictsStream)
             self.changed = False
             dictsStream.close()
     def load(self,fileName):
         dictsStream = open(fileName, 'rb')
         self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
-              self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine = load(dictsStream)
+              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
         self.changed = False
         dictsStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Thu Sep 12 09:59:45 2013 +0200
@@ -41,7 +41,7 @@
     path = dirname(realpath(__file__))
     spawnDaemon(os.path.join(path,'server.py'))
     serverIsUp=False
-    for _i in range(10):
+    for _i in range(20):
         # Test if server is up
         try:
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -118,11 +118,12 @@
                 OS.write('%s\n' % received)
         OS.close()
         IS.close()
+        
+        # Statistics
+        if args.statistics:
+            received = communicate('avgStats',args.host,args.port)
+            logger.info(received)
 
-    # Statistics
-    if args.statistics:
-        received = communicate('avgStats',args.host,args.port)
-        logger.info(received)
     if args.saveModels:
         communicate('save',args.host,args.port)
 
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mashTest.py	Thu Sep 12 00:34:48 2013 +0200
+++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-'''
-Created on Aug 20, 2013
-
-@author: daniel
-'''
-from mash import mash
-
-if __name__ == "__main__":
-    args = ['--statistics', '--init', '--inputDir', '../data/20130118/Jinja', '--log', '../tmp/auth.log', '--modelFile', '../tmp/m0', '--dictsFile', '../tmp/d0','--NBDefaultPriorWeight', '20.0', '--NBDefVal', '-15.0', '--NBPosWeight', '10.0']
-    mash(args)
-    args = ['-i', '../data/20130118/Jinja/mash_commands', '-p', '../tmp/auth.pred0', '--statistics', '--cutOff', '500', '--log', '../tmp/auth.log','--modelFile', '../tmp/m0', '--dictsFile', '../tmp/d0']
-    mash(args) 
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py	Thu Sep 12 09:59:45 2013 +0200
@@ -22,14 +22,13 @@
     parser.add_argument('--depFile', default='mash_dependencies',
                         help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
     
-    parser.add_argument('--algorithm',default='nb',help="Which learning algorithm is used. nb = Naive Bayes,predef=predefined. Default=nb.")
+    parser.add_argument('--algorithm',default='nb',help="Which learning algorithm is used. nb = Naive Bayes, KNN = k-nearest neighbours, predef = predefined. Default=nb.")
+    parser.add_argument('--predef',help="File containing the predefined suggestions. Only used when algorithm = predef.")
     # NB Parameters
     parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float)
     parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float)
     parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
-    # TODO: Rename to sineFeatures
-    parser.add_argument('--sineFeatures',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
-    parser.add_argument('--sineWeight',default=0.5,help="How much the SInE prior is weighted. Default=0.5.",type=float)
+    parser.add_argument('--expandFeatures',default=False,action='store_true',help="Learning-based feature expansion. Default=False.")
     
     parser.add_argument('--statistics',default=False,action='store_true',help="Create and show statistics for the top CUTOFF predictions.\
                         WARNING: This will make the program a lot slower! Default=False.")
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Thu Sep 12 09:59:45 2013 +0200
@@ -14,55 +14,6 @@
 
 import sys,logging
 
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,\
-                        triggerFeaturesDict,featureTriggeredFormulasDict,sineFeatures,inputFile):
-    logger = logging.getLogger('create_feature_dict')
-    featureDict = {}
-    IS = open(inputFile,'r')
-    for line in IS:
-        line = line.split(':')
-        name = line[0]
-        # Name Id
-        if nameIdDict.has_key(name):
-            logger.warning('%s appears twice in the feature file. Aborting.',name)
-            sys.exit(-1)
-        else:
-            nameIdDict[name] = maxNameId
-            idNameDict[maxNameId] = name
-            nameId = maxNameId
-            maxNameId += 1
-        # Feature Ids
-        featureNames = [f.strip() for f in line[1].split()]
-        features = []     
-        minFeatureCount = 9999999   
-        for fn in featureNames:
-            weight = 1.0
-            tmp = fn.split('=')
-            if len(tmp) == 2:
-                fn = tmp[0]
-                weight = float(tmp[1])
-            if not featureIdDict.has_key(fn):
-                featureIdDict[fn] = maxFeatureId
-                featureCountDict[maxFeatureId] = 0
-                maxFeatureId += 1
-            fId = featureIdDict[fn]
-            features.append((fId,weight))
-            if sineFeatures:
-                featureCountDict[fId] += 1
-                minFeatureCount = min(minFeatureCount,featureCountDict[fId])
-        # Store results
-        featureDict[nameId] = features
-        if sineFeatures:
-            triggerFeatures = [f for f,_w in features if featureCountDict[f] == minFeatureCount]
-            triggerFeaturesDict[nameId] = triggerFeatures
-            for f in triggerFeatures:
-                if featureTriggeredFormulasDict.has_key(f): 
-                    featureTriggeredFormulasDict[f].append(nameId)
-                else:
-                    featureTriggeredFormulasDict[f] = [nameId]
-    IS.close()
-    return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeaturesDict,featureTriggeredFormulasDict
-
 def create_dependencies_dict(nameIdDict,inputFile):
     logger = logging.getLogger('create_dependencies_dict')
     dependenciesDict = {}
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/server.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/server.py	Thu Sep 12 09:59:45 2013 +0200
@@ -7,10 +7,15 @@
 
 import SocketServer,os,string,logging
 from multiprocessing import Manager
+from threading import Timer
 from time import time
 from dictionaries import Dictionaries
 from parameters import init_parser
 from sparseNaiveBayes import sparseNBClassifier
+from KNN import KNN,euclidean
+from KNNs import KNNAdaptPointFeatures,KNNUrban
+from predefined import Predefined
+from ExpandFeatures import ExpandFeatures
 from stats import Statistics
 
 
@@ -19,6 +24,21 @@
         SocketServer.ThreadingTCPServer.__init__(self,*args, **kwargs)
         self.manager = Manager()
         self.lock = Manager().Lock()
+        self.idle_timeout = 28800.0 # 8 hours in seconds
+        self.idle_timer = Timer(self.idle_timeout, self.shutdown)
+        self.idle_timer.start()        
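+        # Idle watchdog: every handled request cancels and re-arms this timer
+        # with save_and_shutdown (see MaShHandler.handle). The initial timer
+        # only shuts down, since no model has been loaded yet that could be saved.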
+        
+    def save(self):
+        # Save Models
+        self.model.save(self.args.modelFile)
+        self.dicts.save(self.args.dictsFile)
+        if not self.args.saveStats == None:
+            statsFile = os.path.join(self.args.outputDir,self.args.saveStats)
+            self.stats.save(statsFile)   
+               
+    def save_and_shutdown(self):     
+        self.save()          
+        self.shutdown()
 
 class MaShHandler(SocketServer.BaseRequestHandler):
 
@@ -28,25 +48,32 @@
         else:
             argv = argv.split(';')
             self.server.args = init_parser(argv)
-        # Pick model
-        if self.server.args.algorithm == 'nb':
-            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
-        else: # Default case
-            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
         # Load all data
-        # TODO: rewrite dicts for concurrency and without sine
         self.server.dicts = Dictionaries()
         if os.path.isfile(self.server.args.dictsFile):
             self.server.dicts.load(self.server.args.dictsFile)            
         elif self.server.args.init:
             self.server.dicts.init_all(self.server.args)
+        # Pick model
+        if self.server.args.algorithm == 'nb':
+            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
+        elif self.server.args.algorithm == 'KNN':
+            #self.server.model = KNN(self.server.dicts)
+            self.server.model = KNNAdaptPointFeatures(self.server.dicts)
+        elif self.server.args.algorithm == 'predef':
+            self.server.model = Predefined(self.server.args.predef)
+        else: # Default case
+            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
+        if self.server.args.expandFeatures:
+            self.server.expandFeatures = ExpandFeatures(self.server.dicts)
+            self.server.expandFeatures.initialize(self.server.dicts)
         # Create Model
         if os.path.isfile(self.server.args.modelFile):
             self.server.model.load(self.server.args.modelFile)          
         elif self.server.args.init:
             trainData = self.server.dicts.featureDict.keys()
             self.server.model.initializeModel(trainData,self.server.dicts)
-            
+           
         if self.server.args.statistics:
             self.server.stats = Statistics(self.server.args.cutOff)
             self.server.statementCounter = 1
@@ -77,6 +104,8 @@
                 self.server.logger.debug('Poor predictions: %s',bp)
             self.server.statementCounter += 1
 
+        if self.server.args.expandFeatures:
+            self.server.expandFeatures.update(self.server.dicts.featureDict[problemId],self.server.dicts.dependenciesDict[problemId])
         # Update Dependencies, p proves p
         self.server.dicts.dependenciesDict[problemId] = [problemId]+self.server.dicts.dependenciesDict[problemId]
         self.server.model.update(problemId,self.server.dicts.featureDict[problemId],self.server.dicts.dependenciesDict[problemId])
@@ -92,22 +121,25 @@
         self.server.computeStats = True
         if self.server.args.algorithm == 'predef':
             return
-        name,features,accessibles,hints,numberOfPredictions = self.server.dicts.parse_problem(self.data)  
+        name,features,accessibles,hints,numberOfPredictions = self.server.dicts.parse_problem(self.data)
         if numberOfPredictions == None:
             numberOfPredictions = self.server.args.numberOfPredictions
         if not hints == []:
             self.server.model.update('hints',features,hints)
-        
+        if self.server.args.expandFeatures:
+            features = self.server.expandFeatures.expand(features)
         # Create predictions
         self.server.logger.debug('Starting computation for line %s',self.server.callCounter)
-        predictionsFeatures = features                    
-        self.server.predictions,predictionValues = self.server.model.predict(predictionsFeatures,accessibles,self.server.dicts)
+                
+        self.server.predictions,predictionValues = self.server.model.predict(features,accessibles,self.server.dicts)
         assert len(self.server.predictions) == len(predictionValues)
         self.server.logger.debug('Time needed: '+str(round(time()-self.startTime,2)))
 
         # Output        
         predictionNames = [str(self.server.dicts.idNameDict[p]) for p in self.server.predictions[:numberOfPredictions]]
-        predictionValues = [str(x) for x in predictionValues[:numberOfPredictions]]
+        #predictionValues = [str(x) for x in predictionValues[:numberOfPredictions]]
+        #predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]
+        #predictionsString = string.join(predictionsStringList,' ')
         predictionsString = string.join(predictionNames,' ')
         outString = '%s: %s' % (name,predictionsString)
         self.request.sendall(outString)
@@ -115,27 +147,18 @@
     def shutdown(self,saveModels=True):
         self.request.sendall('Shutting down server.')
         if saveModels:
-            self.save()
+            self.server.save()
         self.server.shutdown()
     
-    def save(self):
-        # Save Models
-        self.server.model.save(self.server.args.modelFile)
-        self.server.dicts.save(self.server.args.dictsFile)
-        if not self.server.args.saveStats == None:
-            statsFile = os.path.join(self.server.args.outputDir,self.server.args.saveStats)
-            self.server.stats.save(statsFile)
-    
     def handle(self):
         # self.request is the TCP socket connected to the client
         self.data = self.request.recv(4194304).strip()
         self.server.lock.acquire()
-        #print "{} wrote:".format(self.client_address[0])
         self.startTime = time()  
         if self.data == 'shutdown':
             self.shutdown()         
         elif self.data == 'save':
-            self.save()       
+            self.server.save()       
         elif self.data.startswith('i'):            
             self.init(self.data[2:])
         elif self.data.startswith('!'):
@@ -153,15 +176,16 @@
         else:
             self.request.sendall('Unspecified input format: \n%s',self.data)
         self.server.callCounter += 1
+        # Update idle shutdown timer
+        self.server.idle_timer.cancel()
+        self.server.idle_timer = Timer(self.server.idle_timeout, self.server.save_and_shutdown)
+        self.server.idle_timer.start()        
         self.server.lock.release()
 
 if __name__ == "__main__":
     HOST, PORT = "localhost", 9255
-    #print 'Started Server'
-    # Create the server, binding to localhost on port 9999
     SocketServer.TCPServer.allow_reuse_address = True
     server = ThreadingTCPServer((HOST, PORT), MaShHandler)
-    #server = SocketServer.TCPServer((HOST, PORT), MaShHandler)
 
     # Activate the server; this will keep running until you
     # interrupt the program with Ctrl-C
@@ -171,4 +195,4 @@
 
     
     
-    
+    
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Thu Sep 12 09:59:45 2013 +0200
@@ -36,7 +36,7 @@
             dFeatureCounts = {}
             # Add p proves p with weight self.defaultPriorWeight
             if not self.defaultPriorWeight == 0:            
-                for f,_w in dicts.featureDict[d]:
+                for f in dicts.featureDict[d].iterkeys():
                     dFeatureCounts[f] = self.defaultPriorWeight
             self.counts[d] = [self.defaultPriorWeight,dFeatureCounts]
 
@@ -44,7 +44,7 @@
             for dep in keyDeps:
                 self.counts[dep][0] += 1
                 depFeatures = dicts.featureDict[key]
-                for f,_w in depFeatures:
+                for f in depFeatures.iterkeys():
                     if self.counts[dep][1].has_key(f):
                         self.counts[dep][1][f] += 1
                     else:
@@ -59,12 +59,12 @@
             dFeatureCounts = {}            
             # Give p |- p a higher weight
             if not self.defaultPriorWeight == 0:               
-                for f,_w in features:
+                for f in features.iterkeys():
                     dFeatureCounts[f] = self.defaultPriorWeight
             self.counts[dataPoint] = [self.defaultPriorWeight,dFeatureCounts]            
         for dep in dependencies:
             self.counts[dep][0] += 1
-            for f,_w in features:
+            for f in features.iterkeys():
                 if self.counts[dep][1].has_key(f):
                     self.counts[dep][1][f] += 1
                 else:
@@ -97,12 +97,14 @@
         """
         tau = 0.05 # Jasmin, change value here
         predictions = []
+        #observedFeatures = [f for f,_w in features]
+        observedFeatures = features.keys()
         for a in accessibles:
             posA = self.counts[a][0]
             fA = set(self.counts[a][1].keys())
             fWeightsA = self.counts[a][1]
             resultA = log(posA)
-            for f,w in features:
+            for f,w in features.iteritems():
                 # DEBUG
                 #w = 1.0
                 if f in fA:
@@ -114,9 +116,10 @@
                 else:
                     resultA += w*self.defVal
             if not tau == 0.0:
-                observedFeatures = [f for f,_w in features]
                 missingFeatures = list(fA.difference(observedFeatures))
-                sumOfWeights = sum([log(float(fWeightsA[x])/posA) for x in missingFeatures])
+                #sumOfWeights = sum([log(float(fWeightsA[x])/posA) for x in missingFeatures])  # slower
+                sumOfWeights = sum([log(float(fWeightsA[x])) for x in missingFeatures]) - log(posA) * len(missingFeatures) #DEFAULT
+                #sumOfWeights = sum([log(float(fWeightsA[x])/self.totalFeatureCounts[x]) for x in missingFeatures]) - log(posA) * len(missingFeatures)
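+                # The DEFAULT variant is equivalent to the "slower" line above:
+                # log(w/posA) = log(w) - log(posA), so log(posA) can be factored
+                # out of the sum and multiplied by len(missingFeatures).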
                 resultA -= tau * sumOfWeights
             predictions.append(resultA)
         predictions = array(predictions)
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py	Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py	Thu Sep 12 09:59:45 2013 +0200
@@ -97,7 +97,7 @@
                     badPreds.append(dep)
                     recall100 = len(predictions)+1
                     positives+=1
-            self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\
+            self.logger.debug('Dependencies missing for %s in cutoff predictions! Estimating Statistics.',\
                               string.join([str(dep) for dep in missing],','))
 
         if positives == 0 or negatives == 0:
@@ -113,7 +113,7 @@
         self.badPreds = badPreds
         self.avgAvailable += available
         self.avgDepNr += depNr
-        self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
+        self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff: %s',\
                           statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff)
 
     def printAvg(self):
@@ -135,7 +135,7 @@
         else:
             medianrecall100 = float(sorted(self.recall100Median)[nrDataPoints/2] + sorted(self.recall100Median)[nrDataPoints/2 + 1])/2
 
-        returnString = 'avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff:%s' %\
+        returnString = 'avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff: %s' %\
                          (round(100*self.avgAUC/self.problems,2),round(100*medianAUC,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),round(medianrecall100,2),self.cutOff)
         self.logger.info(returnString)
         return returnString
@@ -143,44 +143,6 @@
         """
         self.logger.info('avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff:%s', \
                          round(100*self.avgAUC/self.problems,2),round(100*medianAUC,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),round(medianrecall100,2),self.cutOff)
-
-        #try:
-        #if True:
-        if False:
-            from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
-            avgRecall = [float(x)/self.problems for x in self.recallData]
-            figure('Recall')
-            plot(range(self.cutOff),avgRecall)
-            ylabel('Average Recall')
-            xlabel('Highest ranked premises')
-            axis([0,self.cutOff,0.0,1.0])
-            figure('100%Recall')
-            plot(range(self.cutOff),self.recall100Data)
-            ylabel('100%Recall')
-            xlabel('Highest ranked premises')
-            axis([0,self.cutOff,0,self.problems])
-            figure('AUC Histogram')
-            hist(self.aucData,bins=100)
-            ylabel('Problems')
-            xlabel('AUC')
-            maxCount = max(self.premiseOccurenceCounter.values())
-            minCount = min(self.premiseOccurenceCounter.values())
-            figure('Dependency Occurances')
-            hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left')
-            #ylabel('Occurences')
-            xlabel('Number of Times a Dependency Occurs')
-            figure('Dependency Appearance in Problems after Introduction.')
-            hist(self.depAppearances,bins=50)
-            figure('Dependency Appearance in Problems after Introduction in Percent.')
-            xAxis = range(max(self.depAppearances)+1)
-            yAxis = [0] * (max(self.depAppearances)+1)
-            for val in self.depAppearances:
-                yAxis[val] += 1
-            yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis]
-            plot(xAxis,yAxis)
-            show()
-        #except:
-        #    self.logger.warning('Matplotlib module missing. Skipping graphs.')
         """
 
     def save(self,fileName):