src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py
changeset 53555 12251bc889f1
child 53789 8d9f4e89d8c8
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py	Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,162 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from math import log
+from gensim import corpora, models, similarities
+
+class ExpandFeatures(object):
+
+    def __init__(self,dicts):
+        self.dicts = dicts
+        self.featureMap = {}
+        self.alpha = 0.1
+        self.featureCounts = {}
+        self.counter = 0        
+        self.corpus = []
+        self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+
+    def initialize(self,dicts):
+        self.dicts = dicts
+        IS = open(dicts.accFile,'r')
+        for line in IS:
+            line = line.split(':')
+            name = line[0]
+            #print 'name',name
+            nameId = dicts.nameIdDict[name]    
+            features = dicts.featureDict[nameId]
+            dependencies = dicts.dependenciesDict[nameId]   
+            x = [self.dicts.idNameDict[d] for d in dependencies]
+            #print x  
+            self.update(features, dependencies)
+            self.corpus.append([(x,1) for x in features.keys()])
+        IS.close()
+        print 'x'
+        #self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+        print self.LSIModel
+        print 'y'
+        
+    def update(self,features,dependencies):
+        self.counter += 1
+        self.corpus.append([(x,1) for x in features.keys()])
+        self.LSIModel.add_documents([[(x,1) for x in features.keys()]])
+        """
+        for f in features.iterkeys():
+            try:
+                self.featureCounts[f] += 1
+            except:
+                self.featureCounts[f] = 1
+            if self.featureCounts[f] > 100:
+                continue
+            try:
+                self.featureMap[f] = self.featureMap[f].intersection(features.keys())
+            except:
+                self.featureMap[f] = set(features.keys())
+            #print 'fOld',len(fMap),self.featureCounts[f],len(dependencies)
+
+            for d in dependencies[1:]:
+                #print 'dep',self.dicts.idNameDict[d]
+                dFeatures = self.dicts.featureDict[d]
+                for df in dFeatures.iterkeys():
+                    if self.featureCounts.has_key(df):
+                        if self.featureCounts[df] > 20:
+                            continue
+                    else:
+                        print df
+                    try:
+                        fMap[df] += self.alpha * (1.0 - fMap[df])
+                    except:
+                        fMap[df] = self.alpha
+            """
+            #print 'fNew',len(fMap)
+            
+    def expand(self,features):
+        #print self.corpus[:50]        
+        #print corpus
+        #tfidfmodel = models.TfidfModel(self.corpus, normalize=True)        
+        #print features.keys()        
+        #tfidfcorpus = [tfidfmodel[x] for x in self.corpus]
+        #newFeatures = LSI[[(x,1) for x in features.keys()]]
+        newFeatures = self.LSIModel[[(x,1) for x in features.keys()]]
+        print features
+        print newFeatures
+        #print newFeatures
+        
+        """
+        newFeatures = dict(features)
+        for f in features.keys():
+            try:
+                fC = self.featureCounts[f]
+            except:
+                fC = 0.5
+            newFeatures[f] = log(float(8+self.counter) / fC)
+        #nrOfFeatures = float(len(features))
+        addedCount = 0
+        alpha = 0.2
+        #"""
+        
+        """
+        consideredFeatures = []
+        while len(newFeatures) < 30:
+            #alpha = alpha * 0.5
+            minF = None
+            minFrequence = 1000000
+            for f in newFeatures.iterkeys():
+                if f in consideredFeatures:
+                    continue
+                try:
+                    if self.featureCounts[f] < minFrequence:
+                        minF = f
+                except:
+                    pass
+            if minF == None:
+                break
+            # Expand minimal feature
+            consideredFeatures.append(minF)
+            for expF in self.featureMap[minF]:
+                if not newFeatures.has_key(expF):
+                    fC = self.featureCounts[minF]
+                    newFeatures[expF] = alpha*log(float(8+self.counter) / fC)
+        #print features, newFeatures
+        #"""
+        """
+        for f in features.iterkeys():
+            try:
+                self.featureCounts[f] += 1
+            except:
+                self.featureCounts[f] = 0            
+            if self.featureCounts[f] > 10:
+                continue            
+            addedCount += 1
+            try:
+                fmap = self.featureMap[f]
+            except:
+                self.featureMap[f] = {}
+                fmap = {}
+            for nf,nv in fmap.iteritems():
+                try:
+                    newFeatures[nf] += nv
+                except:
+                    newFeatures[nf] = nv
+        if addedCount > 0: 
+            for f,w in newFeatures.iteritems():
+                newFeatures[f] = float(w)/addedCount
+        #"""                    
+        """
+        deleteF = []
+        for f,w in newFeatures.iteritems():
+            if w < 0.1:
+                deleteF.append(f)
+        for f in deleteF:
+            del newFeatures[f]
+        """
+        #print 'fold',len(features)
+        #print 'fnew',len(newFeatures)
+        return dict(newFeatures)
+
+if __name__ == "__main__":
+    pass
+    
+        
\ No newline at end of file