src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py
changeset 53555 12251bc889f1
child 53789 8d9f4e89d8c8
equal deleted inserted replaced
53554:78fe0002024d 53555:12251bc889f1
       
     1 '''
       
     2 Created on Aug 21, 2013
       
     3 
       
     4 @author: daniel
       
     5 '''
       
     6 
       
     7 from math import log
       
     8 from gensim import corpora, models, similarities
       
     9 
       
    10 class ExpandFeatures(object):
       
    11 
       
    12     def __init__(self,dicts):
       
    13         self.dicts = dicts
       
    14         self.featureMap = {}
       
    15         self.alpha = 0.1
       
    16         self.featureCounts = {}
       
    17         self.counter = 0        
       
    18         self.corpus = []
       
    19         self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
       
    20 
       
    21     def initialize(self,dicts):
       
    22         self.dicts = dicts
       
    23         IS = open(dicts.accFile,'r')
       
    24         for line in IS:
       
    25             line = line.split(':')
       
    26             name = line[0]
       
    27             #print 'name',name
       
    28             nameId = dicts.nameIdDict[name]    
       
    29             features = dicts.featureDict[nameId]
       
    30             dependencies = dicts.dependenciesDict[nameId]   
       
    31             x = [self.dicts.idNameDict[d] for d in dependencies]
       
    32             #print x  
       
    33             self.update(features, dependencies)
       
    34             self.corpus.append([(x,1) for x in features.keys()])
       
    35         IS.close()
       
    36         print 'x'
       
    37         #self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
       
    38         print self.LSIModel
       
    39         print 'y'
       
    40         
       
    41     def update(self,features,dependencies):
       
    42         self.counter += 1
       
    43         self.corpus.append([(x,1) for x in features.keys()])
       
    44         self.LSIModel.add_documents([[(x,1) for x in features.keys()]])
       
    45         """
       
    46         for f in features.iterkeys():
       
    47             try:
       
    48                 self.featureCounts[f] += 1
       
    49             except:
       
    50                 self.featureCounts[f] = 1
       
    51             if self.featureCounts[f] > 100:
       
    52                 continue
       
    53             try:
       
    54                 self.featureMap[f] = self.featureMap[f].intersection(features.keys())
       
    55             except:
       
    56                 self.featureMap[f] = set(features.keys())
       
    57             #print 'fOld',len(fMap),self.featureCounts[f],len(dependencies)
       
    58 
       
    59             for d in dependencies[1:]:
       
    60                 #print 'dep',self.dicts.idNameDict[d]
       
    61                 dFeatures = self.dicts.featureDict[d]
       
    62                 for df in dFeatures.iterkeys():
       
    63                     if self.featureCounts.has_key(df):
       
    64                         if self.featureCounts[df] > 20:
       
    65                             continue
       
    66                     else:
       
    67                         print df
       
    68                     try:
       
    69                         fMap[df] += self.alpha * (1.0 - fMap[df])
       
    70                     except:
       
    71                         fMap[df] = self.alpha
       
    72             """
       
    73             #print 'fNew',len(fMap)
       
    74             
       
    75     def expand(self,features):
       
    76         #print self.corpus[:50]        
       
    77         #print corpus
       
    78         #tfidfmodel = models.TfidfModel(self.corpus, normalize=True)        
       
    79         #print features.keys()        
       
    80         #tfidfcorpus = [tfidfmodel[x] for x in self.corpus]
       
    81         #newFeatures = LSI[[(x,1) for x in features.keys()]]
       
    82         newFeatures = self.LSIModel[[(x,1) for x in features.keys()]]
       
    83         print features
       
    84         print newFeatures
       
    85         #print newFeatures
       
    86         
       
    87         """
       
    88         newFeatures = dict(features)
       
    89         for f in features.keys():
       
    90             try:
       
    91                 fC = self.featureCounts[f]
       
    92             except:
       
    93                 fC = 0.5
       
    94             newFeatures[f] = log(float(8+self.counter) / fC)
       
    95         #nrOfFeatures = float(len(features))
       
    96         addedCount = 0
       
    97         alpha = 0.2
       
    98         #"""
       
    99         
       
   100         """
       
   101         consideredFeatures = []
       
   102         while len(newFeatures) < 30:
       
   103             #alpha = alpha * 0.5
       
   104             minF = None
       
   105             minFrequence = 1000000
       
   106             for f in newFeatures.iterkeys():
       
   107                 if f in consideredFeatures:
       
   108                     continue
       
   109                 try:
       
   110                     if self.featureCounts[f] < minFrequence:
       
   111                         minF = f
       
   112                 except:
       
   113                     pass
       
   114             if minF == None:
       
   115                 break
       
   116             # Expand minimal feature
       
   117             consideredFeatures.append(minF)
       
   118             for expF in self.featureMap[minF]:
       
   119                 if not newFeatures.has_key(expF):
       
   120                     fC = self.featureCounts[minF]
       
   121                     newFeatures[expF] = alpha*log(float(8+self.counter) / fC)
       
   122         #print features, newFeatures
       
   123         #"""
       
   124         """
       
   125         for f in features.iterkeys():
       
   126             try:
       
   127                 self.featureCounts[f] += 1
       
   128             except:
       
   129                 self.featureCounts[f] = 0            
       
   130             if self.featureCounts[f] > 10:
       
   131                 continue            
       
   132             addedCount += 1
       
   133             try:
       
   134                 fmap = self.featureMap[f]
       
   135             except:
       
   136                 self.featureMap[f] = {}
       
   137                 fmap = {}
       
   138             for nf,nv in fmap.iteritems():
       
   139                 try:
       
   140                     newFeatures[nf] += nv
       
   141                 except:
       
   142                     newFeatures[nf] = nv
       
   143         if addedCount > 0: 
       
   144             for f,w in newFeatures.iteritems():
       
   145                 newFeatures[f] = float(w)/addedCount
       
   146         #"""                    
       
   147         """
       
   148         deleteF = []
       
   149         for f,w in newFeatures.iteritems():
       
   150             if w < 0.1:
       
   151                 deleteF.append(f)
       
   152         for f in deleteF:
       
   153             del newFeatures[f]
       
   154         """
       
   155         #print 'fold',len(features)
       
   156         #print 'fnew',len(newFeatures)
       
   157         return dict(newFeatures)
       
   158 
       
   159 if __name__ == "__main__":
       
   160     pass  # running this module as a script does nothing
       
   161     
       
   162