|
1 ''' |
|
2 Created on Aug 21, 2013 |
|
3 |
|
4 @author: daniel |
|
5 ''' |
|
6 |
|
7 from math import log |
|
8 from gensim import corpora, models, similarities |
|
9 |
|
class ExpandFeatures(object):
    """Map feature dictionaries onto the output of an incrementally trained LSI model.

    Each feature dictionary is turned into a bag-of-words document in which
    every key gets weight 1.  Documents are fed to ``self.LSIModel`` through
    ``update`` and later projected through the same model by ``expand``.

    Attributes:
        dicts: The dictionaries object handed to ``__init__``/``initialize``.
        counter: Number of ``update`` calls seen so far.
        corpus: One bag-of-words document per ``update`` call.
        LSIModel: The gensim ``LsiModel`` instance that is trained and queried.
        featureMap, featureCounts, alpha: Kept so existing readers of these
            attributes keep working; none of the methods below use them.
    """

    def __init__(self, dicts):
        """Create an empty corpus and the LSI model that will be trained on it.

        Args:
            dicts: Object holding the lookup tables used by ``initialize``.
        """
        self.dicts = dicts
        self.featureMap = {}
        self.alpha = 0.1
        self.featureCounts = {}
        self.counter = 0
        self.corpus = []
        # NOTE(review): the model is built from an empty corpus and without an
        # id2word mapping -- confirm that gensim sizes the vocabulary
        # correctly for documents added later through add_documents().
        self.LSIModel = models.lsimodel.LsiModel(self.corpus, num_topics=500)

    @staticmethod
    def _as_document(features):
        """Return ``features`` as a list of ``(key, 1)`` pairs."""
        return [(feature, 1) for feature in features.keys()]

    def initialize(self, dicts):
        """Feed every entry listed in ``dicts.accFile`` into the model.

        Each line of the file is expected to start with a name followed by
        ``:``.  The name is resolved through ``dicts.nameIdDict`` and its
        features and dependencies are passed to ``update``.

        Args:
            dicts: Object providing ``accFile``, ``nameIdDict``,
                ``featureDict`` and ``dependenciesDict``.

        Raises:
            KeyError: If a name or its id is missing from the lookup tables.
        """
        self.dicts = dicts
        # The context manager closes the file even when a lookup below raises.
        with open(dicts.accFile, 'r') as acc_file:
            for line in acc_file:
                name = line.split(':', 1)[0]
                name_id = dicts.nameIdDict[name]
                # update() already records the document in self.corpus, so it
                # must not be appended a second time here.
                self.update(dicts.featureDict[name_id],
                            dicts.dependenciesDict[name_id])
        # Debug markers, kept so that console output stays unchanged.
        print('x')
        print(self.LSIModel)
        print('y')

    def update(self, features, dependencies):
        """Record one more document and train the LSI model on it.

        Args:
            features: Mapping whose keys form the new document.
            dependencies: Accepted for interface compatibility; not used.
        """
        self.counter += 1
        document = self._as_document(features)
        self.corpus.append(document)
        self.LSIModel.add_documents([document])

    def expand(self, features):
        """Project ``features`` through the LSI model.

        Args:
            features: Mapping whose keys form the query document.

        Returns:
            A dict built from the pairs the model returns for the document
            (presumably topic id -> weight for gensim's ``LsiModel``).
        """
        projected = self.LSIModel[self._as_document(features)]
        # Debug output, kept so that console output stays unchanged.
        print(features)
        print(projected)
        return dict(projected)
|
158 |
|
# Script entry point: running this module directly does nothing.
if __name__ == "__main__":
    pass
|
161 |
|
162 |