--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,162 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from math import log
+from gensim import corpora, models, similarities
+
+class ExpandFeatures(object):
+
+ def __init__(self,dicts):
+ self.dicts = dicts
+ self.featureMap = {}
+ self.alpha = 0.1
+ self.featureCounts = {}
+ self.counter = 0
+ self.corpus = []
+ self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+
+ def initialize(self,dicts):
+ self.dicts = dicts
+ IS = open(dicts.accFile,'r')
+ for line in IS:
+ line = line.split(':')
+ name = line[0]
+ #print 'name',name
+ nameId = dicts.nameIdDict[name]
+ features = dicts.featureDict[nameId]
+ dependencies = dicts.dependenciesDict[nameId]
+ x = [self.dicts.idNameDict[d] for d in dependencies]
+ #print x
+ self.update(features, dependencies)
+ self.corpus.append([(x,1) for x in features.keys()])
+ IS.close()
+ print 'x'
+ #self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+ print self.LSIModel
+ print 'y'
+
+ def update(self,features,dependencies):
+ self.counter += 1
+ self.corpus.append([(x,1) for x in features.keys()])
+ self.LSIModel.add_documents([[(x,1) for x in features.keys()]])
+ """
+ for f in features.iterkeys():
+ try:
+ self.featureCounts[f] += 1
+ except:
+ self.featureCounts[f] = 1
+ if self.featureCounts[f] > 100:
+ continue
+ try:
+ self.featureMap[f] = self.featureMap[f].intersection(features.keys())
+ except:
+ self.featureMap[f] = set(features.keys())
+ #print 'fOld',len(fMap),self.featureCounts[f],len(dependencies)
+
+ for d in dependencies[1:]:
+ #print 'dep',self.dicts.idNameDict[d]
+ dFeatures = self.dicts.featureDict[d]
+ for df in dFeatures.iterkeys():
+ if self.featureCounts.has_key(df):
+ if self.featureCounts[df] > 20:
+ continue
+ else:
+ print df
+ try:
+ fMap[df] += self.alpha * (1.0 - fMap[df])
+ except:
+ fMap[df] = self.alpha
+ """
+ #print 'fNew',len(fMap)
+
+ def expand(self,features):
+ #print self.corpus[:50]
+ #print corpus
+ #tfidfmodel = models.TfidfModel(self.corpus, normalize=True)
+ #print features.keys()
+ #tfidfcorpus = [tfidfmodel[x] for x in self.corpus]
+ #newFeatures = LSI[[(x,1) for x in features.keys()]]
+ newFeatures = self.LSIModel[[(x,1) for x in features.keys()]]
+ print features
+ print newFeatures
+ #print newFeatures
+
+ """
+ newFeatures = dict(features)
+ for f in features.keys():
+ try:
+ fC = self.featureCounts[f]
+ except:
+ fC = 0.5
+ newFeatures[f] = log(float(8+self.counter) / fC)
+ #nrOfFeatures = float(len(features))
+ addedCount = 0
+ alpha = 0.2
+ #"""
+
+ """
+ consideredFeatures = []
+ while len(newFeatures) < 30:
+ #alpha = alpha * 0.5
+ minF = None
+ minFrequence = 1000000
+ for f in newFeatures.iterkeys():
+ if f in consideredFeatures:
+ continue
+ try:
+ if self.featureCounts[f] < minFrequence:
+ minF = f
+ except:
+ pass
+ if minF == None:
+ break
+ # Expand minimal feature
+ consideredFeatures.append(minF)
+ for expF in self.featureMap[minF]:
+ if not newFeatures.has_key(expF):
+ fC = self.featureCounts[minF]
+ newFeatures[expF] = alpha*log(float(8+self.counter) / fC)
+ #print features, newFeatures
+ #"""
+ """
+ for f in features.iterkeys():
+ try:
+ self.featureCounts[f] += 1
+ except:
+ self.featureCounts[f] = 0
+ if self.featureCounts[f] > 10:
+ continue
+ addedCount += 1
+ try:
+ fmap = self.featureMap[f]
+ except:
+ self.featureMap[f] = {}
+ fmap = {}
+ for nf,nv in fmap.iteritems():
+ try:
+ newFeatures[nf] += nv
+ except:
+ newFeatures[nf] = nv
+ if addedCount > 0:
+ for f,w in newFeatures.iteritems():
+ newFeatures[f] = float(w)/addedCount
+ #"""
+ """
+ deleteF = []
+ for f,w in newFeatures.iteritems():
+ if w < 0.1:
+ deleteF.append(f)
+ for f in deleteF:
+ del newFeatures[f]
+ """
+ #print 'fold',len(features)
+ #print 'fnew',len(newFeatures)
+ return dict(newFeatures)
+
+if __name__ == "__main__":
+ pass
+
+
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/KNN.py Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,99 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from cPickle import dump,load
+from numpy import array
+from math import sqrt,log
+
+def cosine(f1,f2):
+ f1Norm = 0.0
+ for f in f1.keys():
+ f1Norm += f1[f] * f1[f]
+    #assert f1Norm == sum(map(lambda x,y: x*y,f1.itervalues(),f1.itervalues()))
+ f1Norm = sqrt(f1Norm)
+
+ f2Norm = 0.0
+ for f in f2.keys():
+ f2Norm += f2[f] * f2[f]
+ f2Norm = sqrt(f2Norm)
+
+ dotProduct = 0.0
+ featureIntersection = set(f1.keys()) & set(f2.keys())
+ for f in featureIntersection:
+ dotProduct += f1[f] * f2[f]
+ cosine = dotProduct / (f1Norm * f2Norm)
+ return 1.0 - cosine
+
+def euclidean(f1,f2):
+ diffSum = 0.0
+ featureUnion = set(f1.keys()) | set(f2.keys())
+ for f in featureUnion:
+ try:
+ f1Val = f1[f]
+ except:
+ f1Val = 0.0
+ try:
+ f2Val = f2[f]
+ except:
+ f2Val = 0.0
+ diff = f1Val - f2Val
+ diffSum += diff * diff
+ #if f in f1.keys():
+ # diffSum += log(2+self.pointCount/self.featureCounts[f]) * diff * diff
+ #else:
+ # diffSum += diff * diff
+ #print diffSum,f1,f2
+ return diffSum
+
+class KNN(object):
+ '''
+ A basic KNN ranker.
+ '''
+
+ def __init__(self,dicts,metric=cosine):
+ '''
+ Constructor
+ '''
+ self.points = dicts.featureDict
+ self.metric = metric
+
+ def initializeModel(self,_trainData,_dicts):
+ """
+ Build basic model from training data.
+ """
+ pass
+
+ def update(self,dataPoint,features,dependencies):
+ assert self.points[dataPoint] == features
+
+ def overwrite(self,problemId,newDependencies,dicts):
+ # Taken care of by dicts
+ pass
+
+ def delete(self,dataPoint,features,dependencies):
+ # Taken care of by dicts
+ pass
+
+ def predict(self,features,accessibles,dicts):
+ predictions = map(lambda x: self.metric(features,self.points[x]),accessibles)
+ predictions = array(predictions)
+ perm = predictions.argsort()
+ return array(accessibles)[perm],predictions[perm]
+
+ def save(self,fileName):
+ OStream = open(fileName, 'wb')
+ dump((self.points,self.metric),OStream)
+ OStream.close()
+
+ def load(self,fileName):
+ OStream = open(fileName, 'rb')
+ self.points,self.metric = load(OStream)
+ OStream.close()
+
+if __name__ == '__main__':
+ pass
+
+
\ No newline at end of file
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/KNNs.py Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,105 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from math import log
+from KNN import KNN,cosine
+from numpy import array
+
+class KNNAdaptPointFeatures(KNN):
+
+ def __init__(self,dicts,metric=cosine,alpha = 0.05):
+ self.points = dicts.featureDict
+ self.metric = self.euclidean
+ self.alpha = alpha
+ self.count = 0
+ self.featureCount = {}
+
+ def initializeModel(self,trainData,dicts):
+ """
+ Build basic model from training data.
+ """
+ IS = open(dicts.accFile,'r')
+ for line in IS:
+ line = line.split(':')
+ name = line[0]
+ nameId = dicts.nameIdDict[name]
+ features = dicts.featureDict[nameId]
+ dependencies = dicts.dependenciesDict[nameId]
+ self.update(nameId, features, dependencies)
+ IS.close()
+
+ def update(self,dataPoint,features,dependencies):
+ self.count += 1
+ for f in features.iterkeys():
+ try:
+ self.featureCount[f] += 1
+ except:
+ self.featureCount[f] = 1
+ for d in dependencies:
+ dFeatures = self.points[d]
+ featureUnion = set(dFeatures.keys()) | set(features.keys())
+ for f in featureUnion:
+ try:
+ pVal = features[f]
+ except:
+ pVal = 0.0
+ try:
+ dVal = dFeatures[f]
+ except:
+ dVal = 0.0
+ newDVal = dVal + self.alpha * (pVal - dVal)
+ dFeatures[f] = newDVal
+
+ def euclidean(self,f1,f2):
+ diffSum = 0.0
+ f1Set = set(f1.keys())
+ featureUnion = f1Set | set(f2.keys())
+ for f in featureUnion:
+ if not self.featureCount.has_key(f):
+ continue
+ if self.featureCount[f] == 1:
+ continue
+ try:
+ f1Val = f1[f]
+ except:
+ f1Val = 0.0
+ try:
+ f2Val = f2[f]
+ except:
+ f2Val = 0.0
+ diff = f1Val - f2Val
+ diffSum += diff * diff
+ if f in f1Set:
+ diffSum += log(2+self.count/self.featureCount[f]) * diff * diff
+ else:
+ diffSum += diff * diff
+ #print diffSum,f1,f2
+ return diffSum
+
+class KNNUrban(KNN):
+ def __init__(self,dicts,metric=cosine,nrOfNeighbours = 40):
+ self.points = dicts.featureDict
+ self.metric = metric
+ self.nrOfNeighbours = nrOfNeighbours # Ignored at the moment
+
+ def predict(self,features,accessibles,dicts):
+ predictions = map(lambda x: self.metric(features,self.points[x]),accessibles)
+ pDict = dict(zip(accessibles,predictions))
+ for a,p in zip(accessibles,predictions):
+ aDeps = dicts.dependenciesDict[a]
+ for d in aDeps:
+ pDict[d] -= p
+ predictions = []
+ names = []
+ for n,p in pDict.items():
+ predictions.append(p)
+ names.append(n)
+ predictions = array(predictions)
+ perm = predictions.argsort()
+ return array(names)[perm],predictions[perm]
+
+
+
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py Thu Sep 12 09:59:45 2013 +0200
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python
# Title: HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
# Author: Daniel Kuehlwein, ICIS, Radboud University Nijmegen
# Copyright 2012
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Thu Sep 12 09:59:45 2013 +0200
@@ -1,18 +1,13 @@
# Title: HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
# Author: Daniel Kuehlwein, ICIS, Radboud University Nijmegen
-# Copyright 2012
+# Copyright 2012-2013
#
# Persistent dictionaries: accessibility, dependencies, and features.
-'''
-Created on Jul 12, 2012
-
-@author: daniel
-'''
-
+import logging,sys
from os.path import join
from Queue import Queue
-from readData import create_accessible_dict,create_dependencies_dict,create_feature_dict
+from readData import create_accessible_dict,create_dependencies_dict
from cPickle import load,dump
class Dictionaries(object):
@@ -32,20 +27,17 @@
self.dependenciesDict = {}
self.accessibleDict = {}
self.expandedAccessibles = {}
- # For SInE features
- self.useSine = False
- self.featureCountDict = {}
- self.triggerFeaturesDict = {}
- self.featureTriggeredFormulasDict = {}
+ self.accFile = ''
self.changed = True
"""
Init functions. nameIdDict, idNameDict, featureIdDict, articleDict get filled!
"""
- def init_featureDict(self,featureFile,sineFeatures):
- self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
- create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
- self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
+ def init_featureDict(self,featureFile):
+ self.create_feature_dict(featureFile)
+ #self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
+ # create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
+ # self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
def init_dependenciesDict(self,depFile):
self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
def init_accessibleDict(self,accFile):
@@ -54,16 +46,37 @@
def init_all(self,args):
self.featureFileName = 'mash_features'
self.accFileName = 'mash_accessibility'
- self.useSine = args.sineFeatures
featureFile = join(args.inputDir,self.featureFileName)
depFile = join(args.inputDir,args.depFile)
- accFile = join(args.inputDir,self.accFileName)
- self.init_featureDict(featureFile,self.useSine)
- self.init_accessibleDict(accFile)
+ self.accFile = join(args.inputDir,self.accFileName)
+ self.init_featureDict(featureFile)
+ self.init_accessibleDict(self.accFile)
self.init_dependenciesDict(depFile)
self.expandedAccessibles = {}
self.changed = True
+ def create_feature_dict(self,inputFile):
+ logger = logging.getLogger('create_feature_dict')
+ self.featureDict = {}
+ IS = open(inputFile,'r')
+ for line in IS:
+ line = line.split(':')
+ name = line[0]
+ # Name Id
+ if self.nameIdDict.has_key(name):
+ logger.warning('%s appears twice in the feature file. Aborting.',name)
+ sys.exit(-1)
+ else:
+ self.nameIdDict[name] = self.maxNameId
+ self.idNameDict[self.maxNameId] = name
+ nameId = self.maxNameId
+ self.maxNameId += 1
+ features = self.get_features(line)
+ # Store results
+ self.featureDict[nameId] = features
+ IS.close()
+ return
+
def get_name_id(self,name):
"""
Return the Id for a name.
@@ -82,27 +95,23 @@
def add_feature(self,featureName):
if not self.featureIdDict.has_key(featureName):
self.featureIdDict[featureName] = self.maxFeatureId
- if self.useSine:
- self.featureCountDict[self.maxFeatureId] = 0
self.maxFeatureId += 1
self.changed = True
fId = self.featureIdDict[featureName]
- if self.useSine:
- self.featureCountDict[fId] += 1
return fId
def get_features(self,line):
- # Feature Ids
featureNames = [f.strip() for f in line[1].split()]
- features = []
+ features = {}
for fn in featureNames:
tmp = fn.split('=')
- weight = 1.0
+ weight = 1.0
if len(tmp) == 2:
fn = tmp[0]
weight = float(tmp[1])
fId = self.add_feature(tmp[0])
- features.append((fId,weight))
+ features[fId] = weight
+ #features[fId] = 1.0 ###
return features
def expand_accessibles(self,acc):
@@ -142,16 +151,6 @@
self.accessibleDict[nameId] = unExpAcc
features = self.get_features(line)
self.featureDict[nameId] = features
- if self.useSine:
- # SInE Features
- minFeatureCount = min([self.featureCountDict[f] for f,_w in features])
- triggerFeatures = [f for f,_w in features if self.featureCountDict[f] == minFeatureCount]
- self.triggerFeaturesDict[nameId] = triggerFeatures
- for f in triggerFeatures:
- if self.featureTriggeredFormulasDict.has_key(f):
- self.featureTriggeredFormulasDict[f].append(nameId)
- else:
- self.featureTriggeredFormulasDict[f] = [nameId]
self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
self.changed = True
return nameId
@@ -219,14 +218,12 @@
if self.changed:
dictsStream = open(fileName, 'wb')
dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
- self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
- self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine),dictsStream)
+ self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict),dictsStream)
self.changed = False
dictsStream.close()
def load(self,fileName):
dictsStream = open(fileName, 'rb')
self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
- self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
- self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine = load(dictsStream)
+ self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
self.changed = False
dictsStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Thu Sep 12 09:59:45 2013 +0200
@@ -41,7 +41,7 @@
path = dirname(realpath(__file__))
spawnDaemon(os.path.join(path,'server.py'))
serverIsUp=False
- for _i in range(10):
+ for _i in range(20):
# Test if server is up
try:
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -118,11 +118,12 @@
OS.write('%s\n' % received)
OS.close()
IS.close()
+
+ # Statistics
+ if args.statistics:
+ received = communicate('avgStats',args.host,args.port)
+ logger.info(received)
- # Statistics
- if args.statistics:
- received = communicate('avgStats',args.host,args.port)
- logger.info(received)
if args.saveModels:
communicate('save',args.host,args.port)
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mashTest.py Thu Sep 12 00:34:48 2013 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-'''
-Created on Aug 20, 2013
-
-@author: daniel
-'''
-from mash import mash
-
-if __name__ == "__main__":
- args = ['--statistics', '--init', '--inputDir', '../data/20130118/Jinja', '--log', '../tmp/auth.log', '--modelFile', '../tmp/m0', '--dictsFile', '../tmp/d0','--NBDefaultPriorWeight', '20.0', '--NBDefVal', '-15.0', '--NBPosWeight', '10.0']
- mash(args)
- args = ['-i', '../data/20130118/Jinja/mash_commands', '-p', '../tmp/auth.pred0', '--statistics', '--cutOff', '500', '--log', '../tmp/auth.log','--modelFile', '../tmp/m0', '--dictsFile', '../tmp/d0']
- mash(args)
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py Thu Sep 12 09:59:45 2013 +0200
@@ -22,14 +22,13 @@
parser.add_argument('--depFile', default='mash_dependencies',
help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
- parser.add_argument('--algorithm',default='nb',help="Which learning algorithm is used. nb = Naive Bayes,predef=predefined. Default=nb.")
+ parser.add_argument('--algorithm',default='nb',help="Which learning algorithm is used. nb = Naive Bayes,KNN,predef=predefined. Default=nb.")
+ parser.add_argument('--predef',help="File containing the predefined suggestions. Only used when algorithm = predef.")
# NB Parameters
parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float)
parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float)
parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
- # TODO: Rename to sineFeatures
- parser.add_argument('--sineFeatures',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
- parser.add_argument('--sineWeight',default=0.5,help="How much the SInE prior is weighted. Default=0.5.",type=float)
+ parser.add_argument('--expandFeatures',default=False,action='store_true',help="Learning-based feature expansion. Default=False.")
parser.add_argument('--statistics',default=False,action='store_true',help="Create and show statistics for the top CUTOFF predictions.\
WARNING: This will make the program a lot slower! Default=False.")
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Thu Sep 12 09:59:45 2013 +0200
@@ -14,55 +14,6 @@
import sys,logging
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,\
- triggerFeaturesDict,featureTriggeredFormulasDict,sineFeatures,inputFile):
- logger = logging.getLogger('create_feature_dict')
- featureDict = {}
- IS = open(inputFile,'r')
- for line in IS:
- line = line.split(':')
- name = line[0]
- # Name Id
- if nameIdDict.has_key(name):
- logger.warning('%s appears twice in the feature file. Aborting.',name)
- sys.exit(-1)
- else:
- nameIdDict[name] = maxNameId
- idNameDict[maxNameId] = name
- nameId = maxNameId
- maxNameId += 1
- # Feature Ids
- featureNames = [f.strip() for f in line[1].split()]
- features = []
- minFeatureCount = 9999999
- for fn in featureNames:
- weight = 1.0
- tmp = fn.split('=')
- if len(tmp) == 2:
- fn = tmp[0]
- weight = float(tmp[1])
- if not featureIdDict.has_key(fn):
- featureIdDict[fn] = maxFeatureId
- featureCountDict[maxFeatureId] = 0
- maxFeatureId += 1
- fId = featureIdDict[fn]
- features.append((fId,weight))
- if sineFeatures:
- featureCountDict[fId] += 1
- minFeatureCount = min(minFeatureCount,featureCountDict[fId])
- # Store results
- featureDict[nameId] = features
- if sineFeatures:
- triggerFeatures = [f for f,_w in features if featureCountDict[f] == minFeatureCount]
- triggerFeaturesDict[nameId] = triggerFeatures
- for f in triggerFeatures:
- if featureTriggeredFormulasDict.has_key(f):
- featureTriggeredFormulasDict[f].append(nameId)
- else:
- featureTriggeredFormulasDict[f] = [nameId]
- IS.close()
- return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeaturesDict,featureTriggeredFormulasDict
-
def create_dependencies_dict(nameIdDict,inputFile):
logger = logging.getLogger('create_dependencies_dict')
dependenciesDict = {}
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/server.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/server.py Thu Sep 12 09:59:45 2013 +0200
@@ -7,10 +7,15 @@
import SocketServer,os,string,logging
from multiprocessing import Manager
+from threading import Timer
from time import time
from dictionaries import Dictionaries
from parameters import init_parser
from sparseNaiveBayes import sparseNBClassifier
+from KNN import KNN,euclidean
+from KNNs import KNNAdaptPointFeatures,KNNUrban
+from predefined import Predefined
+from ExpandFeatures import ExpandFeatures
from stats import Statistics
@@ -19,6 +24,21 @@
SocketServer.ThreadingTCPServer.__init__(self,*args, **kwargs)
self.manager = Manager()
self.lock = Manager().Lock()
+ self.idle_timeout = 28800.0 # 8 hours in seconds
+ self.idle_timer = Timer(self.idle_timeout, self.shutdown)
+ self.idle_timer.start()
+
+ def save(self):
+ # Save Models
+ self.model.save(self.args.modelFile)
+ self.dicts.save(self.args.dictsFile)
+ if not self.args.saveStats == None:
+ statsFile = os.path.join(self.args.outputDir,self.args.saveStats)
+ self.stats.save(statsFile)
+
+ def save_and_shutdown(self):
+ self.save()
+ self.shutdown()
class MaShHandler(SocketServer.BaseRequestHandler):
@@ -28,25 +48,32 @@
else:
argv = argv.split(';')
self.server.args = init_parser(argv)
- # Pick model
- if self.server.args.algorithm == 'nb':
- self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
- else: # Default case
- self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
# Load all data
- # TODO: rewrite dicts for concurrency and without sine
self.server.dicts = Dictionaries()
if os.path.isfile(self.server.args.dictsFile):
self.server.dicts.load(self.server.args.dictsFile)
elif self.server.args.init:
self.server.dicts.init_all(self.server.args)
+ # Pick model
+ if self.server.args.algorithm == 'nb':
+ self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
+ elif self.server.args.algorithm == 'KNN':
+ #self.server.model = KNN(self.server.dicts)
+ self.server.model = KNNAdaptPointFeatures(self.server.dicts)
+ elif self.server.args.algorithm == 'predef':
+ self.server.model = Predefined(self.server.args.predef)
+ else: # Default case
+ self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
+ if self.server.args.expandFeatures:
+ self.server.expandFeatures = ExpandFeatures(self.server.dicts)
+ self.server.expandFeatures.initialize(self.server.dicts)
# Create Model
if os.path.isfile(self.server.args.modelFile):
self.server.model.load(self.server.args.modelFile)
elif self.server.args.init:
trainData = self.server.dicts.featureDict.keys()
self.server.model.initializeModel(trainData,self.server.dicts)
-
+
if self.server.args.statistics:
self.server.stats = Statistics(self.server.args.cutOff)
self.server.statementCounter = 1
@@ -77,6 +104,8 @@
self.server.logger.debug('Poor predictions: %s',bp)
self.server.statementCounter += 1
+ if self.server.args.expandFeatures:
+ self.server.expandFeatures.update(self.server.dicts.featureDict[problemId],self.server.dicts.dependenciesDict[problemId])
# Update Dependencies, p proves p
self.server.dicts.dependenciesDict[problemId] = [problemId]+self.server.dicts.dependenciesDict[problemId]
self.server.model.update(problemId,self.server.dicts.featureDict[problemId],self.server.dicts.dependenciesDict[problemId])
@@ -92,22 +121,25 @@
self.server.computeStats = True
if self.server.args.algorithm == 'predef':
return
- name,features,accessibles,hints,numberOfPredictions = self.server.dicts.parse_problem(self.data)
+ name,features,accessibles,hints,numberOfPredictions = self.server.dicts.parse_problem(self.data)
if numberOfPredictions == None:
numberOfPredictions = self.server.args.numberOfPredictions
if not hints == []:
self.server.model.update('hints',features,hints)
-
+ if self.server.args.expandFeatures:
+ features = self.server.expandFeatures.expand(features)
# Create predictions
self.server.logger.debug('Starting computation for line %s',self.server.callCounter)
- predictionsFeatures = features
- self.server.predictions,predictionValues = self.server.model.predict(predictionsFeatures,accessibles,self.server.dicts)
+
+ self.server.predictions,predictionValues = self.server.model.predict(features,accessibles,self.server.dicts)
assert len(self.server.predictions) == len(predictionValues)
self.server.logger.debug('Time needed: '+str(round(time()-self.startTime,2)))
# Output
predictionNames = [str(self.server.dicts.idNameDict[p]) for p in self.server.predictions[:numberOfPredictions]]
- predictionValues = [str(x) for x in predictionValues[:numberOfPredictions]]
+ #predictionValues = [str(x) for x in predictionValues[:numberOfPredictions]]
+ #predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]
+ #predictionsString = string.join(predictionsStringList,' ')
predictionsString = string.join(predictionNames,' ')
outString = '%s: %s' % (name,predictionsString)
self.request.sendall(outString)
@@ -115,27 +147,18 @@
def shutdown(self,saveModels=True):
self.request.sendall('Shutting down server.')
if saveModels:
- self.save()
+ self.server.save()
self.server.shutdown()
- def save(self):
- # Save Models
- self.server.model.save(self.server.args.modelFile)
- self.server.dicts.save(self.server.args.dictsFile)
- if not self.server.args.saveStats == None:
- statsFile = os.path.join(self.server.args.outputDir,self.server.args.saveStats)
- self.server.stats.save(statsFile)
-
def handle(self):
# self.request is the TCP socket connected to the client
self.data = self.request.recv(4194304).strip()
self.server.lock.acquire()
- #print "{} wrote:".format(self.client_address[0])
self.startTime = time()
if self.data == 'shutdown':
self.shutdown()
elif self.data == 'save':
- self.save()
+ self.server.save()
elif self.data.startswith('i'):
self.init(self.data[2:])
elif self.data.startswith('!'):
@@ -153,15 +176,16 @@
else:
self.request.sendall('Unspecified input format: \n%s',self.data)
self.server.callCounter += 1
+ # Update idle shutdown timer
+ self.server.idle_timer.cancel()
+ self.server.idle_timer = Timer(self.server.idle_timeout, self.server.save_and_shutdown)
+ self.server.idle_timer.start()
self.server.lock.release()
if __name__ == "__main__":
HOST, PORT = "localhost", 9255
- #print 'Started Server'
- # Create the server, binding to localhost on port 9999
SocketServer.TCPServer.allow_reuse_address = True
server = ThreadingTCPServer((HOST, PORT), MaShHandler)
- #server = SocketServer.TCPServer((HOST, PORT), MaShHandler)
# Activate the server; this will keep running until you
# interrupt the program with Ctrl-C
@@ -171,4 +195,4 @@
-
+
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Thu Sep 12 09:59:45 2013 +0200
@@ -36,7 +36,7 @@
dFeatureCounts = {}
# Add p proves p with weight self.defaultPriorWeight
if not self.defaultPriorWeight == 0:
- for f,_w in dicts.featureDict[d]:
+ for f in dicts.featureDict[d].iterkeys():
dFeatureCounts[f] = self.defaultPriorWeight
self.counts[d] = [self.defaultPriorWeight,dFeatureCounts]
@@ -44,7 +44,7 @@
for dep in keyDeps:
self.counts[dep][0] += 1
depFeatures = dicts.featureDict[key]
- for f,_w in depFeatures:
+ for f in depFeatures.iterkeys():
if self.counts[dep][1].has_key(f):
self.counts[dep][1][f] += 1
else:
@@ -59,12 +59,12 @@
dFeatureCounts = {}
# Give p |- p a higher weight
if not self.defaultPriorWeight == 0:
- for f,_w in features:
+ for f in features.iterkeys():
dFeatureCounts[f] = self.defaultPriorWeight
self.counts[dataPoint] = [self.defaultPriorWeight,dFeatureCounts]
for dep in dependencies:
self.counts[dep][0] += 1
- for f,_w in features:
+ for f in features.iterkeys():
if self.counts[dep][1].has_key(f):
self.counts[dep][1][f] += 1
else:
@@ -97,12 +97,14 @@
"""
tau = 0.05 # Jasmin, change value here
predictions = []
+ #observedFeatures = [f for f,_w in features]
+ observedFeatures = features.keys()
for a in accessibles:
posA = self.counts[a][0]
fA = set(self.counts[a][1].keys())
fWeightsA = self.counts[a][1]
resultA = log(posA)
- for f,w in features:
+ for f,w in features.iteritems():
# DEBUG
#w = 1.0
if f in fA:
@@ -114,9 +116,10 @@
else:
resultA += w*self.defVal
if not tau == 0.0:
- observedFeatures = [f for f,_w in features]
missingFeatures = list(fA.difference(observedFeatures))
- sumOfWeights = sum([log(float(fWeightsA[x])/posA) for x in missingFeatures])
+ #sumOfWeights = sum([log(float(fWeightsA[x])/posA) for x in missingFeatures]) # slower
+ sumOfWeights = sum([log(float(fWeightsA[x])) for x in missingFeatures]) - log(posA) * len(missingFeatures) #DEFAULT
+ #sumOfWeights = sum([log(float(fWeightsA[x])/self.totalFeatureCounts[x]) for x in missingFeatures]) - log(posA) * len(missingFeatures)
resultA -= tau * sumOfWeights
predictions.append(resultA)
predictions = array(predictions)
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Thu Sep 12 09:59:45 2013 +0200
@@ -97,7 +97,7 @@
badPreds.append(dep)
recall100 = len(predictions)+1
positives+=1
- self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\
+ self.logger.debug('Dependencies missing for %s in cutoff predictions! Estimating Statistics.',\
string.join([str(dep) for dep in missing],','))
if positives == 0 or negatives == 0:
@@ -113,7 +113,7 @@
self.badPreds = badPreds
self.avgAvailable += available
self.avgDepNr += depNr
- self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
+ self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff: %s',\
statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff)
def printAvg(self):
@@ -135,7 +135,7 @@
else:
medianrecall100 = float(sorted(self.recall100Median)[nrDataPoints/2] + sorted(self.recall100Median)[nrDataPoints/2 + 1])/2
- returnString = 'avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff:%s' %\
+ returnString = 'avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff: %s' %\
(round(100*self.avgAUC/self.problems,2),round(100*medianAUC,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),round(medianrecall100,2),self.cutOff)
self.logger.info(returnString)
return returnString
@@ -143,44 +143,6 @@
"""
self.logger.info('avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff:%s', \
round(100*self.avgAUC/self.problems,2),round(100*medianAUC,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),round(medianrecall100,2),self.cutOff)
-
- #try:
- #if True:
- if False:
- from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
- avgRecall = [float(x)/self.problems for x in self.recallData]
- figure('Recall')
- plot(range(self.cutOff),avgRecall)
- ylabel('Average Recall')
- xlabel('Highest ranked premises')
- axis([0,self.cutOff,0.0,1.0])
- figure('100%Recall')
- plot(range(self.cutOff),self.recall100Data)
- ylabel('100%Recall')
- xlabel('Highest ranked premises')
- axis([0,self.cutOff,0,self.problems])
- figure('AUC Histogram')
- hist(self.aucData,bins=100)
- ylabel('Problems')
- xlabel('AUC')
- maxCount = max(self.premiseOccurenceCounter.values())
- minCount = min(self.premiseOccurenceCounter.values())
- figure('Dependency Occurances')
- hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left')
- #ylabel('Occurences')
- xlabel('Number of Times a Dependency Occurs')
- figure('Dependency Appearance in Problems after Introduction.')
- hist(self.depAppearances,bins=50)
- figure('Dependency Appearance in Problems after Introduction in Percent.')
- xAxis = range(max(self.depAppearances)+1)
- yAxis = [0] * (max(self.depAppearances)+1)
- for val in self.depAppearances:
- yAxis[val] += 1
- yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis]
- plot(xAxis,yAxis)
- show()
- #except:
- # self.logger.warning('Matplotlib module missing. Skipping graphs.')
"""
def save(self,fileName):