# HG changeset patch
# User blanchet
# Date 1378972785 -7200
# Node ID 12251bc889f129c366d322751bccce170cc3f5ef
# Parent 78fe0002024d33fb79e9f7c2a6a8dc5860521d7d
new version of MaSh

diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/ExpandFeatures.py Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,162 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from math import log
+from gensim import corpora, models, similarities
+
+class ExpandFeatures(object):
+
+    def __init__(self,dicts):
+        self.dicts = dicts
+        self.featureMap = {}
+        self.alpha = 0.1
+        self.featureCounts = {}
+        self.counter = 0
+        self.corpus = []
+        self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+
+    def initialize(self,dicts):
+        self.dicts = dicts
+        IS = open(dicts.accFile,'r')
+        for line in IS:
+            line = line.split(':')
+            name = line[0]
+            #print 'name',name
+            nameId = dicts.nameIdDict[name]
+            features = dicts.featureDict[nameId]
+            dependencies = dicts.dependenciesDict[nameId]
+            x = [self.dicts.idNameDict[d] for d in dependencies]
+            #print x
+            self.update(features, dependencies)
+            self.corpus.append([(x,1) for x in features.keys()])
+        IS.close()
+        print 'x'
+        #self.LSIModel = models.lsimodel.LsiModel(self.corpus,num_topics=500)
+        print self.LSIModel
+        print 'y'
+
+    def update(self,features,dependencies):
+        self.counter += 1
+        self.corpus.append([(x,1) for x in features.keys()])
+        self.LSIModel.add_documents([[(x,1) for x in features.keys()]])
+        """
+        for f in features.iterkeys():
+            try:
+                self.featureCounts[f] += 1
+            except:
+                self.featureCounts[f] = 1
+            if self.featureCounts[f] > 100:
+                continue
+            try:
+                self.featureMap[f] = self.featureMap[f].intersection(features.keys())
+            except:
+                self.featureMap[f] = set(features.keys())
+            #print 'fOld',len(fMap),self.featureCounts[f],len(dependencies)
+
+        for d in dependencies[1:]:
+            #print 'dep',self.dicts.idNameDict[d]
+            dFeatures = self.dicts.featureDict[d]
+            for df in dFeatures.iterkeys():
+                if self.featureCounts.has_key(df):
+                    if self.featureCounts[df] > 20:
+                        continue
+                else:
+                    print df
+                try:
+                    fMap[df] += self.alpha * (1.0 - fMap[df])
+                except:
+                    fMap[df] = self.alpha
+        """
+        #print 'fNew',len(fMap)
+
+    def expand(self,features):
+        #print self.corpus[:50]
+        #print corpus
+        #tfidfmodel = models.TfidfModel(self.corpus, normalize=True)
+        #print features.keys()
+        #tfidfcorpus = [tfidfmodel[x] for x in self.corpus]
+        #newFeatures = LSI[[(x,1) for x in features.keys()]]
+        newFeatures = self.LSIModel[[(x,1) for x in features.keys()]]
+        print features
+        print newFeatures
+        #print newFeatures
+
+        """
+        newFeatures = dict(features)
+        for f in features.keys():
+            try:
+                fC = self.featureCounts[f]
+            except:
+                fC = 0.5
+            newFeatures[f] = log(float(8+self.counter) / fC)
+        #nrOfFeatures = float(len(features))
+        addedCount = 0
+        alpha = 0.2
+        #"""
+
+        """
+        consideredFeatures = []
+        while len(newFeatures) < 30:
+            #alpha = alpha * 0.5
+            minF = None
+            minFrequence = 1000000
+            for f in newFeatures.iterkeys():
+                if f in consideredFeatures:
+                    continue
+                try:
+                    if self.featureCounts[f] < minFrequence:
+                        minF = f
+                except:
+                    pass
+            if minF == None:
+                break
+            # Expand minimal feature
+            consideredFeatures.append(minF)
+            for expF in self.featureMap[minF]:
+                if not newFeatures.has_key(expF):
+                    fC = self.featureCounts[minF]
+                    newFeatures[expF] = alpha*log(float(8+self.counter) / fC)
+        #print features, newFeatures
+        #"""
+        """
+        for f in features.iterkeys():
+            try:
+                self.featureCounts[f] += 1
+            except:
+                self.featureCounts[f] = 0
+            if self.featureCounts[f] > 10:
+                continue
+            addedCount += 1
+            try:
+                fmap = self.featureMap[f]
+            except:
+                self.featureMap[f] = {}
+                fmap = {}
+            for nf,nv in fmap.iteritems():
+                try:
+                    newFeatures[nf] += nv
+                except:
+                    newFeatures[nf] = nv
+        if addedCount > 0:
+            for f,w in newFeatures.iteritems():
+                newFeatures[f] = float(w)/addedCount
+        #"""
+        """
+        deleteF = []
+        for f,w in newFeatures.iteritems():
+            if w < 0.1:
+                deleteF.append(f)
+        for f in deleteF:
+            del newFeatures[f]
+        """
+        #print 'fold',len(features)
+        #print 'fnew',len(newFeatures)
+        return dict(newFeatures)
+
+if __name__ == "__main__":
+    pass
+
+
\ No newline at end of file
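The ExpandFeatures class above builds a gensim LSI model over a bag-of-features corpus (one document per known fact) and projects query features into the resulting 500-topic space. A minimal, self-contained sketch of the same gensim calls on toy data (feature ids and weights are made up; this is not code from the patch):

# Illustrative sketch of the gensim LSI usage in ExpandFeatures
# (toy corpus with hypothetical feature ids; not part of the patch).
from gensim import models

# One bag-of-features document per known fact: (featureId, weight) pairs.
corpus = [[(0, 1.0), (1, 1.0)],
          [(1, 1.0), (2, 1.0)],
          [(0, 1.0), (2, 1.0), (3, 1.0)]]
lsi = models.lsimodel.LsiModel(corpus, num_topics=2)

# Incremental training on a new fact, as in ExpandFeatures.update:
lsi.add_documents([[(2, 1.0), (3, 1.0)]])

# Projecting a query's features into the topic space, as in expand():
print(lsi[[(0, 1.0), (3, 1.0)]])  # list of (topic, weight) pairs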
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/KNN.py
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/KNN.py Thu Sep 12 09:59:45 2013 +0200
@@ -0,0 +1,99 @@
+'''
+Created on Aug 21, 2013
+
+@author: daniel
+'''
+
+from cPickle import dump,load
+from numpy import array
+from math import sqrt,log
+
+def cosine(f1,f2):
+    f1Norm = 0.0
+    for f in f1.keys():
+        f1Norm += f1[f] * f1[f]
+    #assert f1Norm = sum(map(lambda x,y: x*y,f1.itervalues(),f1.itervalues()))
+    f1Norm = sqrt(f1Norm)
+
+    f2Norm = 0.0
+    for f in f2.keys():
+        f2Norm += f2[f] * f2[f]
+    f2Norm = sqrt(f2Norm)
+
+    dotProduct = 0.0
+    featureIntersection = set(f1.keys()) & set(f2.keys())
+    for f in featureIntersection:
+        dotProduct += f1[f] * f2[f]
+    cosine = dotProduct / (f1Norm * f2Norm)
+    return 1.0 - cosine
+
+def euclidean(f1,f2):
+    diffSum = 0.0
+    featureUnion = set(f1.keys()) | set(f2.keys())
+    for f in featureUnion:
+        try:
+            f1Val = f1[f]
+        except:
+            f1Val = 0.0
+        try:
+            f2Val = f2[f]
+        except:
+            f2Val = 0.0
+        diff = f1Val - f2Val
+        diffSum += diff * diff
+        #if f in f1.keys():
+        #    diffSum += log(2+self.pointCount/self.featureCounts[f]) * diff * diff
+        #else:
+        #    diffSum += diff * diff
+    #print diffSum,f1,f2
+    return diffSum
+
+class KNN(object):
+    '''
+    A basic KNN ranker.
+    '''
+
+    def __init__(self,dicts,metric=cosine):
+        '''
+        Constructor
+        '''
+        self.points = dicts.featureDict
+        self.metric = metric
+
+    def initializeModel(self,_trainData,_dicts):
+        """
+        Build basic model from training data.
+        """
+        pass
+
+    def update(self,dataPoint,features,dependencies):
+        assert self.points[dataPoint] == features
+
+    def overwrite(self,problemId,newDependencies,dicts):
+        # Taken care of by dicts
+        pass
+
+    def delete(self,dataPoint,features,dependencies):
+        # Taken care of by dicts
+        pass
+
+    def predict(self,features,accessibles,dicts):
+        predictions = map(lambda x: self.metric(features,self.points[x]),accessibles)
+        predictions = array(predictions)
+        perm = predictions.argsort()
+        return array(accessibles)[perm],predictions[perm]
+
+    def save(self,fileName):
+        OStream = open(fileName, 'wb')
+        dump((self.points,self.metric),OStream)
+        OStream.close()
+
+    def load(self,fileName):
+        OStream = open(fileName, 'rb')
+        self.points,self.metric = load(OStream)
+        OStream.close()
+
+if __name__ == '__main__':
+    pass
+
+
\ No newline at end of file
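KNN.predict ranks every accessible fact by its distance to the query's sparse feature vector; cosine returns 1 - cos(f1, f2), so 0.0 means identical direction and 1.0 means no shared features, and argsort therefore lists the most similar facts first. A small worked check of that metric (toy dicts, not from the patch):

# Toy check of the cosine distance used by the KNN ranker.
from math import sqrt

def cosine(f1, f2):
    # 1 - (f1 . f2) / (|f1| |f2|): ~0.0 for identical vectors, 1.0 for disjoint ones.
    norm1 = sqrt(sum(v * v for v in f1.values()))
    norm2 = sqrt(sum(v * v for v in f2.values()))
    dot = sum(f1[f] * f2[f] for f in set(f1) & set(f2))
    return 1.0 - dot / (norm1 * norm2)

print(cosine({0: 1.0, 1: 2.0}, {0: 1.0, 1: 2.0}))  # ~0.0 (identical)
print(cosine({0: 1.0}, {1: 1.0}))                  # 1.0 (no shared features)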
+ """ + pass + + def update(self,dataPoint,features,dependencies): + assert self.points[dataPoint] == features + + def overwrite(self,problemId,newDependencies,dicts): + # Taken care of by dicts + pass + + def delete(self,dataPoint,features,dependencies): + # Taken care of by dicts + pass + + def predict(self,features,accessibles,dicts): + predictions = map(lambda x: self.metric(features,self.points[x]),accessibles) + predictions = array(predictions) + perm = predictions.argsort() + return array(accessibles)[perm],predictions[perm] + + def save(self,fileName): + OStream = open(fileName, 'wb') + dump((self.points,self.metric),OStream) + OStream.close() + + def load(self,fileName): + OStream = open(fileName, 'rb') + self.points,self.metric = load(OStream) + OStream.close() + +if __name__ == '__main__': + pass + + \ No newline at end of file diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/KNNs.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/KNNs.py Thu Sep 12 09:59:45 2013 +0200 @@ -0,0 +1,105 @@ +''' +Created on Aug 21, 2013 + +@author: daniel +''' + +from math import log +from KNN import KNN,cosine +from numpy import array + +class KNNAdaptPointFeatures(KNN): + + def __init__(self,dicts,metric=cosine,alpha = 0.05): + self.points = dicts.featureDict + self.metric = self.euclidean + self.alpha = alpha + self.count = 0 + self.featureCount = {} + + def initializeModel(self,trainData,dicts): + """ + Build basic model from training data. + """ + IS = open(dicts.accFile,'r') + for line in IS: + line = line.split(':') + name = line[0] + nameId = dicts.nameIdDict[name] + features = dicts.featureDict[nameId] + dependencies = dicts.dependenciesDict[nameId] + self.update(nameId, features, dependencies) + IS.close() + + def update(self,dataPoint,features,dependencies): + self.count += 1 + for f in features.iterkeys(): + try: + self.featureCount[f] += 1 + except: + self.featureCount[f] = 1 + for d in dependencies: + dFeatures = self.points[d] + featureUnion = set(dFeatures.keys()) | set(features.keys()) + for f in featureUnion: + try: + pVal = features[f] + except: + pVal = 0.0 + try: + dVal = dFeatures[f] + except: + dVal = 0.0 + newDVal = dVal + self.alpha * (pVal - dVal) + dFeatures[f] = newDVal + + def euclidean(self,f1,f2): + diffSum = 0.0 + f1Set = set(f1.keys()) + featureUnion = f1Set | set(f2.keys()) + for f in featureUnion: + if not self.featureCount.has_key(f): + continue + if self.featureCount[f] == 1: + continue + try: + f1Val = f1[f] + except: + f1Val = 0.0 + try: + f2Val = f2[f] + except: + f2Val = 0.0 + diff = f1Val - f2Val + diffSum += diff * diff + if f in f1Set: + diffSum += log(2+self.count/self.featureCount[f]) * diff * diff + else: + diffSum += diff * diff + #print diffSum,f1,f2 + return diffSum + +class KNNUrban(KNN): + def __init__(self,dicts,metric=cosine,nrOfNeighbours = 40): + self.points = dicts.featureDict + self.metric = metric + self.nrOfNeighbours = nrOfNeighbours # Ignored at the moment + + def predict(self,features,accessibles,dicts): + predictions = map(lambda x: self.metric(features,self.points[x]),accessibles) + pDict = dict(zip(accessibles,predictions)) + for a,p in zip(accessibles,predictions): + aDeps = dicts.dependenciesDict[a] + for d in aDeps: + pDict[d] -= p + predictions = [] + names = [] + for n,p in pDict.items(): + predictions.append(p) + names.append(n) + predictions = array(predictions) + perm = predictions.argsort() + return array(names)[perm],predictions[perm] + + + \ No newline at end of 
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py Thu Sep 12 09:59:45 2013 +0200
@@ -1,4 +1,4 @@
-#!/usr/bin/env python
+#!/usr/bin/python
 # Title:     HOL/Tools/Sledgehammer/MaSh/src/compareStats.py
 # Author:    Daniel Kuehlwein, ICIS, Radboud University Nijmegen
 # Copyright  2012
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Thu Sep 12 09:59:45 2013 +0200
@@ -1,18 +1,13 @@
 # Title:     HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
 # Author:    Daniel Kuehlwein, ICIS, Radboud University Nijmegen
-# Copyright  2012
+# Copyright  2012-2013
 #
 # Persistent dictionaries: accessibility, dependencies, and features.
 
-'''
-Created on Jul 12, 2012
-
-@author: daniel
-'''
-
+import logging,sys
 from os.path import join
 from Queue import Queue
-from readData import create_accessible_dict,create_dependencies_dict,create_feature_dict
+from readData import create_accessible_dict,create_dependencies_dict
 from cPickle import load,dump
 
 class Dictionaries(object):
@@ -32,20 +27,17 @@
         self.dependenciesDict = {}
         self.accessibleDict = {}
         self.expandedAccessibles = {}
-        # For SInE features
-        self.useSine = False
-        self.featureCountDict = {}
-        self.triggerFeaturesDict = {}
-        self.featureTriggeredFormulasDict = {}
+        self.accFile = ''
        self.changed = True

    """
    Init functions. nameIdDict, idNameDict, featureIdDict, articleDict get filled!
    """
-    def init_featureDict(self,featureFile,sineFeatures):
-        self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
-        create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
-                            self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
+    def init_featureDict(self,featureFile):
+        self.create_feature_dict(featureFile)
+        #self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
+        #    create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
+        #                        self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
    def init_dependenciesDict(self,depFile):
        self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
    def init_accessibleDict(self,accFile):
@@ -54,16 +46,37 @@
    def init_all(self,args):
        self.featureFileName = 'mash_features'
        self.accFileName = 'mash_accessibility'
-        self.useSine = args.sineFeatures
        featureFile = join(args.inputDir,self.featureFileName)
        depFile = join(args.inputDir,args.depFile)
-        accFile = join(args.inputDir,self.accFileName)
-        self.init_featureDict(featureFile,self.useSine)
-        self.init_accessibleDict(accFile)
+        self.accFile = join(args.inputDir,self.accFileName)
+        self.init_featureDict(featureFile)
+        self.init_accessibleDict(self.accFile)
        self.init_dependenciesDict(depFile)
        self.expandedAccessibles = {}
        self.changed = True

+    def create_feature_dict(self,inputFile):
+        logger = logging.getLogger('create_feature_dict')
+        self.featureDict = {}
+        IS = open(inputFile,'r')
+        for line in IS:
+            line = line.split(':')
+            name = line[0]
+            # Name Id
+            if self.nameIdDict.has_key(name):
+                logger.warning('%s appears twice in the feature file. Aborting.',name)
+                sys.exit(-1)
+            else:
+                self.nameIdDict[name] = self.maxNameId
+                self.idNameDict[self.maxNameId] = name
+                nameId = self.maxNameId
+                self.maxNameId += 1
+            features = self.get_features(line)
+            # Store results
+            self.featureDict[nameId] = features
+        IS.close()
+        return
+
    def get_name_id(self,name):
        """
        Return the Id for a name.
@@ -82,27 +95,23 @@
    def add_feature(self,featureName):
        if not self.featureIdDict.has_key(featureName):
            self.featureIdDict[featureName] = self.maxFeatureId
-            if self.useSine:
-                self.featureCountDict[self.maxFeatureId] = 0
            self.maxFeatureId += 1
            self.changed = True
        fId = self.featureIdDict[featureName]
-        if self.useSine:
-            self.featureCountDict[fId] += 1
        return fId

    def get_features(self,line):
-        # Feature Ids
        featureNames = [f.strip() for f in line[1].split()]
-        features = []
+        features = {}
        for fn in featureNames:
            tmp = fn.split('=')
-            weight = 1.0
+            weight = 1.0
            if len(tmp) == 2:
                fn = tmp[0]
                weight = float(tmp[1])
            fId = self.add_feature(tmp[0])
-            features.append((fId,weight))
+            features[fId] = weight
+            #features[fId] = 1.0 ###
        return features

    def expand_accessibles(self,acc):
@@ -142,16 +151,6 @@
            self.accessibleDict[nameId] = unExpAcc
            features = self.get_features(line)
            self.featureDict[nameId] = features
-            if self.useSine:
-                # SInE Features
-                minFeatureCount = min([self.featureCountDict[f] for f,_w in features])
-                triggerFeatures = [f for f,_w in features if self.featureCountDict[f] == minFeatureCount]
-                self.triggerFeaturesDict[nameId] = triggerFeatures
-                for f in triggerFeatures:
-                    if self.featureTriggeredFormulasDict.has_key(f):
-                        self.featureTriggeredFormulasDict[f].append(nameId)
-                    else:
-                        self.featureTriggeredFormulasDict[f] = [nameId]
            self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
            self.changed = True
            return nameId
@@ -219,14 +218,12 @@
        if self.changed:
            dictsStream = open(fileName, 'wb')
            dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
-                self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine),dictsStream)
+                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict),dictsStream)
            self.changed = False
            dictsStream.close()

    def load(self,fileName):
        dictsStream = open(fileName, 'rb')
        self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
-            self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
-            self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine = load(dictsStream)
+            self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
        self.changed = False
        dictsStream.close()
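The central change to dictionaries.py is that get_features now returns a dict mapping feature id to weight instead of a list of (id, weight) pairs; input lines have the form name:f1=w1 f2 ..., with a missing =w defaulting to 1.0. A standalone sketch of that parsing (the add_feature helper here is a simplified stand-in for the class method, not code from the patch):

# Sketch of the new dict-valued feature parsing in Dictionaries.get_features.
feature_ids = {}  # featureName -> id, as maintained by add_feature

def add_feature(name):
    # Intern a feature name, handing out consecutive ids.
    return feature_ids.setdefault(name, len(feature_ids))

def get_features(line):
    # line is 'name:f1=w1 f2 ...' already split on ':'
    features = {}
    for fn in line[1].split():
        tmp = fn.split('=')
        weight = float(tmp[1]) if len(tmp) == 2 else 1.0
        features[add_feature(tmp[0])] = weight
    return features

print(get_features(['lemma_foo', 'constNat constSuc=2.5']))  # {0: 1.0, 1: 2.5}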
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Thu Sep 12 09:59:45 2013 +0200
@@ -41,7 +41,7 @@
     path = dirname(realpath(__file__))
     spawnDaemon(os.path.join(path,'server.py'))
     serverIsUp=False
-    for _i in range(10):
+    for _i in range(20):
         # Test if server is up
         try:
             sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
@@ -118,11 +118,12 @@
             OS.write('%s\n' % received)
         OS.close()
         IS.close()
+
+        # Statistics
+        if args.statistics:
+            received = communicate('avgStats',args.host,args.port)
+            logger.info(received)
 
-    # Statistics
-    if args.statistics:
-        received = communicate('avgStats',args.host,args.port)
-        logger.info(received)
     if args.saveModels:
         communicate('save',args.host,args.port)
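mash.py now makes twice as many connection attempts while waiting for the freshly spawned server daemon. The connect-with-retries pattern, in isolation (the patch only changes the loop bound from 10 to 20; the delay value below is an assumption, not taken from the source):

# Sketch of the "poll until the spawned server answers" pattern.
import socket, time

def wait_for_server(host='localhost', port=9255, retries=20, delay=0.5):
    for _i in range(retries):
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
            sock.connect((host, port))
            sock.close()
            return True        # server answered
        except socket.error:
            time.sleep(delay)  # not up yet, try again
    return False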
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/mashTest.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mashTest.py Thu Sep 12 00:34:48 2013 +0200
+++ /dev/null Thu Jan 01 00:00:00 1970 +0000
@@ -1,12 +0,0 @@
-'''
-Created on Aug 20, 2013
-
-@author: daniel
-'''
-from mash import mash
-
-if __name__ == "__main__":
-    args = ['--statistics', '--init', '--inputDir', '../data/20130118/Jinja', '--log', '../tmp/auth.log', '--modelFile', '../tmp/m0', '--dictsFile', '../tmp/d0','--NBDefaultPriorWeight', '20.0', '--NBDefVal', '-15.0', '--NBPosWeight', '10.0']
-    mash(args)
-    args = ['-i', '../data/20130118/Jinja/mash_commands', '-p', '../tmp/auth.pred0', '--statistics', '--cutOff', '500', '--log', '../tmp/auth.log','--modelFile', '../tmp/m0', '--dictsFile', '../tmp/d0']
-    mash(args)
\ No newline at end of file
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/parameters.py Thu Sep 12 09:59:45 2013 +0200
@@ -22,14 +22,13 @@
     parser.add_argument('--depFile', default='mash_dependencies',
                         help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
-    parser.add_argument('--algorithm',default='nb',help="Which learning algorithm is used. nb = Naive Bayes,predef=predefined. Default=nb.")
+    parser.add_argument('--algorithm',default='nb',help="Which learning algorithm is used. nb = Naive Bayes,KNN,predef=predefined. Default=nb.")
+    parser.add_argument('--predef',help="File containing the predefined suggestions. Only used when algorithm = predef.")
     # NB Parameters
     parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float)
     parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float)
     parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
-    # TODO: Rename to sineFeatures
-    parser.add_argument('--sineFeatures',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
-    parser.add_argument('--sineWeight',default=0.5,help="How much the SInE prior is weighted. Default=0.5.",type=float)
+    parser.add_argument('--expandFeatures',default=False,action='store_true',help="Learning-based feature expansion. Default=False.")
     parser.add_argument('--statistics',default=False,action='store_true',help="Create and show statistics for the top CUTOFF predictions.\
                         WARNING: This will make the program a lot slower! Default=False.")
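With the new options, a predefined-suggestions run with feature expansion would be configured roughly as follows. This is a sketch only: the flag names come from the argparse definitions above, the file paths are placeholders, and it assumes all remaining options have usable defaults:

# Hypothetical invocation of the new flags (placeholder paths).
from parameters import init_parser

args = init_parser(['--algorithm', 'predef',
                    '--predef', '../data/mash_suggestions',  # only read when algorithm = predef
                    '--expandFeatures',                      # enable LSI-based feature expansion
                    '--inputDir', '../data/20130118/Jinja'])
# args.algorithm == 'predef', args.predef == '../data/mash_suggestions', args.expandFeatures == True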
Default=False.") diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/readData.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Thu Sep 12 00:34:48 2013 +0200 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Thu Sep 12 09:59:45 2013 +0200 @@ -14,55 +14,6 @@ import sys,logging -def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,\ - triggerFeaturesDict,featureTriggeredFormulasDict,sineFeatures,inputFile): - logger = logging.getLogger('create_feature_dict') - featureDict = {} - IS = open(inputFile,'r') - for line in IS: - line = line.split(':') - name = line[0] - # Name Id - if nameIdDict.has_key(name): - logger.warning('%s appears twice in the feature file. Aborting.',name) - sys.exit(-1) - else: - nameIdDict[name] = maxNameId - idNameDict[maxNameId] = name - nameId = maxNameId - maxNameId += 1 - # Feature Ids - featureNames = [f.strip() for f in line[1].split()] - features = [] - minFeatureCount = 9999999 - for fn in featureNames: - weight = 1.0 - tmp = fn.split('=') - if len(tmp) == 2: - fn = tmp[0] - weight = float(tmp[1]) - if not featureIdDict.has_key(fn): - featureIdDict[fn] = maxFeatureId - featureCountDict[maxFeatureId] = 0 - maxFeatureId += 1 - fId = featureIdDict[fn] - features.append((fId,weight)) - if sineFeatures: - featureCountDict[fId] += 1 - minFeatureCount = min(minFeatureCount,featureCountDict[fId]) - # Store results - featureDict[nameId] = features - if sineFeatures: - triggerFeatures = [f for f,_w in features if featureCountDict[f] == minFeatureCount] - triggerFeaturesDict[nameId] = triggerFeatures - for f in triggerFeatures: - if featureTriggeredFormulasDict.has_key(f): - featureTriggeredFormulasDict[f].append(nameId) - else: - featureTriggeredFormulasDict[f] = [nameId] - IS.close() - return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeaturesDict,featureTriggeredFormulasDict - def create_dependencies_dict(nameIdDict,inputFile): logger = logging.getLogger('create_dependencies_dict') dependenciesDict = {} diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/server.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/server.py Thu Sep 12 00:34:48 2013 +0200 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/server.py Thu Sep 12 09:59:45 2013 +0200 @@ -7,10 +7,15 @@ import SocketServer,os,string,logging from multiprocessing import Manager +from threading import Timer from time import time from dictionaries import Dictionaries from parameters import init_parser from sparseNaiveBayes import sparseNBClassifier +from KNN import KNN,euclidean +from KNNs import KNNAdaptPointFeatures,KNNUrban +from predefined import Predefined +from ExpandFeatures import ExpandFeatures from stats import Statistics @@ -19,6 +24,21 @@ SocketServer.ThreadingTCPServer.__init__(self,*args, **kwargs) self.manager = Manager() self.lock = Manager().Lock() + self.idle_timeout = 28800.0 # 8 hours in seconds + self.idle_timer = Timer(self.idle_timeout, self.shutdown) + self.idle_timer.start() + + def save(self): + # Save Models + self.model.save(self.args.modelFile) + self.dicts.save(self.args.dictsFile) + if not self.args.saveStats == None: + statsFile = os.path.join(self.args.outputDir,self.args.saveStats) + self.stats.save(statsFile) + + def save_and_shutdown(self): + self.save() + self.shutdown() class MaShHandler(SocketServer.BaseRequestHandler): @@ -28,25 +48,32 @@ else: argv = argv.split(';') self.server.args = init_parser(argv) - # Pick model - if self.server.args.algorithm == 'nb': - self.server.model 
@@ -28,25 +48,32 @@
         else:
             argv = argv.split(';')
         self.server.args = init_parser(argv)
-        # Pick model
-        if self.server.args.algorithm == 'nb':
-            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
-        else: # Default case
-            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
         # Load all data
-        # TODO: rewrite dicts for concurrency and without sine
         self.server.dicts = Dictionaries()
         if os.path.isfile(self.server.args.dictsFile):
             self.server.dicts.load(self.server.args.dictsFile)
         elif self.server.args.init:
             self.server.dicts.init_all(self.server.args)
+        # Pick model
+        if self.server.args.algorithm == 'nb':
+            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
+        elif self.server.args.algorithm == 'KNN':
+            #self.server.model = KNN(self.server.dicts)
+            self.server.model = KNNAdaptPointFeatures(self.server.dicts)
+        elif self.server.args.algorithm == 'predef':
+            self.server.model = Predefined(self.server.args.predef)
+        else: # Default case
+            self.server.model = sparseNBClassifier(self.server.args.NBDefaultPriorWeight,self.server.args.NBPosWeight,self.server.args.NBDefVal)
+        if self.server.args.expandFeatures:
+            self.server.expandFeatures = ExpandFeatures(self.server.dicts)
+            self.server.expandFeatures.initialize(self.server.dicts)
         # Create Model
         if os.path.isfile(self.server.args.modelFile):
             self.server.model.load(self.server.args.modelFile)
         elif self.server.args.init:
             trainData = self.server.dicts.featureDict.keys()
             self.server.model.initializeModel(trainData,self.server.dicts)
-        
+
         if self.server.args.statistics:
             self.server.stats = Statistics(self.server.args.cutOff)
             self.server.statementCounter = 1
@@ -77,6 +104,8 @@
             self.server.logger.debug('Poor predictions: %s',bp)
         self.server.statementCounter += 1
 
+        if self.server.args.expandFeatures:
+            self.server.expandFeatures.update(self.server.dicts.featureDict[problemId],self.server.dicts.dependenciesDict[problemId])
         # Update Dependencies, p proves p
         self.server.dicts.dependenciesDict[problemId] = [problemId]+self.server.dicts.dependenciesDict[problemId]
         self.server.model.update(problemId,self.server.dicts.featureDict[problemId],self.server.dicts.dependenciesDict[problemId])
@@ -92,22 +121,25 @@
             self.server.computeStats = True
         if self.server.args.algorithm == 'predef':
             return
-        name,features,accessibles,hints,numberOfPredictions = self.server.dicts.parse_problem(self.data)    
+        name,features,accessibles,hints,numberOfPredictions = self.server.dicts.parse_problem(self.data)
         if numberOfPredictions == None:
             numberOfPredictions = self.server.args.numberOfPredictions
         if not hints == []:
             self.server.model.update('hints',features,hints)
-        
+        if self.server.args.expandFeatures:
+            features = self.server.expandFeatures.expand(features)
         # Create predictions
         self.server.logger.debug('Starting computation for line %s',self.server.callCounter)
-        predictionsFeatures = features
-        self.server.predictions,predictionValues = self.server.model.predict(predictionsFeatures,accessibles,self.server.dicts)
+
+        self.server.predictions,predictionValues = self.server.model.predict(features,accessibles,self.server.dicts)
         assert len(self.server.predictions) == len(predictionValues)
         self.server.logger.debug('Time needed: '+str(round(time()-self.startTime,2)))
         # Output
         predictionNames = [str(self.server.dicts.idNameDict[p]) for p in self.server.predictions[:numberOfPredictions]]
-        predictionValues = [str(x) for x in predictionValues[:numberOfPredictions]]
+        #predictionValues = [str(x) for x in predictionValues[:numberOfPredictions]]
+        #predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]
+        #predictionsString = string.join(predictionsStringList,' ')
         predictionsString = string.join(predictionNames,' ')
         outString = '%s: %s' % (name,predictionsString)
         self.request.sendall(outString)
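The init hunk above dispatches on --algorithm between naive Bayes, the adaptive KNN, and predefined suggestions, falling back to naive Bayes for unknown values. The same dispatch written as a table, purely to illustrate the structure (not code from the patch):

# The if/elif chain above, rewritten as a dispatch table for illustration.
from sparseNaiveBayes import sparseNBClassifier
from KNNs import KNNAdaptPointFeatures
from predefined import Predefined

def make_model(args, dicts):
    builders = {
        'nb':     lambda: sparseNBClassifier(args.NBDefaultPriorWeight,
                                             args.NBPosWeight, args.NBDefVal),
        'KNN':    lambda: KNNAdaptPointFeatures(dicts),
        'predef': lambda: Predefined(args.predef),
    }
    # Unknown values fall back to naive Bayes, as in the patch.
    return builders.get(args.algorithm, builders['nb'])()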
@@ -115,27 +147,18 @@
     def shutdown(self,saveModels=True):
         self.request.sendall('Shutting down server.')
         if saveModels:
-            self.save()
+            self.server.save()
         self.server.shutdown()
 
-    def save(self):
-        # Save Models
-        self.server.model.save(self.server.args.modelFile)
-        self.server.dicts.save(self.server.args.dictsFile)
-        if not self.server.args.saveStats == None:
-            statsFile = os.path.join(self.server.args.outputDir,self.server.args.saveStats)
-            self.server.stats.save(statsFile)
-
     def handle(self):
         # self.request is the TCP socket connected to the client
         self.data = self.request.recv(4194304).strip()
         self.server.lock.acquire()
-        #print "{} wrote:".format(self.client_address[0])
         self.startTime = time()
         if self.data == 'shutdown':
             self.shutdown()
         elif self.data == 'save':
-            self.save()
+            self.server.save()
         elif self.data.startswith('i'):
             self.init(self.data[2:])
         elif self.data.startswith('!'):
@@ -153,15 +176,16 @@
         else:
             self.request.sendall('Unspecified input format: \n%s',self.data)
         self.server.callCounter += 1
+        # Update idle shutdown timer
+        self.server.idle_timer.cancel()
+        self.server.idle_timer = Timer(self.server.idle_timeout, self.server.save_and_shutdown)
+        self.server.idle_timer.start()
         self.server.lock.release()
 
 if __name__ == "__main__":
     HOST, PORT = "localhost", 9255
-    #print 'Started Server'
-    # Create the server, binding to localhost on port 9999
     SocketServer.TCPServer.allow_reuse_address = True
     server = ThreadingTCPServer((HOST, PORT), MaShHandler)
-    #server = SocketServer.TCPServer((HOST, PORT), MaShHandler)
 
     # Activate the server; this will keep running until you
     # interrupt the program with Ctrl-C
@@ -171,4 +195,4 @@
 
 
 
-
+
\ No newline at end of file
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Thu Sep 12 09:59:45 2013 +0200
@@ -36,7 +36,7 @@
             dFeatureCounts = {}
             # Add p proves p with weight self.defaultPriorWeight
             if not self.defaultPriorWeight == 0:
-                for f,_w in dicts.featureDict[d]:
+                for f in dicts.featureDict[d].iterkeys():
                     dFeatureCounts[f] = self.defaultPriorWeight
             self.counts[d] = [self.defaultPriorWeight,dFeatureCounts]
 
@@ -44,7 +44,7 @@
             for dep in keyDeps:
                 self.counts[dep][0] += 1
                 depFeatures = dicts.featureDict[key]
-                for f,_w in depFeatures:
+                for f in depFeatures.iterkeys():
                     if self.counts[dep][1].has_key(f):
                         self.counts[dep][1][f] += 1
                     else:
@@ -59,12 +59,12 @@
         dFeatureCounts = {}
         # Give p |- p a higher weight
         if not self.defaultPriorWeight == 0:
-            for f,_w in features:
+            for f in features.iterkeys():
                 dFeatureCounts[f] = self.defaultPriorWeight
         self.counts[dataPoint] = [self.defaultPriorWeight,dFeatureCounts]
         for dep in dependencies:
             self.counts[dep][0] += 1
-            for f,_w in features:
+            for f in features.iterkeys():
                 if self.counts[dep][1].has_key(f):
                     self.counts[dep][1][f] += 1
                 else:
@@ -97,12 +97,14 @@
         """
         tau = 0.05 # Jasmin, change value here
         predictions = []
+        #observedFeatures = [f for f,_w in features]
+        observedFeatures = features.keys()
         for a in accessibles:
             posA = self.counts[a][0]
             fA = set(self.counts[a][1].keys())
             fWeightsA = self.counts[a][1]
             resultA = log(posA)
-            for f,w in features:
+            for f,w in features.iteritems():
                 # DEBUG
                 #w = 1.0
                 if f in fA:
@@ -114,9 +116,10 @@
                 else:
                     resultA += w*self.defVal
             if not tau == 0.0:
-                observedFeatures = [f for f,_w in features]
                 missingFeatures = list(fA.difference(observedFeatures))
-                sumOfWeights = sum([log(float(fWeightsA[x])/posA) for x in missingFeatures])
+                #sumOfWeights = sum([log(float(fWeightsA[x])/posA) for x in missingFeatures]) # slower
+                sumOfWeights = sum([log(float(fWeightsA[x])) for x in missingFeatures]) - log(posA) * len(missingFeatures) #DEFAULT
+                #sumOfWeights = sum([log(float(fWeightsA[x])/self.totalFeatureCounts[x]) for x in missingFeatures]) - log(posA) * len(missingFeatures)
                 resultA -= tau * sumOfWeights
             predictions.append(resultA)
         predictions = array(predictions)
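The rewritten naive Bayes predict computes the missing-feature penalty as sum(log w) - n * log(posA) instead of summing log(w/posA) term by term; the two are algebraically identical (log(w/p) = log w - log p), but the new form performs one log and one multiplication for the divisor instead of a division and a log per missing feature. A quick equivalence check on toy weights:

# Check that the two formulations of the missing-feature penalty agree:
# sum(log(w/posA)) == sum(log(w)) - len(ws) * log(posA).
from math import log

posA = 7.0
ws = [1.0, 2.0, 5.0]  # fWeightsA[x] for the missing features (toy values)

slower = sum(log(w / posA) for w in ws)
faster = sum(log(w) for w in ws) - log(posA) * len(ws)
assert abs(slower - faster) < 1e-12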
diff -r 78fe0002024d -r 12251bc889f1 src/HOL/Tools/Sledgehammer/MaSh/src/stats.py
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Thu Sep 12 00:34:48 2013 +0200
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Thu Sep 12 09:59:45 2013 +0200
@@ -97,7 +97,7 @@
                     badPreds.append(dep)
                     recall100 = len(predictions)+1
                     positives+=1
-            self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\
+            self.logger.debug('Dependencies missing for %s in cutoff predictions! Estimating Statistics.',\
                               string.join([str(dep) for dep in missing],','))
 
         if positives == 0 or negatives == 0:
@@ -113,7 +113,7 @@
         self.badPreds = badPreds
         self.avgAvailable += available
         self.avgDepNr += depNr
-        self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
+        self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff: %s',\
                          statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff)
 
     def printAvg(self):
@@ -135,7 +135,7 @@
         else:
             medianrecall100 = float(sorted(self.recall100Median)[nrDataPoints/2] + sorted(self.recall100Median)[nrDataPoints/2 + 1])/2
 
-        returnString = 'avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff:%s' %\
+        returnString = 'avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff: %s' %\
                        (round(100*self.avgAUC/self.problems,2),round(100*medianAUC,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),round(medianrecall100,2),self.cutOff)
         self.logger.info(returnString)
         return returnString
@@ -143,44 +143,6 @@
     """
         self.logger.info('avgAUC: %s \t medianAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t medianRecall100: %s \t cutOff:%s', \
                          round(100*self.avgAUC/self.problems,2),round(100*medianAUC,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),round(medianrecall100,2),self.cutOff)
-
-        #try:
-        #if True:
-        if False:
-            from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
-            avgRecall = [float(x)/self.problems for x in self.recallData]
-            figure('Recall')
-            plot(range(self.cutOff),avgRecall)
-            ylabel('Average Recall')
-            xlabel('Highest ranked premises')
-            axis([0,self.cutOff,0.0,1.0])
-            figure('100%Recall')
-            plot(range(self.cutOff),self.recall100Data)
-            ylabel('100%Recall')
-            xlabel('Highest ranked premises')
-            axis([0,self.cutOff,0,self.problems])
-            figure('AUC Histogram')
-            hist(self.aucData,bins=100)
-            ylabel('Problems')
-            xlabel('AUC')
-            maxCount = max(self.premiseOccurenceCounter.values())
-            minCount = min(self.premiseOccurenceCounter.values())
-            figure('Dependency Occurances')
-            hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left')
-            #ylabel('Occurences')
-            xlabel('Number of Times a Dependency Occurs')
-            figure('Dependency Appearance in Problems after Introduction.')
-            hist(self.depAppearances,bins=50)
-            figure('Dependency Appearance in Problems after Introduction in Percent.')
-            xAxis = range(max(self.depAppearances)+1)
-            yAxis = [0] * (max(self.depAppearances)+1)
-            for val in self.depAppearances:
-                yAxis[val] += 1
-            yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis]
-            plot(xAxis,yAxis)
-            show()
-        #except:
-        #    self.logger.warning('Matplotlib module missing. Skipping graphs.')
     """
 
     def save(self,fileName):