# HG changeset patch # User blanchet # Date 1358005779 -3600 # Node ID a5cc092156dad910653ce2c29607a44d4ee74b35 # Parent 9cc70b273e90babce50aa18ad180c33854e5915c new version of MaSh Python component diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Sat Jan 12 16:49:39 2013 +0100 @@ -31,16 +31,18 @@ self.featureDict = {} self.dependenciesDict = {} self.accessibleDict = {} + self.expandedAccessibles = {} + # For SInE-like prior self.featureCountDict = {} - self.expandedAccessibles = {} + self.triggerFeatures = {} self.changed = True """ Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled! """ def init_featureDict(self,featureFile): - self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\ - self.maxFeatureId,self.featureCountDict,featureFile) + self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict,self.triggerFeatures = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\ + self.maxFeatureId,self.featureCountDict,self.triggerFeatures,featureFile) def init_dependenciesDict(self,depFile): self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile) def init_accessibleDict(self,accFile): @@ -131,8 +133,11 @@ # Accessible Ids unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()] self.accessibleDict[nameId] = unExpAcc - self.featureDict[nameId] = self.get_features(line) - self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()] + features = self.get_features(line) + self.featureDict[nameId] = features + minVal = min([self.featureCountDict[f] for f,_w in features]) + self.triggerFeatures[nameId] = [f for f,_w in features if self.featureCountDict[f] == minVal] + self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()] self.changed = True return nameId @@ -192,12 +197,12 @@ if self.changed: dictsStream = open(fileName, 'wb') dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\ - self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict),dictsStream) + self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures),dictsStream) self.changed = False dictsStream.close() def load(self,fileName): dictsStream = open(fileName, 'rb') self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\ - self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict = load(dictsStream) + self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures = load(dictsStream) self.changed = False dictsStream.close() diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/MaSh/src/mash.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Sat Jan 12 16:49:39 2013 +0100 @@ -47,12 +47,21 @@ parser.add_argument('--depFile', default='mash_dependencies', help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies') parser.add_argument('--saveModel',default=False,action='store_true',help="Stores the learned Model at the end of a prediction run. Default=False.") + parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.") -#DEBUG: Change sineprioir default to false -parser.add_argument('--sinePrior',default=True,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.") - +# Theory Parameters +parser.add_argument('--theoryDefValPos',default=-7.5,help="Default value for positive unknown features. Default=-7.5.",type=float) +parser.add_argument('--theoryDefValNeg',default=-15.0,help="Default value for negative unknown features. Default=-15.0.",type=float) +parser.add_argument('--theoryPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float) parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.") +# NB Parameters +parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float) +parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float) +parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float) +parser.add_argument('--NBSinePrior',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.") +parser.add_argument('--NBSineWeight',default=100.0,help="How much the SInE prior is weighted. Default=100.0.",type=float) + parser.add_argument('--snow',default=False,action='store_true',help="Use SNoW's naive bayes instead of Naive Bayes for learning.") parser.add_argument('--predef',default=False,action='store_true',\ help="Use predefined predictions. Used only for comparison with the actual learning. Expects mash_mepo_suggestions in inputDir.") @@ -89,7 +98,7 @@ # Pick algorithm if args.nb: logger.info('Using sparse Naive Bayes for learning.') - model = sparseNBClassifier(args.sinePrior) + model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight) modelFile = os.path.join(args.outputDir,'NB.pickle') elif args.snow: logger.info('Using naive bayes (SNoW) for learning.') @@ -103,7 +112,7 @@ modelFile = os.path.join(args.outputDir,'mepo.pickle') else: logger.info('No algorithm specified. Using sparse Naive Bayes.') - model = sparseNBClassifier(args.sinePrior) + model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight) modelFile = os.path.join(args.outputDir,'NB.pickle') dictsFile = os.path.join(args.outputDir,'dicts.pickle') theoryFile = os.path.join(args.outputDir,'theory.pickle') @@ -123,7 +132,7 @@ if args.learnTheories: depFile = os.path.join(args.inputDir,args.depFile) - theoryModels = TheoryModels() + theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight) theoryModels.init(depFile,dicts) theoryModels.save(theoryFile) @@ -138,7 +147,7 @@ statementCounter = 1 computeStats = False dicts = Dictionaries() - theoryModels = TheoryModels() + theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight) # Load Files if os.path.isfile(dictsFile): dicts.load(dictsFile) diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/MaSh/src/readData.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Sat Jan 12 16:49:39 2013 +0100 @@ -14,7 +14,7 @@ import sys,logging -def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,inputFile): +def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,triggerFeatures,inputFile): logger = logging.getLogger('create_feature_dict') featureDict = {} IS = open(inputFile,'r') @@ -32,7 +32,8 @@ maxNameId += 1 # Feature Ids featureNames = [f.strip() for f in line[1].split()] - features = [] + features = [] + minFeatureCount = 0 for fn in featureNames: weight = 1.0 tmp = fn.split('=') @@ -46,10 +47,12 @@ fId = featureIdDict[fn] features.append((fId,weight)) featureCountDict[fId] += 1 + minFeatureCount = min(triggerFeatures,featureCountDict[fId]) # Store results featureDict[nameId] = features + triggerFeatures[nameId] = [f for f,_w in features if featureCountDict[f] == minFeatureCount] IS.close() - return featureDict,maxNameId,maxFeatureId,featureCountDict + return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeatures def create_dependencies_dict(nameIdDict,inputFile): logger = logging.getLogger('create_dependencies_dict') diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py Sat Jan 12 16:49:39 2013 +0100 @@ -19,13 +19,16 @@ An updateable naive Bayes classifier. ''' - def __init__(self): + def __init__(self,defValPos = -7.5,defValNeg = -15.0,posWeight = 10.0): ''' Constructor ''' self.neg = 0.0 self.pos = 0.0 self.counts = {} # Counts is the tuple poscounts,negcounts + self.defValPos = defValPos + self.defValNeg = defValNeg + self.posWeight = posWeight def update(self,features,label): """ @@ -85,10 +88,6 @@ return 1 elif self.pos ==0: return 0 - defValPos = -7.5 - defValNeg = -15.0 - posWeight = 10.0 - logneg = log(self.neg) logpos = log(self.pos) prob = logpos - logneg @@ -97,13 +96,13 @@ if self.counts.has_key(f): posCount,negCount = self.counts[f] if posCount > 0: - prob += (log(posWeight * posCount) - logpos) + prob += (log(self.posWeight * posCount) - logpos) else: - prob += defValPos + prob += self.defValPos if negCount > 0: prob -= (log(negCount) - logneg) else: - prob -= defValNeg + prob -= self.defValNeg if prob >= 0 : return 1 else: diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Sat Jan 12 16:49:39 2013 +0100 @@ -11,7 +11,7 @@ ''' from cPickle import dump,load -from numpy import array,exp +from numpy import array from math import log class sparseNBClassifier(object): @@ -19,28 +19,28 @@ An updateable naive Bayes classifier. ''' - def __init__(self,useSinePrior = False): + def __init__(self,defaultPriorWeight = 20.0,posWeight = 20.0,defVal = -15.0,useSinePrior = False,sineWeight = 100.0): ''' Constructor ''' self.counts = {} self.sinePrior = useSinePrior - self.defaultPriorWeight = 20 - self.posWeight = 20 - self.defVal = 15 + self.sineWeight = sineWeight + self.defaultPriorWeight = defaultPriorWeight + self.posWeight = posWeight + self.defVal = defVal def initializeModel(self,trainData,dicts): """ Build basic model from training data. """ - for d in trainData: - dPosCount = 0 + for d in trainData: dFeatureCounts = {} - # DEBUG: give p |- p a higher weight - dPosCount = self.defaultPriorWeight - for f,_w in dicts.featureDict[d]: - dFeatureCounts[f] = self.defaultPriorWeight - self.counts[d] = [dPosCount,dFeatureCounts] + # Give p |- p a higher weight + if not self.defaultPriorWeight == 0: + for f,_w in dicts.featureDict[d]: + dFeatureCounts[f] = self.defaultPriorWeight + self.counts[d] = [self.defaultPriorWeight,dFeatureCounts] for key in dicts.dependenciesDict.keys(): # Add p proves p @@ -60,13 +60,12 @@ Updates the Model. """ if not self.counts.has_key(dataPoint): - dPosCount = 0 dFeatureCounts = {} - # DEBUG: give p |- p a higher weight - dPosCount = self.defaultPriorWeight - for f,_w in features: - dFeatureCounts[f] = self.defaultPriorWeight - self.counts[dataPoint] = [dPosCount,dFeatureCounts] + # Give p |- p a higher weight + if not self.defaultPriorWeight == 0: + for f,_w in features: + dFeatureCounts[f] = self.defaultPriorWeight + self.counts[dataPoint] = [self.defaultPriorWeight,dFeatureCounts] for dep in dependencies: self.counts[dep][0] += 1 for f,_w in features: @@ -101,37 +100,43 @@ Returns a ranking of the accessibles. """ predictions = [] + fSet = set([f for f,_w in features]) for a in accessibles: posA = self.counts[a][0] fA = set(self.counts[a][1].keys()) fWeightsA = self.counts[a][1] - resultA = log(posA) + prior = posA + if self.sinePrior: + triggerFeatures = dicts.triggerFeatures[a] + triggeredFeatures = fSet.intersection(triggerFeatures) + for f in triggeredFeatures: + posW = dicts.featureCountDict[f] + prior += self.sineWeight / posW + resultA = log(prior) for f,w in features: # DEBUG #w = 1 if f in fA: if fWeightsA[f] == 0: - resultA -= w*self.defVal + resultA += w*self.defVal else: assert fWeightsA[f] <= posA resultA += w*log(float(self.posWeight*fWeightsA[f])/posA) else: - resultA -= w*self.defVal + resultA += w*self.defVal predictions.append(resultA) - #expPredictions = array([exp(x) for x in predictions]) predictions = array(predictions) perm = (-predictions).argsort() - #return array(accessibles)[perm],expPredictions[perm] return array(accessibles)[perm],predictions[perm] def save(self,fileName): OStream = open(fileName, 'wb') - dump(self.counts,OStream) + dump((self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight),OStream) OStream.close() def load(self,fileName): OStream = open(fileName, 'rb') - self.counts = load(OStream) + self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight = load(OStream) OStream.close() diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py --- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Sat Jan 12 16:49:39 2013 +0100 @@ -20,11 +20,15 @@ ''' - def __init__(self): + def __init__(self,defValPos = -7.5,defValNeg = -15.0,posWeight = 10.0): ''' Constructor ''' self.theoryModels = {} + # Model Params + self.defValPos = defValPos + self.defValNeg = defValNeg + self.posWeight = posWeight self.theoryDict = {} self.accessibleTheories = set([]) self.currentTheory = None @@ -49,7 +53,7 @@ self.accessibleTheories.add(self.currentTheory) self.currentTheory = theory self.theoryDict[theory] = set([nameId]) - theoryModel = singleNBClassifier() + theoryModel = singleNBClassifier(self.defValPos,self.defValNeg,self.posWeight) self.theoryModels[theory] = theoryModel else: self.theoryDict[theory] = self.theoryDict[theory].union([nameId]) @@ -94,6 +98,7 @@ self.theoryModels[a].delete(features,a in usedTheories) def update(self,problemId,features,dependencies,dicts): + # TODO: Implicit assumption that self.accessibleTheories contains all accessible theories! currentTheory = (dicts.idNameDict[problemId]).split('.')[0] # Create new theory model, if there is a new theory if not self.theoryDict.has_key(currentTheory): @@ -101,7 +106,7 @@ if not currentTheory == None: self.theoryDict[currentTheory] = [] self.currentTheory = currentTheory - theoryModel = singleNBClassifier() + theoryModel = singleNBClassifier(self.defValPos,self.defValNeg,self.posWeight) self.theoryModels[currentTheory] = theoryModel self.accessibleTheories.add(self.currentTheory) self.update_with_acc(problemId,features,dependencies,dicts,self.accessibleTheories) @@ -118,12 +123,7 @@ """ Predicts the relevant theories. Returns the predicted theories and a list of all accessible premises in these theories. """ - # TODO: This can be made a lot faster! - self.accessibleTheories = [] - for x in accessibles: - xArt = (dicts.idNameDict[x]).split('.')[0] - self.accessibleTheories.append(xArt) - self.accessibleTheories = set(self.accessibleTheories) + self.accessibleTheories = set([(dicts.idNameDict[x]).split('.')[0] for x in accessibles]) # Predict Theories predictedTheories = [self.currentTheory] @@ -143,9 +143,9 @@ def save(self,fileName): outStream = open(fileName, 'wb') - dump((self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict),outStream) + dump((self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict,self.defValPos,self.defValNeg,self.posWeight),outStream) outStream.close() def load(self,fileName): inStream = open(fileName, 'rb') - self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict = load(inStream) + self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict,self.defValPos,self.defValNeg,self.posWeight = load(inStream) inStream.close() \ No newline at end of file diff -r 9cc70b273e90 -r a5cc092156da src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML --- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Fri Jan 11 22:38:12 2013 +0100 +++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Sat Jan 12 16:49:39 2013 +0100 @@ -313,7 +313,7 @@ local -val version = "*** MaSh version 20130111a ***" +val version = "*** MaSh version 20130112a ***" exception Too_New of unit