--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Sat Jan 12 16:49:39 2013 +0100
@@ -31,16 +31,18 @@
self.featureDict = {}
self.dependenciesDict = {}
self.accessibleDict = {}
+ self.expandedAccessibles = {}
+ # For SInE-like prior
self.featureCountDict = {}
- self.expandedAccessibles = {}
+ self.triggerFeatures = {}
self.changed = True
"""
Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled!
"""
def init_featureDict(self,featureFile):
- self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
- self.maxFeatureId,self.featureCountDict,featureFile)
+ self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict,self.triggerFeatures = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
+ self.maxFeatureId,self.featureCountDict,self.triggerFeatures,featureFile)
def init_dependenciesDict(self,depFile):
self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
def init_accessibleDict(self,accFile):
@@ -131,8 +133,11 @@
# Accessible Ids
unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()]
self.accessibleDict[nameId] = unExpAcc
- self.featureDict[nameId] = self.get_features(line)
- self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
+ features = self.get_features(line)
+ self.featureDict[nameId] = features
+ minVal = min([self.featureCountDict[f] for f,_w in features])
+ self.triggerFeatures[nameId] = [f for f,_w in features if self.featureCountDict[f] == minVal]
+ self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
self.changed = True
return nameId
@@ -192,12 +197,12 @@
if self.changed:
dictsStream = open(fileName, 'wb')
dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
- self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict),dictsStream)
+ self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures),dictsStream)
self.changed = False
dictsStream.close()
def load(self,fileName):
dictsStream = open(fileName, 'rb')
self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
- self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict = load(dictsStream)
+ self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures = load(dictsStream)
self.changed = False
dictsStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Sat Jan 12 16:49:39 2013 +0100
@@ -47,12 +47,21 @@
parser.add_argument('--depFile', default='mash_dependencies',
help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
parser.add_argument('--saveModel',default=False,action='store_true',help="Stores the learned Model at the end of a prediction run. Default=False.")
+
parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.")
-#DEBUG: Change sineprioir default to false
-parser.add_argument('--sinePrior',default=True,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
-
+# Theory Parameters
+parser.add_argument('--theoryDefValPos',default=-7.5,help="Default value for positive unknown features. Default=-7.5.",type=float)
+parser.add_argument('--theoryDefValNeg',default=-15.0,help="Default value for negative unknown features. Default=-15.0.",type=float)
+parser.add_argument('--theoryPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.")
+# NB Parameters
+parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float)
+parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float)
+parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
+parser.add_argument('--NBSinePrior',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
+parser.add_argument('--NBSineWeight',default=100.0,help="How much the SInE prior is weighted. Default=100.0.",type=float)
+
parser.add_argument('--snow',default=False,action='store_true',help="Use SNoW's naive bayes instead of Naive Bayes for learning.")
parser.add_argument('--predef',default=False,action='store_true',\
help="Use predefined predictions. Used only for comparison with the actual learning. Expects mash_mepo_suggestions in inputDir.")
@@ -89,7 +98,7 @@
# Pick algorithm
if args.nb:
logger.info('Using sparse Naive Bayes for learning.')
- model = sparseNBClassifier(args.sinePrior)
+ model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight)
modelFile = os.path.join(args.outputDir,'NB.pickle')
elif args.snow:
logger.info('Using naive bayes (SNoW) for learning.')
@@ -103,7 +112,7 @@
modelFile = os.path.join(args.outputDir,'mepo.pickle')
else:
logger.info('No algorithm specified. Using sparse Naive Bayes.')
- model = sparseNBClassifier(args.sinePrior)
+ model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight)
modelFile = os.path.join(args.outputDir,'NB.pickle')
dictsFile = os.path.join(args.outputDir,'dicts.pickle')
theoryFile = os.path.join(args.outputDir,'theory.pickle')
@@ -123,7 +132,7 @@
if args.learnTheories:
depFile = os.path.join(args.inputDir,args.depFile)
- theoryModels = TheoryModels()
+ theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight)
theoryModels.init(depFile,dicts)
theoryModels.save(theoryFile)
@@ -138,7 +147,7 @@
statementCounter = 1
computeStats = False
dicts = Dictionaries()
- theoryModels = TheoryModels()
+ theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight)
# Load Files
if os.path.isfile(dictsFile):
dicts.load(dictsFile)
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Sat Jan 12 16:49:39 2013 +0100
@@ -14,7 +14,7 @@
import sys,logging
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,inputFile):
+def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,triggerFeatures,inputFile):
logger = logging.getLogger('create_feature_dict')
featureDict = {}
IS = open(inputFile,'r')
@@ -32,7 +32,8 @@
maxNameId += 1
# Feature Ids
featureNames = [f.strip() for f in line[1].split()]
- features = []
+ features = []
+ minFeatureCount = 0
for fn in featureNames:
weight = 1.0
tmp = fn.split('=')
@@ -46,10 +47,12 @@
fId = featureIdDict[fn]
features.append((fId,weight))
featureCountDict[fId] += 1
+ minFeatureCount = min(triggerFeatures,featureCountDict[fId])
# Store results
featureDict[nameId] = features
+ triggerFeatures[nameId] = [f for f,_w in features if featureCountDict[f] == minFeatureCount]
IS.close()
- return featureDict,maxNameId,maxFeatureId,featureCountDict
+ return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeatures
def create_dependencies_dict(nameIdDict,inputFile):
logger = logging.getLogger('create_dependencies_dict')
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py Sat Jan 12 16:49:39 2013 +0100
@@ -19,13 +19,16 @@
An updateable naive Bayes classifier.
'''
- def __init__(self):
+ def __init__(self,defValPos = -7.5,defValNeg = -15.0,posWeight = 10.0):
'''
Constructor
'''
self.neg = 0.0
self.pos = 0.0
self.counts = {} # Counts is the tuple poscounts,negcounts
+ self.defValPos = defValPos
+ self.defValNeg = defValNeg
+ self.posWeight = posWeight
def update(self,features,label):
"""
@@ -85,10 +88,6 @@
return 1
elif self.pos ==0:
return 0
- defValPos = -7.5
- defValNeg = -15.0
- posWeight = 10.0
-
logneg = log(self.neg)
logpos = log(self.pos)
prob = logpos - logneg
@@ -97,13 +96,13 @@
if self.counts.has_key(f):
posCount,negCount = self.counts[f]
if posCount > 0:
- prob += (log(posWeight * posCount) - logpos)
+ prob += (log(self.posWeight * posCount) - logpos)
else:
- prob += defValPos
+ prob += self.defValPos
if negCount > 0:
prob -= (log(negCount) - logneg)
else:
- prob -= defValNeg
+ prob -= self.defValNeg
if prob >= 0 :
return 1
else:
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Sat Jan 12 16:49:39 2013 +0100
@@ -11,7 +11,7 @@
'''
from cPickle import dump,load
-from numpy import array,exp
+from numpy import array
from math import log
class sparseNBClassifier(object):
@@ -19,28 +19,28 @@
An updateable naive Bayes classifier.
'''
- def __init__(self,useSinePrior = False):
+ def __init__(self,defaultPriorWeight = 20.0,posWeight = 20.0,defVal = -15.0,useSinePrior = False,sineWeight = 100.0):
'''
Constructor
'''
self.counts = {}
self.sinePrior = useSinePrior
- self.defaultPriorWeight = 20
- self.posWeight = 20
- self.defVal = 15
+ self.sineWeight = sineWeight
+ self.defaultPriorWeight = defaultPriorWeight
+ self.posWeight = posWeight
+ self.defVal = defVal
def initializeModel(self,trainData,dicts):
"""
Build basic model from training data.
"""
- for d in trainData:
- dPosCount = 0
+ for d in trainData:
dFeatureCounts = {}
- # DEBUG: give p |- p a higher weight
- dPosCount = self.defaultPriorWeight
- for f,_w in dicts.featureDict[d]:
- dFeatureCounts[f] = self.defaultPriorWeight
- self.counts[d] = [dPosCount,dFeatureCounts]
+ # Give p |- p a higher weight
+ if not self.defaultPriorWeight == 0:
+ for f,_w in dicts.featureDict[d]:
+ dFeatureCounts[f] = self.defaultPriorWeight
+ self.counts[d] = [self.defaultPriorWeight,dFeatureCounts]
for key in dicts.dependenciesDict.keys():
# Add p proves p
@@ -60,13 +60,12 @@
Updates the Model.
"""
if not self.counts.has_key(dataPoint):
- dPosCount = 0
dFeatureCounts = {}
- # DEBUG: give p |- p a higher weight
- dPosCount = self.defaultPriorWeight
- for f,_w in features:
- dFeatureCounts[f] = self.defaultPriorWeight
- self.counts[dataPoint] = [dPosCount,dFeatureCounts]
+ # Give p |- p a higher weight
+ if not self.defaultPriorWeight == 0:
+ for f,_w in features:
+ dFeatureCounts[f] = self.defaultPriorWeight
+ self.counts[dataPoint] = [self.defaultPriorWeight,dFeatureCounts]
for dep in dependencies:
self.counts[dep][0] += 1
for f,_w in features:
@@ -101,37 +100,43 @@
Returns a ranking of the accessibles.
"""
predictions = []
+ fSet = set([f for f,_w in features])
for a in accessibles:
posA = self.counts[a][0]
fA = set(self.counts[a][1].keys())
fWeightsA = self.counts[a][1]
- resultA = log(posA)
+ prior = posA
+ if self.sinePrior:
+ triggerFeatures = dicts.triggerFeatures[a]
+ triggeredFeatures = fSet.intersection(triggerFeatures)
+ for f in triggeredFeatures:
+ posW = dicts.featureCountDict[f]
+ prior += self.sineWeight / posW
+ resultA = log(prior)
for f,w in features:
# DEBUG
#w = 1
if f in fA:
if fWeightsA[f] == 0:
- resultA -= w*self.defVal
+ resultA += w*self.defVal
else:
assert fWeightsA[f] <= posA
resultA += w*log(float(self.posWeight*fWeightsA[f])/posA)
else:
- resultA -= w*self.defVal
+ resultA += w*self.defVal
predictions.append(resultA)
- #expPredictions = array([exp(x) for x in predictions])
predictions = array(predictions)
perm = (-predictions).argsort()
- #return array(accessibles)[perm],expPredictions[perm]
return array(accessibles)[perm],predictions[perm]
def save(self,fileName):
OStream = open(fileName, 'wb')
- dump(self.counts,OStream)
+ dump((self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight),OStream)
OStream.close()
def load(self,fileName):
OStream = open(fileName, 'rb')
- self.counts = load(OStream)
+ self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight = load(OStream)
OStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Sat Jan 12 16:49:39 2013 +0100
@@ -20,11 +20,15 @@
'''
- def __init__(self):
+ def __init__(self,defValPos = -7.5,defValNeg = -15.0,posWeight = 10.0):
'''
Constructor
'''
self.theoryModels = {}
+ # Model Params
+ self.defValPos = defValPos
+ self.defValNeg = defValNeg
+ self.posWeight = posWeight
self.theoryDict = {}
self.accessibleTheories = set([])
self.currentTheory = None
@@ -49,7 +53,7 @@
self.accessibleTheories.add(self.currentTheory)
self.currentTheory = theory
self.theoryDict[theory] = set([nameId])
- theoryModel = singleNBClassifier()
+ theoryModel = singleNBClassifier(self.defValPos,self.defValNeg,self.posWeight)
self.theoryModels[theory] = theoryModel
else:
self.theoryDict[theory] = self.theoryDict[theory].union([nameId])
@@ -94,6 +98,7 @@
self.theoryModels[a].delete(features,a in usedTheories)
def update(self,problemId,features,dependencies,dicts):
+ # TODO: Implicit assumption that self.accessibleTheories contains all accessible theories!
currentTheory = (dicts.idNameDict[problemId]).split('.')[0]
# Create new theory model, if there is a new theory
if not self.theoryDict.has_key(currentTheory):
@@ -101,7 +106,7 @@
if not currentTheory == None:
self.theoryDict[currentTheory] = []
self.currentTheory = currentTheory
- theoryModel = singleNBClassifier()
+ theoryModel = singleNBClassifier(self.defValPos,self.defValNeg,self.posWeight)
self.theoryModels[currentTheory] = theoryModel
self.accessibleTheories.add(self.currentTheory)
self.update_with_acc(problemId,features,dependencies,dicts,self.accessibleTheories)
@@ -118,12 +123,7 @@
"""
Predicts the relevant theories. Returns the predicted theories and a list of all accessible premises in these theories.
"""
- # TODO: This can be made a lot faster!
- self.accessibleTheories = []
- for x in accessibles:
- xArt = (dicts.idNameDict[x]).split('.')[0]
- self.accessibleTheories.append(xArt)
- self.accessibleTheories = set(self.accessibleTheories)
+ self.accessibleTheories = set([(dicts.idNameDict[x]).split('.')[0] for x in accessibles])
# Predict Theories
predictedTheories = [self.currentTheory]
@@ -143,9 +143,9 @@
def save(self,fileName):
outStream = open(fileName, 'wb')
- dump((self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict),outStream)
+ dump((self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict,self.defValPos,self.defValNeg,self.posWeight),outStream)
outStream.close()
def load(self,fileName):
inStream = open(fileName, 'rb')
- self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict = load(inStream)
+ self.currentTheory,self.accessibleTheories,self.theoryModels,self.theoryDict,self.defValPos,self.defValNeg,self.posWeight = load(inStream)
inStream.close()
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Fri Jan 11 22:38:12 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Sat Jan 12 16:49:39 2013 +0100
@@ -313,7 +313,7 @@
local
-val version = "*** MaSh version 20130111a ***"
+val version = "*** MaSh version 20130112a ***"
exception Too_New of unit