--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Fri Jan 11 16:30:56 2013 +0100
@@ -31,6 +31,7 @@
self.featureDict = {}
self.dependenciesDict = {}
self.accessibleDict = {}
+ self.featureCountDict = {}
self.expandedAccessibles = {}
self.changed = True
@@ -38,8 +39,8 @@
Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled!
"""
def init_featureDict(self,featureFile):
- self.featureDict,self.maxNameId,self.maxFeatureId = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
- self.maxFeatureId,featureFile)
+ self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
+ self.maxFeatureId,self.featureCountDict,featureFile)
def init_dependenciesDict(self,depFile):
self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
def init_accessibleDict(self,accFile):
@@ -73,9 +74,12 @@
def add_feature(self,featureName):
if not self.featureIdDict.has_key(featureName):
self.featureIdDict[featureName] = self.maxFeatureId
+ self.featureCountDict[self.maxFeatureId] = 0
self.maxFeatureId += 1
self.changed = True
- return self.featureIdDict[featureName]
+ fId = self.featureIdDict[featureName]
+ self.featureCountDict[fId] += 1
+ return fId
def get_features(self,line):
# Feature Ids
@@ -83,12 +87,12 @@
features = []
for fn in featureNames:
tmp = fn.split('=')
+ weight = 1.0
if len(tmp) == 2:
- fId = self.add_feature(tmp[0])
- features.append((fId,float(tmp[1])))
- else:
- fId = self.add_feature(fn)
- features.append((fId,1.0))
+ fn = tmp[0]
+ weight = float(tmp[1])
+ fId = self.add_feature(fn)
+ features.append((fId,weight))
return features
def expand_accessibles(self,acc):
@@ -150,7 +154,7 @@
def parse_problem(self,line):
"""
- Parses a problem and returns the features and the accessibles.
+ Parses a problem and returns the features, the accessibles, and any hints.
"""
assert line.startswith('? ')
line = line[2:]
@@ -176,19 +180,24 @@
self.changed = True
accessibles = self.expand_accessibles(unExpAcc)
features = self.get_features(line)
+ # Get hints:
+ if len(line) == 3:
+ hints = [self.nameIdDict[d.strip()] for d in line[2].split()]
+ else:
+ hints = []
- return name,features,accessibles
+ return name,features,accessibles,hints
def save(self,fileName):
if self.changed:
dictsStream = open(fileName, 'wb')
dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
- self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict),dictsStream)
+ self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict),dictsStream)
self.changed = False
dictsStream.close()
def load(self,fileName):
dictsStream = open(fileName, 'rb')
self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
- self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
+ self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict = load(dictsStream)
self.changed = False
dictsStream.close()
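The dictionaries.py changes above introduce weighted features and per-feature occurrence counts: a token of the form name=weight carries an explicit weight, a bare token defaults to 1.0, and featureCountDict maps a feature id to the number of times that feature has been seen (initialized to 0 when the id is allocated, incremented on every occurrence). A minimal standalone sketch of this bookkeeping, with the hypothetical name parse_features and sequential ids standing in for maxFeatureId:

def parse_features(tokens, featureIdDict, featureCountDict):
    # 'name=weight' tokens carry an explicit weight; bare tokens default to 1.0.
    features = []
    for token in tokens:
        name, _, weightStr = token.partition('=')
        weight = float(weightStr) if weightStr else 1.0
        if name not in featureIdDict:
            featureIdDict[name] = len(featureIdDict)  # stands in for maxFeatureId
            featureCountDict[featureIdDict[name]] = 0
        fId = featureIdDict[name]
        featureCountDict[fId] += 1  # count every occurrence of the feature
        features.append((fId, weight))
    return features

ids, counts = {}, {}
parse_features(['nat', 'List.list=2.0', 'nat'], ids, counts)
# returns [(0, 1.0), (1, 2.0), (0, 1.0)] and leaves counts == {0: 2, 1: 1}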
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Fri Jan 11 16:30:56 2013 +0100
@@ -48,6 +48,8 @@
help='Name of the file with the premise dependencies. The file must be in inputDir. Default = mash_dependencies')
parser.add_argument('--saveModel',default=False,action='store_true',help="Stores the learned Model at the end of a prediction run. Default=False.")
parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.")
+# sinePrior is consumed by sparseNBClassifier (see sparseNaiveBayes.py).
+parser.add_argument('--sinePrior',default=False,action='store_true',help="Uses a SInE-like prior for premise-level predictions. Default=False.")
parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.")
@@ -87,7 +89,7 @@
# Pick algorithm
if args.nb:
logger.info('Using sparse Naive Bayes for learning.')
- model = sparseNBClassifier()
+ model = sparseNBClassifier(args.sinePrior)
modelFile = os.path.join(args.outputDir,'NB.pickle')
elif args.snow:
logger.info('Using naive bayes (SNoW) for learning.')
@@ -101,7 +103,7 @@
modelFile = os.path.join(args.outputDir,'mepo.pickle')
else:
logger.info('No algorithm specified. Using sparse Naive Bayes.')
- model = sparseNBClassifier()
+ model = sparseNBClassifier(args.sinePrior)
modelFile = os.path.join(args.outputDir,'NB.pickle')
dictsFile = os.path.join(args.outputDir,'dicts.pickle')
theoryFile = os.path.join(args.outputDir,'theory.pickle')
@@ -182,7 +184,7 @@
# Update Dependencies, p proves p
dicts.dependenciesDict[problemId] = [problemId]+dicts.dependenciesDict[problemId]
if args.learnTheories:
- theoryModels.update(problemId,dicts)
+ theoryModels.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId],dicts)
if args.snow:
model.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId],dicts)
else:
@@ -200,16 +202,34 @@
computeStats = True
if args.predef:
continue
- name,features,accessibles = dicts.parse_problem(line)
+ name,features,accessibles,hints = dicts.parse_problem(line)
# Create predictions
logger.info('Starting computation for problem on line %s',lineCounter)
+ # Update Models with hints
+ if hints:
+ if args.learnTheories:
+ accessibleTheories = set([(dicts.idNameDict[x]).split('.')[0] for x in accessibles])
+ theoryModels.update_with_acc('hints',features,hints,dicts,accessibleTheories)
+ if args.snow:
+ pass # hint updates are not implemented for the SNoW learner
+ else:
+ model.update('hints',features,hints)
+
+ # Predict premises
if args.learnTheories:
predictedTheories,accessibles = theoryModels.predict(features,accessibles,dicts)
- if args.snow:
- predictions,predictionValues = model.predict(features,accessibles,dicts)
- else:
- predictions,predictionValues = model.predict(features,accessibles)
+ predictions,predictionValues = model.predict(features,accessibles,dicts)
assert len(predictions) == len(predictionValues)
+
+ # Delete hints
+ if hints:
+ if args.learnTheories:
+ theoryModels.delete('hints',features,hints,dicts)
+ if args.snow:
+ pass # no hint update was made for SNoW, so there is nothing to undo
+ else:
+ model.delete('hints',features,hints)
+
logger.info('Done. %s seconds needed.',round(time()-startTime,2))
# Output
predictionNames = [str(dicts.idNameDict[p]) for p in predictions[:args.numberOfPredictions]]
@@ -253,10 +273,19 @@
if __name__ == '__main__':
# Example:
+ # Auth
+ # ISAR Theories
+ #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--learnTheories']
+ #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/AuthIsarNB.stats','--cutOff','500','--learnTheories']
+ # ISAR MePo
+ #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--predef']
+ #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/AuthMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/AuthMePo.stats']
+
+
# Jinja
# ISAR Theories
- #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/','--learnTheories']
- #args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
+ #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Jinja/','--learnTheories']
+ #args = ['-i', '../data/20121227b/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
# ISAR NB
#args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/']
#args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500']
@@ -277,6 +306,9 @@
# Probability
+ # ISAR Theories
+ #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/','--learnTheories']
+ #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/ProbIsarNB.stats','--cutOff','500','--learnTheories']
# ISAR NB
#args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/']
#args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/ProbIsarNB.stats','--cutOff','500']
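The hint support added to the prediction loop follows a learn-predict-unlearn pattern: any hints are temporarily learned as the dependencies of a pseudo-fact named 'hints', the prediction is computed, and the temporary update is deleted again, so later problems see an unchanged model. A sketch of the pattern under the assumption, which holds for sparseNBClassifier, that update and delete are exact inverses (predict signature simplified; predict_with_hints is a hypothetical helper):

def predict_with_hints(model, features, accessibles, hints):
    if hints:
        model.update('hints', features, hints)  # temporarily learn the hints
    try:
        return model.predict(features, accessibles)
    finally:
        if hints:
            model.delete('hints', features, hints)  # roll the update back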
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Fri Jan 11 16:30:56 2013 +0100
@@ -14,7 +14,7 @@
import sys,logging
-def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,inputFile):
+def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,inputFile):
logger = logging.getLogger('create_feature_dict')
featureDict = {}
IS = open(inputFile,'r')
@@ -32,23 +32,24 @@
maxNameId += 1
# Feature Ids
featureNames = [f.strip() for f in line[1].split()]
- features = []
+ features = []
for fn in featureNames:
+ weight = 1.0
tmp = fn.split('=')
if len(tmp) == 2:
- if not featureIdDict.has_key(tmp[0]):
- featureIdDict[tmp[0]] = maxFeatureId
- maxFeatureId += 1
- features.append((featureIdDict[tmp[0]],float(tmp[1])))
- else:
- if not featureIdDict.has_key(fn):
- featureIdDict[fn] = maxFeatureId
- maxFeatureId += 1
- features.append((featureIdDict[fn],1.0))
+ fn = tmp[0]
+ weight = float(tmp[1])
+ if not featureIdDict.has_key(fn):
+ featureIdDict[fn] = maxFeatureId
+ featureCountDict[maxFeatureId] = 0
+ maxFeatureId += 1
+ fId = featureIdDict[fn]
+ features.append((fId,weight))
+ featureCountDict[fId] += 1
# Store results
featureDict[nameId] = features
IS.close()
- return featureDict,maxNameId,maxFeatureId
+ return featureDict,maxNameId,maxFeatureId,featureCountDict
def create_dependencies_dict(nameIdDict,inputFile):
logger = logging.getLogger('create_dependencies_dict')
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py Fri Jan 11 16:30:56 2013 +0100
@@ -61,7 +61,7 @@
self.pos -= 1
else:
self.neg -= 1
- for f in features:
+ for f,_w in features:
posCount,negCount = self.counts[f]
if label:
posCount -= 1
@@ -79,7 +79,7 @@
def predict_sparse(self,features):
"""
- Returns 1 if the probability is greater than 50%.
+ Returns 1 if the label + is more probable than the label -.
"""
if self.neg == 0:
return 1
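The reworded docstring makes the decision rule explicit: predict_sparse compares the scores of the two labels and returns 1 exactly when the positive label wins. An illustrative two-class naive Bayes decision of the same shape, with a hypothetical predict_binary and Laplace-style smoothing standing in for the classifier's actual count handling:

from math import log

def predict_binary(features, pos, neg, counts, smooth=1.0):
    # counts maps a feature to (posCount, negCount); pos and neg are label totals.
    if neg == 0:
        return 1
    if pos == 0:
        return 0
    scorePos, scoreNeg = log(pos), log(neg)
    for f in features:
        posCount, negCount = counts.get(f, (0, 0))
        scorePos += log((posCount + smooth) / (pos + smooth))
        scoreNeg += log((negCount + smooth) / (neg + smooth))
    return 1 if scorePos > scoreNeg else 0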
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py Fri Jan 11 16:30:56 2013 +0100
@@ -19,11 +19,15 @@
An updateable naive Bayes classifier.
'''
- def __init__(self):
+ def __init__(self,useSinePrior = False):
'''
Constructor
'''
self.counts = {}
+ self.sinePrior = useSinePrior
+ self.defaultPriorWeight = 20
+ self.posWeight = 20
+ self.defVal = 15
def initializeModel(self,trainData,dicts):
"""
@@ -32,6 +36,10 @@
for d in trainData:
dPosCount = 0
dFeatureCounts = {}
+ # Give the trivial dependency p |- p a higher initial weight (defaultPriorWeight)
+ dPosCount = self.defaultPriorWeight
+ for f,_w in dicts.featureDict[d]:
+ dFeatureCounts[f] = self.defaultPriorWeight
self.counts[d] = [dPosCount,dFeatureCounts]
for key in dicts.dependenciesDict.keys():
@@ -53,8 +61,12 @@
"""
if not self.counts.has_key(dataPoint):
dPosCount = 0
- dFeatureCounts = {}
- self.counts[dataPoint] = [dPosCount,dFeatureCounts]
+ dFeatureCounts = {}
+ # Give the trivial dependency p |- p a higher initial weight (defaultPriorWeight)
+ dPosCount = self.defaultPriorWeight
+ for f,_w in features:
+ dFeatureCounts[f] = self.defaultPriorWeight
+ self.counts[dataPoint] = [dPosCount,dFeatureCounts]
for dep in dependencies:
self.counts[dep][0] += 1
for f,_w in features:
@@ -69,7 +81,7 @@
"""
for dep in dependencies:
self.counts[dep][0] -= 1
- for f in features:
+ for f,_w in features:
self.counts[dep][1][f] -= 1
@@ -83,13 +95,11 @@
self.delete(problemId,features,oldDeps)
self.update(problemId,features,newDependencies)
- def predict(self,features,accessibles):
+ def predict(self,features,accessibles,dicts):
"""
For each accessible, predicts the probability of it being useful given the features.
Returns a ranking of the accessibles.
"""
- posWeight = 20.0
- defVal = 15
predictions = []
for a in accessibles:
posA = self.counts[a][0]
@@ -101,12 +111,12 @@
#w = 1
if f in fA:
if fWeightsA[f] == 0:
- resultA -= w*defVal
+ resultA -= w*self.defVal
else:
assert fWeightsA[f] <= posA
- resultA += w*log(float(posWeight*fWeightsA[f])/posA)
+ resultA += w*log(float(self.posWeight*fWeightsA[f])/posA)
else:
- resultA -= w*defVal
+ resultA -= w*self.defVal
predictions.append(resultA)
#expPredictions = array([exp(x) for x in predictions])
predictions = array(predictions)
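Moving posWeight and defVal onto the classifier lets predict reward an accessible fact for each goal feature it has co-occurred with, via w*log(posWeight*count/posA), and penalize it by a flat w*defVal for each goal feature it lacks. A condensed sketch of the per-fact score (score is a hypothetical helper; in the actual predict, posA and fWeightsA come from self.counts[a], and defaultPriorWeight keeps posA positive):

from math import log

def score(features, posA, fWeightsA, posWeight=20.0, defVal=15.0):
    # posA: how often the fact was used; fWeightsA: feature -> co-occurrence count.
    result = log(posA)
    for f, w in features:
        if fWeightsA.get(f, 0) > 0:
            result += w * log(posWeight * fWeightsA[f] / posA)
        else:
            result -= w * defVal  # unseen or zero-count feature: flat penalty
    return result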
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryModels.py Fri Jan 11 16:30:56 2013 +0100
@@ -87,15 +87,13 @@
for a in self.accTheories:
self.theoryModels[a].overwrite(features,a in oldTheories,a in newTheories)
- def delete(self):
- pass
+ def delete(self,problemId,features,dependencies,dicts):
+ tmp = [dicts.idNameDict[x] for x in dependencies]
+ usedTheories = set([x.split('.')[0] for x in tmp])
+ for a in self.accessibleTheories:
+ self.theoryModels[a].delete(features,a in usedTheories)
- def update(self,problemId,dicts):
- features = dicts.featureDict[problemId]
-
- # Find the actually used theories
- tmp = [dicts.idNameDict[x] for x in dicts.dependenciesDict[problemId]]
- usedTheories = set([x.split('.')[0] for x in tmp])
+ def update(self,problemId,features,dependencies,dicts):
currentTheory = (dicts.idNameDict[problemId]).split('.')[0]
# Create new theory model, if there is a new theory
if not self.theoryDict.has_key(currentTheory):
@@ -105,9 +103,15 @@
self.currentTheory = currentTheory
theoryModel = singleNBClassifier()
self.theoryModels[currentTheory] = theoryModel
- self.accessibleTheories.add(self.currentTheory)
- if not len(usedTheories) == 0:
- for a in self.accessibleTheories:
+ self.accessibleTheories.add(self.currentTheory)
+ self.update_with_acc(problemId,features,dependencies,dicts,self.accessibleTheories)
+
+ def update_with_acc(self,problemId,features,dependencies,dicts,accessibleTheories):
+ # Find the actually used theories
+ tmp = [dicts.idNameDict[x] for x in dependencies]
+ usedTheories = set([x.split('.')[0] for x in tmp])
+ if usedTheories:
+ for a in accessibleTheories:
self.theoryModels[a].update(features,a in usedTheories)
def predict(self,features,accessibles,dicts):
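After this refactoring, update derives the set of accessible theories itself and delegates to update_with_acc, while the hint handling in mash.py supplies its own set; either way, each accessible theory's binary model learns whether that theory actually contributed a dependency. A small sketch of the shared core (update_theory_models is a hypothetical name; theoryModels is assumed to be a plain dict of singleNBClassifier-style objects):

def update_theory_models(theoryModels, features, dependencyNames, accessibleTheories):
    # Each accessible theory's model learns whether its theory was used.
    usedTheories = set(name.split('.')[0] for name in dependencyNames)
    if usedTheories:
        for theory in accessibleTheories:
            theoryModels[theory].update(features, theory in usedTheories)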
--- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Fri Jan 11 16:30:56 2013 +0100
+++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML Fri Jan 11 16:30:56 2013 +0100
@@ -313,7 +313,7 @@
local
-val version = "*** MaSh version 20130104a ***"
+val version = "*** MaSh version 20130111a ***"
exception Too_New of unit