--- a/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/compareStats.py Thu Dec 06 11:25:10 2012 +0100
@@ -21,12 +21,12 @@
-------- Example Usage ---------------\n\
./compareStats.py --statFiles ../tmp/natISANB.stats ../tmp/natATPNB.stats -b 30\n\n\
Author: Daniel Kuehlwein, July 2012',formatter_class=RawDescriptionHelpFormatter)
-parser.add_argument('--statFiles', default=None, nargs='+',
+parser.add_argument('--statFiles', default=None, nargs='+',
help='The names of the saved statistic files.')
parser.add_argument('-b','--bins',default=50,help="Number of bins for the AUC histogram. Default=50.",type=int)
def main(argv = sys.argv[1:]):
- args = parser.parse_args(argv)
+ args = parser.parse_args(argv)
if args.statFiles == None:
print 'Filenames missing.'
sys.exit(-1)
@@ -56,11 +56,10 @@
legend(loc='upper left')
ylabel('Problems')
xlabel('AUC')
-
+
show()
if __name__ == '__main__':
#args = ['--statFiles','../tmp/natISANB.stats','../tmp/natATPNB.stats','-b','30']
#sys.exit(main(args))
sys.exit(main())
-
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py Thu Dec 06 11:25:10 2012 +0100
@@ -33,18 +33,18 @@
self.accessibleDict = {}
self.expandedAccessibles = {}
self.changed = True
-
+
"""
Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict get filled!
- """
+ """
def init_featureDict(self,featureFile):
self.featureDict,self.maxNameId,self.maxFeatureId = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
- self.maxFeatureId,featureFile)
+ self.maxFeatureId,featureFile)
def init_dependenciesDict(self,depFile):
self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
def init_accessibleDict(self,accFile):
self.accessibleDict,self.maxNameId = create_accessible_dict(self.nameIdDict,self.idNameDict,self.maxNameId,accFile)
-
+
def init_all(self,inputFolder,featureFileName = 'mash_features',depFileName='mash_dependencies',accFileName = 'mash_accessibility'):
featureFile = join(inputFolder,featureFileName)
depFile = join(inputFolder,depFileName)
@@ -54,7 +54,7 @@
self.init_dependenciesDict(depFile)
self.expandedAccessibles = {}
self.changed = True
-
+
def get_name_id(self,name):
"""
Return the Id for a name.
@@ -66,7 +66,7 @@
self.nameIdDict[name] = self.maxNameId
self.idNameDict[self.maxNameId] = name
nameId = self.maxNameId
- self.maxNameId += 1
+ self.maxNameId += 1
self.changed = True
return nameId
@@ -74,8 +74,23 @@
if not self.featureIdDict.has_key(featureName):
self.featureIdDict[featureName] = self.maxFeatureId
self.maxFeatureId += 1
- self.changed = True
-
+ self.changed = True
+ return self.featureIdDict[featureName]
+
+ def get_features(self,line):
+ # Feature Ids
+ featureNames = [f.strip() for f in line[1].split()]
+ features = []
+ for fn in featureNames:
+ tmp = fn.split('=')
+ if len(tmp) == 2:
+ fId = self.add_feature(tmp[0])
+ features.append((fId,float(tmp[1])))
+ else:
+ fId = self.add_feature(fn)
+ features.append((fId,1.0))
+ return features
+
def expand_accessibles(self,acc):
accessibles = set(acc)
unexpandedQueue = Queue()
@@ -86,71 +101,67 @@
unexpandedQueue.put(a)
while not unexpandedQueue.empty():
nextUnExp = unexpandedQueue.get()
- nextUnExpAcc = self.accessibleDict[nextUnExp]
+ nextUnExpAcc = self.accessibleDict[nextUnExp]
for a in nextUnExpAcc:
if not a in accessibles:
accessibles = accessibles.union([a])
if self.expandedAccessibles.has_key(a):
accessibles = accessibles.union(self.expandedAccessibles[a])
else:
- unexpandedQueue.put(a)
+ unexpandedQueue.put(a)
return list(accessibles)
-
+
def parse_fact(self,line):
"""
Parses a single line, extracting accessibles, features, and dependencies.
"""
assert line.startswith('! ')
line = line[2:]
-
+
# line = name:accessibles;features;dependencies
line = line.split(':')
name = line[0].strip()
nameId = self.get_name_id(name)
-
- line = line[1].split(';')
+
+ line = line[1].split(';')
# Accessible Ids
unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()]
self.accessibleDict[nameId] = unExpAcc
- # Feature Ids
- featureNames = [f.strip() for f in line[1].split()]
- for fn in featureNames:
- self.add_feature(fn)
- self.featureDict[nameId] = [self.featureIdDict[fn] for fn in featureNames]
+ self.featureDict[nameId] = self.get_features(line)
self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]
self.changed = True
return nameId
-
+
def parse_overwrite(self,line):
"""
Parses a single line, extracts the problemId and the Ids of the dependencies.
"""
assert line.startswith('p ')
line = line[2:]
-
+
# line = name:dependencies
line = line.split(':')
name = line[0].strip()
nameId = self.get_name_id(name)
-
+
dependencies = [self.nameIdDict[d.strip()] for d in line[1].split()]
self.changed = True
return nameId,dependencies
-
+
def parse_problem(self,line):
"""
- Parses a problem and returns the features and the accessibles.
+ Parses a problem and returns the features and the accessibles.
"""
assert line.startswith('? ')
line = line[2:]
name = None
-
+
# Check whether there is a problem name:
tmp = line.split(':')
if len(tmp) == 2:
name = tmp[0].strip()
line = tmp[1]
-
+
# line = accessibles;features
line = line.split(';')
# Accessible Ids, expand and store the accessibles.
@@ -164,13 +175,14 @@
self.expandedAccessibles[accId] = self.expand_accessibles(accIdAcc)
self.changed = True
accessibles = self.expand_accessibles(unExpAcc)
- # Feature Ids
- featureNames = [f.strip() for f in line[1].split()]
- for fn in featureNames:
- self.add_feature(fn)
- features = [self.featureIdDict[fn] for fn in featureNames]
- return name,features,accessibles
-
+# # Feature Ids
+# featureNames = [f.strip() for f in line[1].split()]
+# for fn in featureNames:
+# self.add_feature(fn)
+# features = [self.featureIdDict[fn] for fn in featureNames]
+ features = self.get_features(line)
+ return name,features,accessibles
+
def save(self,fileName):
if self.changed:
dictsStream = open(fileName, 'wb')
@@ -179,10 +191,8 @@
self.changed = False
dictsStream.close()
def load(self,fileName):
- dictsStream = open(fileName, 'rb')
+ dictsStream = open(fileName, 'rb')
self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict = load(dictsStream)
self.changed = False
dictsStream.close()
-
-
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py Thu Dec 06 11:25:10 2012 +0100
@@ -34,7 +34,7 @@
Author: Daniel Kuehlwein, July 2012',formatter_class=RawDescriptionHelpFormatter)
parser.add_argument('-i','--inputFile',help='File containing all problems to be solved.')
parser.add_argument('-o','--outputDir', default='../tmp/',help='Directory where all created files are stored. Default=../tmp/.')
-parser.add_argument('-p','--predictions',default='../tmp/%s.predictions' % datetime.datetime.now(),
+parser.add_argument('-p','--predictions',default='../tmp/%s.predictions' % datetime.datetime.now(),
help='File where the predictions stored. Default=../tmp/dateTime.predictions.')
parser.add_argument('--numberOfPredictions',default=200,help="Number of premises to write in the output. Default=200.",type=int)
@@ -56,11 +56,11 @@
parser.add_argument('-l','--log', default='../tmp/%s.log' % datetime.datetime.now(), help='Log file name. Default=../tmp/dateTime.log')
parser.add_argument('-q','--quiet',default=False,action='store_true',help="If enabled, only print warnings. Default=False.")
-def main(argv = sys.argv[1:]):
+def main(argv = sys.argv[1:]):
# Initializing command-line arguments
args = parser.parse_args(argv)
- # Set up logging
+ # Set up logging
logging.basicConfig(level=logging.DEBUG,
format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
datefmt='%d-%m %H:%M:%S',
@@ -81,8 +81,8 @@
logger.info('Using the following settings: %s',args)
# Pick algorithm
if args.nb:
- logger.info('Using Naive Bayes for learning.')
- model = NBClassifier()
+ logger.info('Using Naive Bayes for learning.')
+ model = NBClassifier()
modelFile = os.path.join(args.outputDir,'NB.pickle')
elif args.snow:
logger.info('Using naive bayes (SNoW) for learning.')
@@ -90,37 +90,37 @@
modelFile = os.path.join(args.outputDir,'SNoW.pickle')
elif args.predef:
logger.info('Using predefined predictions.')
- predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions')
+ predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions')
model = Predefined(predictionFile)
- modelFile = os.path.join(args.outputDir,'isabelle.pickle')
+ modelFile = os.path.join(args.outputDir,'isabelle.pickle')
else:
- logger.info('No algorithm specified. Using Naive Bayes.')
- model = NBClassifier()
- modelFile = os.path.join(args.outputDir,'NB.pickle')
- dictsFile = os.path.join(args.outputDir,'dicts.pickle')
-
+ logger.info('No algorithm specified. Using Naive Bayes.')
+ model = NBClassifier()
+ modelFile = os.path.join(args.outputDir,'NB.pickle')
+ dictsFile = os.path.join(args.outputDir,'dicts.pickle')
+
# Initializing model
- if args.init:
+ if args.init:
logger.info('Initializing Model.')
startTime = time()
-
- # Load all data
+
+ # Load all data
dicts = Dictionaries()
dicts.init_all(args.inputDir,depFileName=args.depFile)
-
+
# Create Model
trainData = dicts.featureDict.keys()
if args.predef:
dicts = model.initializeModel(trainData,dicts)
else:
model.initializeModel(trainData,dicts)
-
+
model.save(modelFile)
dicts.save(dictsFile)
logger.info('All Done. %s seconds needed.',round(time()-startTime,2))
return 0
- # Create predictions and/or update model
+ # Create predictions and/or update model
else:
lineCounter = 0
dicts = Dictionaries()
@@ -129,15 +129,15 @@
dicts.load(dictsFile)
if os.path.isfile(modelFile):
model.load(modelFile)
-
+
# IO Streams
OS = open(args.predictions,'a')
IS = open(args.inputFile,'r')
-
+
# Statistics
if args.statistics:
stats = Statistics(args.cutOff)
-
+
predictions = None
#Reading Input File
for line in IS:
@@ -151,11 +151,11 @@
if args.predef:
predictions = model.predict[problemId]
else:
- predictions,_predictionsValues = model.predict(dicts.featureDict[problemId],dicts.expand_accessibles(acc))
+ predictions,_predictionsValues = model.predict(dicts.featureDict[problemId],dicts.expand_accessibles(acc))
stats.update(predictions,dicts.dependenciesDict[problemId])
if not stats.badPreds == []:
bp = string.join([str(dicts.idNameDict[x]) for x in stats.badPreds], ',')
- logger.debug('Bad predictions: %s',bp)
+ logger.debug('Bad predictions: %s',bp)
# Update Dependencies, p proves p
dicts.dependenciesDict[problemId] = [problemId]+dicts.dependenciesDict[problemId]
model.update(problemId,dicts.featureDict[problemId],dicts.dependenciesDict[problemId])
@@ -171,15 +171,15 @@
continue
name,features,accessibles = dicts.parse_problem(line)
# Create predictions
- logger.info('Starting computation for problem on line %s',lineCounter)
- predictions,predictionValues = model.predict(features,accessibles)
+ logger.info('Starting computation for problem on line %s',lineCounter)
+ predictions,predictionValues = model.predict(features,accessibles)
assert len(predictions) == len(predictionValues)
logger.info('Done. %s seconds needed.',round(time()-startTime,2))
-
- # Output
+
+ # Output
predictionNames = [str(dicts.idNameDict[p]) for p in predictions[:args.numberOfPredictions]]
- predictionValues = [str(x) for x in predictionValues[:args.numberOfPredictions]]
- predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]
+ predictionValues = [str(x) for x in predictionValues[:args.numberOfPredictions]]
+ predictionsStringList = ['%s=%s' % (predictionNames[i],predictionValues[i]) for i in range(len(predictionNames))]
predictionsString = string.join(predictionsStringList,' ')
outString = '%s: %s' % (name,predictionsString)
OS.write('%s\n' % outString)
@@ -192,14 +192,14 @@
logger.warning('An error occurred on line %s .',line)
lineCounter += 1
continue
- """
+ """
OS.close()
IS.close()
-
+
# Statistics
if args.statistics:
stats.printAvg()
-
+
# Save
if args.saveModel:
model.save(modelFile)
@@ -214,7 +214,7 @@
# Nat
#args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/','--predef']
#args = ['-i', '../data/Nat/mash_commands','-p','../tmp/testIsabelle.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/natATPMP.stats']
- #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/']
+ #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Nat/']
#args = ['-i', '../data/Nat/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/natATPNB.stats','--cutOff','500']
# BUG
#args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/List/','--isabelle']
@@ -223,6 +223,5 @@
#args = ['-i', '../bug/adds/mash_commands','-p','../tmp/testNB.pred','-l','testNB.log','--nb','-o','../tmp/']
#startTime = time()
#sys.exit(main(args))
- #print 'New ' + str(round(time()-startTime,2))
+ #print 'New ' + str(round(time()-startTime,2))
sys.exit(main())
-
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/naiveBayes.py Thu Dec 06 11:25:10 2012 +0100
@@ -24,46 +24,46 @@
Constructor
'''
self.counts = {}
-
+
def initializeModel(self,trainData,dicts):
"""
Build basic model from training data.
- """
+ """
for d in trainData:
dPosCount = 0
- dFeatureCounts = {}
+ dFeatureCounts = {}
self.counts[d] = [dPosCount,dFeatureCounts]
-
+
for key in dicts.dependenciesDict.keys():
# Add p proves p
keyDeps = [key]+dicts.dependenciesDict[key]
-
+
for dep in keyDeps:
self.counts[dep][0] += 1
depFeatures = dicts.featureDict[key]
- for f in depFeatures:
+ for f,_w in depFeatures:
if self.counts[dep][1].has_key(f):
self.counts[dep][1][f] += 1
else:
self.counts[dep][1][f] = 1
-
+
def update(self,dataPoint,features,dependencies):
"""
Updates the Model.
"""
if not self.counts.has_key(dataPoint):
dPosCount = 0
- dFeatureCounts = {}
+ dFeatureCounts = {}
self.counts[dataPoint] = [dPosCount,dFeatureCounts]
for dep in dependencies:
self.counts[dep][0] += 1
- for f in features:
+ for f,_w in features:
if self.counts[dep][1].has_key(f):
self.counts[dep][1][f] += 1
else:
self.counts[dep][1][f] = 1
-
+
def delete(self,dataPoint,features,dependencies):
"""
Deletes a single datapoint from the model.
@@ -73,7 +73,7 @@
for f in features:
self.counts[dep][1][f] -= 1
-
+
def overwrite(self,problemId,newDependencies,dicts):
"""
Deletes the old dependencies of problemId and replaces them with the new ones. Updates the model accordingly.
@@ -83,45 +83,45 @@
features = dicts.featureDict[problemId]
self.delete(problemId,features,oldDeps)
self.update(problemId,features,newDependencies)
-
+
def predict(self,features,accessibles):
"""
For each accessible, predicts the probability of it being useful given the features.
Returns a ranking of the accessibles.
"""
predictions = []
- for a in accessibles:
+ for a in accessibles:
posA = self.counts[a][0]
fA = set(self.counts[a][1].keys())
fWeightsA = self.counts[a][1]
- resultA = log(posA)
- for f in features:
+ resultA = log(posA)
+ for f,w in features:
if f in fA:
if fWeightsA[f] == 0:
- resultA -= 15
+ resultA -= w*15
else:
assert fWeightsA[f] <= posA
- resultA += log(float(fWeightsA[f])/posA)
+ resultA += w*log(float(fWeightsA[f])/posA)
else:
- resultA -= 15
+ resultA -= w*15
predictions.append(resultA)
#expPredictions = array([exp(x) for x in predictions])
predictions = array(predictions)
- perm = (-predictions).argsort()
- #return array(accessibles)[perm],expPredictions[perm]
+ perm = (-predictions).argsort()
+ #return array(accessibles)[perm],expPredictions[perm]
return array(accessibles)[perm],predictions[perm]
-
+
def save(self,fileName):
OStream = open(fileName, 'wb')
- dump(self.counts,OStream)
+ dump(self.counts,OStream)
OStream.close()
-
+
def load(self,fileName):
OStream = open(fileName, 'rb')
- self.counts = load(OStream)
+ self.counts = load(OStream)
OStream.close()
-
+
if __name__ == '__main__':
featureDict = {0:[0,1,2],1:[3,2,1]}
dependenciesDict = {0:[0],1:[0,1]}
@@ -136,4 +136,4 @@
d.loadModel('x')
print c.counts
print d.counts
- print 'Done'
\ No newline at end of file
+ print 'Done'
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/predefined.py Thu Dec 06 11:25:10 2012 +0100
@@ -23,11 +23,11 @@
Constructor
'''
self.predictionFile = mpPredictionFile
-
+
def initializeModel(self,_trainData,dicts):
"""
Load predictions.
- """
+ """
self.predictions = {}
IS = open(self.predictionFile,'r')
for line in IS:
@@ -39,28 +39,27 @@
self.predictions[predId] = preds
IS.close()
return dicts
-
+
def update(self,dataPoint,features,dependencies):
"""
Updates the Model.
"""
# No Update needed since we assume that we got all predictions
pass
-
-
+
+
def predict(self,problemId):
"""
Return the saved predictions.
"""
- return self.predictions[problemId]
-
+ return self.predictions[problemId]
+
def save(self,fileName):
OStream = open(fileName, 'wb')
- dump((self.predictionFile,self.predictions),OStream)
+ dump((self.predictionFile,self.predictions),OStream)
OStream.close()
-
+
def load(self,fileName):
OStream = open(fileName, 'rb')
- self.predictionFile,self.predictions = load(OStream)
+ self.predictionFile,self.predictions = load(OStream)
OStream.close()
-
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py Thu Dec 06 11:25:10 2012 +0100
@@ -15,12 +15,12 @@
import sys,logging
def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,inputFile):
- logger = logging.getLogger('create_feature_dict')
+ logger = logging.getLogger('create_feature_dict')
featureDict = {}
- IS = open(inputFile,'r')
+ IS = open(inputFile,'r')
for line in IS:
line = line.split(':')
- name = line[0]
+ name = line[0]
# Name Id
if nameIdDict.has_key(name):
logger.warning('%s appears twice in the feature file. Aborting.',name)
@@ -29,33 +29,41 @@
nameIdDict[name] = maxNameId
idNameDict[maxNameId] = name
nameId = maxNameId
- maxNameId += 1
+ maxNameId += 1
# Feature Ids
- featureNames = [f.strip() for f in line[1].split()]
+ featureNames = [f.strip() for f in line[1].split()]
+ features = []
for fn in featureNames:
- if not featureIdDict.has_key(fn):
- featureIdDict[fn] = maxFeatureId
- maxFeatureId += 1
- featureIds = [featureIdDict[fn] for fn in featureNames]
+ tmp = fn.split('=')
+ if len(tmp) == 2:
+ if not featureIdDict.has_key(tmp[0]):
+ featureIdDict[tmp[0]] = maxFeatureId
+ maxFeatureId += 1
+ features.append((featureIdDict[tmp[0]],float(tmp[1])))
+ else:
+ if not featureIdDict.has_key(fn):
+ featureIdDict[fn] = maxFeatureId
+ maxFeatureId += 1
+ features.append((featureIdDict[fn],1.0))
# Store results
- featureDict[nameId] = featureIds
+ featureDict[nameId] = features
IS.close()
return featureDict,maxNameId,maxFeatureId
-
+
def create_dependencies_dict(nameIdDict,inputFile):
logger = logging.getLogger('create_dependencies_dict')
dependenciesDict = {}
IS = open(inputFile,'r')
for line in IS:
line = line.split(':')
- name = line[0]
+ name = line[0]
# Name Id
if not nameIdDict.has_key(name):
logger.warning('%s is missing in nameIdDict. Aborting.',name)
sys.exit(-1)
nameId = nameIdDict[name]
- dependenciesIds = [nameIdDict[f.strip()] for f in line[1].split()]
+ dependenciesIds = [nameIdDict[f.strip()] for f in line[1].split()]
# Store results, add p proves p
dependenciesDict[nameId] = [nameId] + dependenciesIds
IS.close()
@@ -67,19 +75,17 @@
IS = open(inputFile,'r')
for line in IS:
line = line.split(':')
- name = line[0]
+ name = line[0]
# Name Id
if not nameIdDict.has_key(name):
logger.warning('%s is missing in nameIdDict. Adding it as theory.',name)
nameIdDict[name] = maxNameId
idNameDict[maxNameId] = name
nameId = maxNameId
- maxNameId += 1
+ maxNameId += 1
else:
nameId = nameIdDict[name]
accessibleStrings = line[1].split()
accessibleDict[nameId] = [nameIdDict[a.strip()] for a in accessibleStrings]
IS.close()
return accessibleDict,maxNameId
-
-
\ No newline at end of file
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/snow.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/snow.py Thu Dec 06 11:25:10 2012 +0100
@@ -27,13 +27,13 @@
'''
self.logger = logging.getLogger('SNoW')
self.SNoWTrainFile = '../tmp/snow.train'
- self.SNoWTestFile = '../snow.test'
- self.SNoWNetFile = '../tmp/snow.net'
-
+ self.SNoWTestFile = '../snow.test'
+ self.SNoWNetFile = '../tmp/snow.net'
+
def initializeModel(self,trainData,dicts):
"""
Build basic model from training data.
- """
+ """
# Prepare input files
self.logger.debug('Creating IO Files')
OS = open(self.SNoWTrainFile,'w')
@@ -44,60 +44,60 @@
dependencies = dicts.dependenciesDict[nameId]
dependencies = map(str,dependencies)
dependenciesString = string.join(dependencies,',')
- snowString = string.join([featureString,dependenciesString],',')+':\n'
+ snowString = string.join([featureString,dependenciesString],',')+':\n'
OS.write(snowString)
OS.close()
# Build Model
self.logger.debug('Building Model START.')
- snowTrainCommand = '../bin/snow -train -M+ -I %s -F %s -g- -B :0-%s' % (self.SNoWTrainFile,self.SNoWNetFile,dicts.maxNameId-1)
+ snowTrainCommand = '../bin/snow -train -M+ -I %s -F %s -g- -B :0-%s' % (self.SNoWTrainFile,self.SNoWNetFile,dicts.maxNameId-1)
args = shlex.split(snowTrainCommand)
- p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+ p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
p.wait()
- self.logger.debug('Building Model END.')
+ self.logger.debug('Building Model END.')
-
+
def update(self,dataPoint,features,dependencies,dicts):
"""
Updates the Model.
THIS IS NOT WORKING ATM< BUT WE DONT CARE
- """
- self.logger.debug('Updating Model START')
- trainData = dicts.featureDict.keys()
+ """
+ self.logger.debug('Updating Model START')
+ trainData = dicts.featureDict.keys()
self.initializeModel(trainData,dicts)
self.logger.debug('Updating Model END')
-
-
+
+
def predict(self,features,accessibles,dicts):
logger = logging.getLogger('predict_SNoW')
-
+
OS = open(self.SNoWTestFile,'w')
features = map(str,features)
featureString = string.join(features, ',')
snowString = featureString+':'
OS.write(snowString)
- OS.close()
-
+ OS.close()
+
snowTestCommand = '../bin/snow -test -I %s -F %s -o allboth' % (self.SNoWTestFile,self.SNoWNetFile)
args = shlex.split(snowTestCommand)
- p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
+ p = subprocess.Popen(args,stdout=subprocess.PIPE,stderr=subprocess.STDOUT)
(lines, _stderrdata) = p.communicate()
logger.debug('SNoW finished.')
- lines = lines.split('\n')
+ lines = lines.split('\n')
assert lines[9].startswith('Example ')
assert lines[-4] == ''
- predictionsCon = []
+ predictionsCon = []
for line in lines[10:-4]:
premiseId = int(line.split()[0][:-1])
predictionsCon.append(premiseId)
return predictionsCon
-
+
def save(self,fileName):
OStream = open(fileName, 'wb')
- dump(self.counts,OStream)
+ dump(self.counts,OStream)
OStream.close()
-
+
def load(self,fileName):
OStream = open(fileName, 'rb')
- self.counts = load(OStream)
- OStream.close()
\ No newline at end of file
+ self.counts = load(OStream)
+ OStream.close()
--- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Wed Dec 05 15:59:08 2012 +0100
+++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py Thu Dec 06 11:25:10 2012 +0100
@@ -32,18 +32,38 @@
self.recallData = [0]*cutOff
self.recall100Data = [0]*cutOff
self.aucData = []
-
- def update(self,predictions,dependencies):
+ self.premiseOccurenceCounter = {}
+ self.firstDepAppearance = {}
+ self.depAppearances = []
+
+ def update(self,predictions,dependencies,statementCounter):
"""
Evaluates AUC, dependencies, recall100 and number of available premises of a prediction.
"""
-
available = len(predictions)
predictions = predictions[:self.cutOff]
dependencies = set(dependencies)
+ # No Stats for if no dependencies
+ if len(dependencies) == 0:
+ self.logger.debug('No Dependencies for statement %s' % statementCounter )
+ self.badPreds = []
+ return
+ if len(predictions) < self.cutOff:
+ for i in range(len(predictions),self.cutOff):
+ self.recall100Data[i] += 1
+ self.recallData[i] += 1
+ for d in dependencies:
+ if self.premiseOccurenceCounter.has_key(d):
+ self.premiseOccurenceCounter[d] += 1
+ else:
+ self.premiseOccurenceCounter[d] = 1
+ if self.firstDepAppearance.has_key(d):
+ self.depAppearances.append(statementCounter-self.firstDepAppearance[d])
+ else:
+ self.firstDepAppearance[d] = statementCounter
depNr = len(dependencies)
- aucSum = 0.
- posResults = 0.
+ aucSum = 0.
+ posResults = 0.
positives, negatives = 0, 0
recall100 = 0.0
badPreds = []
@@ -56,7 +76,7 @@
depsFound.append(pId)
if index > 200:
badPreds.append(pId)
- else:
+ else:
aucSum += posResults
negatives+=1
# Update Recall and Recall100 stats
@@ -66,7 +86,7 @@
self.recallData[index] += 1
else:
self.recallData[index] += float(positives)/depNr
-
+
if not depNr == positives:
depsFound = set(depsFound)
missing = []
@@ -78,28 +98,29 @@
positives+=1
self.logger.debug('Dependencies missing for %s in accessibles! Estimating Statistics.',\
string.join([str(dep) for dep in missing],','))
-
+
if positives == 0 or negatives == 0:
auc = 1.0
- else:
+ else:
auc = aucSum/(negatives*positives)
-
+
self.aucData.append(auc)
self.avgAUC += auc
self.avgRecall100 += recall100
self.problems += 1
self.badPreds = badPreds
- self.avgAvailable += available
+ self.avgAvailable += available
self.avgDepNr += depNr
- self.logger.info('AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
- round(100*auc,2),depNr,recall100,available,self.cutOff)
-
+ self.logger.info('Statement: %s: AUC: %s \t Needed: %s \t Recall100: %s \t Available: %s \t cutOff:%s',\
+ statementCounter,round(100*auc,2),depNr,recall100,available,self.cutOff)
+
def printAvg(self):
self.logger.info('Average results:')
self.logger.info('avgAUC: %s \t avgDepNr: %s \t avgRecall100: %s \t cutOff:%s', \
- round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),self.avgRecall100/self.problems,self.cutOff)
+ round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),self.cutOff)
- try:
+ #try:
+ if True:
from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
avgRecall = [float(x)/self.problems for x in self.recallData]
figure('Recall')
@@ -116,15 +137,30 @@
hist(self.aucData,bins=100)
ylabel('Problems')
xlabel('AUC')
+ maxCount = max(self.premiseOccurenceCounter.values())
+ minCount = min(self.premiseOccurenceCounter.values())
+ figure('Dependency Occurances')
+ hist(self.premiseOccurenceCounter.values(),bins=range(minCount,maxCount+2),align = 'left')
+ #ylabel('Occurences')
+ xlabel('Number of Times a Dependency Occurs')
+ figure('Dependency Appearance in Problems after Introduction.')
+ hist(self.depAppearances,bins=50)
+ figure('Dependency Appearance in Problems after Introduction in Percent.')
+ xAxis = range(max(self.depAppearances)+1)
+ yAxis = [0] * (max(self.depAppearances)+1)
+ for val in self.depAppearances:
+ yAxis[val] += 1
+ yAxis = [float(x)/len(self.firstDepAppearance.keys()) for x in yAxis]
+ plot(xAxis,yAxis)
show()
- except:
- self.logger.warning('Matplotlib module missing. Skipping graphs.')
-
- def save(self,fileName):
+ #except:
+ # self.logger.warning('Matplotlib module missing. Skipping graphs.')
+
+ def save(self,fileName):
oStream = open(fileName, 'wb')
- dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData),oStream)
+ dump((self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter),oStream)
oStream.close()
def load(self,fileName):
- iStream = open(fileName, 'rb')
- self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData = load(iStream)
- iStream.close()
\ No newline at end of file
+ iStream = open(fileName, 'rb')
+ self.avgAUC,self.avgRecall100,self.avgAvailable,self.avgDepNr,self.problems,self.cutOff,self.recallData,self.recall100Data,self.aucData,self.premiseOccurenceCounter = load(iStream)
+ iStream.close()