updated MaSh
authorblanchet
Thu Jan 17 17:55:03 2013 +0100 (2013-01-17)
changeset 50951e1cbaa7d5536
parent 50950 a145ee10371b
child 50952 3fd83a0cc4bd
updated MaSh
src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py
src/HOL/Tools/Sledgehammer/MaSh/src/mash.py
src/HOL/Tools/Sledgehammer/MaSh/src/readData.py
src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py
src/HOL/Tools/Sledgehammer/MaSh/src/stats.py
src/HOL/Tools/Sledgehammer/MaSh/src/tester.py
src/HOL/Tools/Sledgehammer/MaSh/src/theoryStats.py
src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML
     1.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Thu Jan 17 17:55:02 2013 +0100
     1.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/dictionaries.py	Thu Jan 17 17:55:03 2013 +0100
     1.3 @@ -32,27 +32,33 @@
     1.4          self.dependenciesDict = {}
     1.5          self.accessibleDict = {}
     1.6          self.expandedAccessibles = {}
     1.7 -        # For SInE-like prior
     1.8 -        self.featureCountDict = {}
     1.9 -        self.triggerFeatures = {}
    1.10 +        # For SInE features
    1.11 +        self.useSine = False
    1.12 +        self.featureCountDict = {} 
    1.13 +        self.triggerFeaturesDict = {} 
    1.14 +        self.featureTriggeredFormulasDict = {}
    1.15          self.changed = True
    1.16  
    1.17      """
    1.18 -    Init functions. Side Effect: nameIdDict, idNameDict, featureIdDict, articleDict get filled!
    1.19 +    Init functions. nameIdDict, idNameDict, featureIdDict, articleDict get filled!
    1.20      """
    1.21 -    def init_featureDict(self,featureFile):
    1.22 -        self.featureDict,self.maxNameId,self.maxFeatureId, self.featureCountDict,self.triggerFeatures = create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,\
    1.23 -                                                                                                       self.maxFeatureId,self.featureCountDict,self.triggerFeatures,featureFile)
    1.24 +    def init_featureDict(self,featureFile,sineFeatures):
    1.25 +        self.featureDict,self.maxNameId,self.maxFeatureId,self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict =\
    1.26 +         create_feature_dict(self.nameIdDict,self.idNameDict,self.maxNameId,self.featureIdDict,self.maxFeatureId,self.featureCountDict,\
    1.27 +                             self.triggerFeaturesDict,self.featureTriggeredFormulasDict,sineFeatures,featureFile)
    1.28      def init_dependenciesDict(self,depFile):
    1.29          self.dependenciesDict = create_dependencies_dict(self.nameIdDict,depFile)
    1.30      def init_accessibleDict(self,accFile):
    1.31          self.accessibleDict,self.maxNameId = create_accessible_dict(self.nameIdDict,self.idNameDict,self.maxNameId,accFile)
    1.32  
    1.33 -    def init_all(self,inputFolder,featureFileName = 'mash_features',depFileName='mash_dependencies',accFileName = 'mash_accessibility'):
    1.34 -        featureFile = join(inputFolder,featureFileName)
    1.35 -        depFile = join(inputFolder,depFileName)
    1.36 -        accFile = join(inputFolder,accFileName)
    1.37 -        self.init_featureDict(featureFile)
    1.38 +    def init_all(self,args):
    1.39 +        self.featureFileName = 'mash_features'
    1.40 +        self.accFileName = 'mash_accessibility'
    1.41 +        self.useSine = args.sineFeatures
    1.42 +        featureFile = join(args.inputDir,self.featureFileName)
    1.43 +        depFile = join(args.inputDir,args.depFile)
    1.44 +        accFile = join(args.inputDir,self.accFileName)
    1.45 +        self.init_featureDict(featureFile,self.useSine)
    1.46          self.init_accessibleDict(accFile)
    1.47          self.init_dependenciesDict(depFile)
    1.48          self.expandedAccessibles = {}
    1.49 @@ -76,11 +82,13 @@
    1.50      def add_feature(self,featureName):
    1.51          if not self.featureIdDict.has_key(featureName):
    1.52              self.featureIdDict[featureName] = self.maxFeatureId
    1.53 -            self.featureCountDict[self.maxFeatureId] = 0
    1.54 +            if self.useSine:
    1.55 +                self.featureCountDict[self.maxFeatureId] = 0
    1.56              self.maxFeatureId += 1
    1.57              self.changed = True
    1.58          fId = self.featureIdDict[featureName]
    1.59 -        self.featureCountDict[fId] += 1
    1.60 +        if self.useSine:
    1.61 +            self.featureCountDict[fId] += 1
    1.62          return fId
    1.63  
    1.64      def get_features(self,line):
    1.65 @@ -128,15 +136,22 @@
    1.66          line = line.split(':')
    1.67          name = line[0].strip()
    1.68          nameId = self.get_name_id(name)
    1.69 -
    1.70          line = line[1].split(';')
    1.71          # Accessible Ids
    1.72          unExpAcc = [self.nameIdDict[a.strip()] for a in line[0].split()]
    1.73          self.accessibleDict[nameId] = unExpAcc
    1.74          features = self.get_features(line)
    1.75          self.featureDict[nameId] = features
    1.76 -        minVal = min([self.featureCountDict[f] for f,_w in features])
    1.77 -        self.triggerFeatures[nameId] = [f for f,_w in features if self.featureCountDict[f] == minVal]
    1.78 +        if self.useSine:
    1.79 +            # SInE Features
    1.80 +            minFeatureCount = min([self.featureCountDict[f] for f,_w in features])
    1.81 +            triggerFeatures = [f for f,_w in features if self.featureCountDict[f] == minFeatureCount]
    1.82 +            self.triggerFeaturesDict[nameId] = triggerFeatures
    1.83 +            for f in triggerFeatures:
    1.84 +                if self.featureTriggeredFormulasDict.has_key(f): 
    1.85 +                    self.featureTriggeredFormulasDict[f].append(nameId)
    1.86 +                else:
    1.87 +                    self.featureTriggeredFormulasDict[f] = [nameId]        
    1.88          self.dependenciesDict[nameId] = [self.nameIdDict[d.strip()] for d in line[2].split()]        
    1.89          self.changed = True
    1.90          return nameId
    1.91 @@ -197,12 +212,14 @@
    1.92          if self.changed:
    1.93              dictsStream = open(fileName, 'wb')
    1.94              dump((self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
    1.95 -                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures),dictsStream)
    1.96 +                self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
    1.97 +                self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine),dictsStream)
    1.98              self.changed = False
    1.99              dictsStream.close()
   1.100      def load(self,fileName):
   1.101          dictsStream = open(fileName, 'rb')
   1.102          self.accessibleDict,self.dependenciesDict,self.expandedAccessibles,self.featureDict,\
   1.103 -              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,self.featureCountDict,self.triggerFeatures = load(dictsStream)
   1.104 +              self.featureIdDict,self.idNameDict,self.maxFeatureId,self.maxNameId,self.nameIdDict,\
   1.105 +              self.featureCountDict,self.triggerFeaturesDict,self.featureTriggeredFormulasDict,self.useSine = load(dictsStream)
   1.106          self.changed = False
   1.107          dictsStream.close()
     2.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Thu Jan 17 17:55:02 2013 +0100
     2.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/mash.py	Thu Jan 17 17:55:03 2013 +0100
     2.3 @@ -51,46 +51,62 @@
     2.4  parser.add_argument('--learnTheories',default=False,action='store_true',help="Uses a two-lvl prediction mode. First the theories, then the premises. Default=False.")
     2.5  # Theory Parameters
     2.6  parser.add_argument('--theoryDefValPos',default=-7.5,help="Default value for positive unknown features. Default=-7.5.",type=float)
     2.7 -parser.add_argument('--theoryDefValNeg',default=-15.0,help="Default value for negative unknown features. Default=-15.0.",type=float)
     2.8 -parser.add_argument('--theoryPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
     2.9 +parser.add_argument('--theoryDefValNeg',default=-10.0,help="Default value for negative unknown features. Default=-15.0.",type=float)
    2.10 +parser.add_argument('--theoryPosWeight',default=2.0,help="Weight value for positive features. Default=2.0.",type=float)
    2.11  
    2.12  parser.add_argument('--nb',default=False,action='store_true',help="Use Naive Bayes for learning. This is the default learning method.")
    2.13  # NB Parameters
    2.14  parser.add_argument('--NBDefaultPriorWeight',default=20.0,help="Initializes classifiers with value * p |- p. Default=20.0.",type=float)
    2.15  parser.add_argument('--NBDefVal',default=-15.0,help="Default value for unknown features. Default=-15.0.",type=float)
    2.16  parser.add_argument('--NBPosWeight',default=10.0,help="Weight value for positive features. Default=10.0.",type=float)
    2.17 -parser.add_argument('--NBSinePrior',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
    2.18 -parser.add_argument('--NBSineWeight',default=100.0,help="How much the SInE prior is weighted. Default=100.0.",type=float)
    2.19 +# TODO: Rename to sineFeatures
    2.20 +parser.add_argument('--sineFeatures',default=False,action='store_true',help="Uses a SInE like prior for premise lvl predictions. Default=False.")
    2.21 +parser.add_argument('--sineWeight',default=0.5,help="How much the SInE prior is weighted. Default=0.5.",type=float)
    2.22  
    2.23  parser.add_argument('--snow',default=False,action='store_true',help="Use SNoW's naive bayes instead of Naive Bayes for learning.")
    2.24 -parser.add_argument('--predef',default=False,action='store_true',\
    2.25 -                    help="Use predefined predictions. Used only for comparison with the actual learning. Expects mash_mepo_suggestions in inputDir.")
    2.26 +parser.add_argument('--predef',help="Use predefined predictions. Used only for comparison with the actual learning. Argument is the filename of the predictions.")
    2.27  parser.add_argument('--statistics',default=False,action='store_true',help="Create and show statistics for the top CUTOFF predictions.\
    2.28                      WARNING: This will make the program a lot slower! Default=False.")
    2.29  parser.add_argument('--saveStats',default=None,help="If defined, stores the statistics in the filename provided.")
    2.30  parser.add_argument('--cutOff',default=500,help="Option for statistics. Only consider the first cutOff predictions. Default=500.",type=int)
    2.31  parser.add_argument('-l','--log', default='../tmp/%s.log' % datetime.datetime.now(), help='Log file name. Default=../tmp/dateTime.log')
    2.32  parser.add_argument('-q','--quiet',default=False,action='store_true',help="If enabled, only print warnings. Default=False.")
    2.33 +parser.add_argument('--modelFile', default='../tmp/model.pickle', help='Model file name. Default=../tmp/model.pickle')
    2.34 +parser.add_argument('--dictsFile', default='../tmp/dict.pickle', help='Dict file name. Default=../tmp/dict.pickle')
    2.35 +parser.add_argument('--theoryFile', default='../tmp/theory.pickle', help='Model file name. Default=../tmp/theory.pickle')
    2.36  
    2.37 -def main(argv = sys.argv[1:]):
    2.38 +def mash(argv = sys.argv[1:]):
    2.39      # Initializing command-line arguments
    2.40      args = parser.parse_args(argv)
    2.41 -
    2.42 +    
    2.43      # Set up logging
    2.44      logging.basicConfig(level=logging.DEBUG,
    2.45                          format='%(asctime)s %(name)-12s %(levelname)-8s %(message)s',
    2.46                          datefmt='%d-%m %H:%M:%S',
    2.47                          filename=args.log,
    2.48 -                        filemode='w')
    2.49 -    console = logging.StreamHandler(sys.stdout)
    2.50 -    console.setLevel(logging.INFO)
    2.51 -    formatter = logging.Formatter('# %(message)s')
    2.52 -    console.setFormatter(formatter)
    2.53 -    logging.getLogger('').addHandler(console)
    2.54 +                        filemode='w')    
    2.55      logger = logging.getLogger('main.py')
    2.56 +    
    2.57 +    #"""
    2.58 +    # remove old handler for tester
    2.59 +    #logger.root.handlers[0].stream.close()
    2.60 +    logger.root.removeHandler(logger.root.handlers[0])
    2.61 +    file_handler = logging.FileHandler(args.log)
    2.62 +    file_handler.setLevel(logging.DEBUG)
    2.63 +    formatter = logging.Formatter('%(asctime)s %(name)-12s %(levelname)-8s %(message)s')
    2.64 +    file_handler.setFormatter(formatter)
    2.65 +    logger.root.addHandler(file_handler)
    2.66 +    #"""
    2.67      if args.quiet:
    2.68          logger.setLevel(logging.WARNING)
    2.69 -        console.setLevel(logging.WARNING)
    2.70 +        #console.setLevel(logging.WARNING)
    2.71 +    else:
    2.72 +        console = logging.StreamHandler(sys.stdout)
    2.73 +        console.setLevel(logging.INFO)
    2.74 +        formatter = logging.Formatter('# %(message)s')
    2.75 +        console.setFormatter(formatter)
    2.76 +        logging.getLogger('').addHandler(console)
    2.77 +        
    2.78      if not os.path.exists(args.outputDir):
    2.79          os.makedirs(args.outputDir)
    2.80  
    2.81 @@ -98,24 +114,16 @@
    2.82      # Pick algorithm
    2.83      if args.nb:
    2.84          logger.info('Using sparse Naive Bayes for learning.')
    2.85 -        model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight)
    2.86 -        modelFile = os.path.join(args.outputDir,'NB.pickle')
    2.87 +        model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal)
    2.88      elif args.snow:
    2.89          logger.info('Using naive bayes (SNoW) for learning.')
    2.90          model = SNoW()
    2.91 -        modelFile = os.path.join(args.outputDir,'SNoW.pickle')
    2.92      elif args.predef:
    2.93          logger.info('Using predefined predictions.')
    2.94 -        #predictionFile = os.path.join(args.inputDir,'mash_meng_paulson_suggestions') 
    2.95 -        predictionFile = os.path.join(args.inputDir,'mash_mepo_suggestions')
    2.96 -        model = Predefined(predictionFile)
    2.97 -        modelFile = os.path.join(args.outputDir,'mepo.pickle')        
    2.98 +        model = Predefined(args.predef)
    2.99      else:
   2.100          logger.info('No algorithm specified. Using sparse Naive Bayes.')
   2.101 -        model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal,args.NBSinePrior,args.NBSineWeight)
   2.102 -        modelFile = os.path.join(args.outputDir,'NB.pickle')
   2.103 -    dictsFile = os.path.join(args.outputDir,'dicts.pickle')
   2.104 -    theoryFile = os.path.join(args.outputDir,'theory.pickle')
   2.105 +        model = sparseNBClassifier(args.NBDefaultPriorWeight,args.NBPosWeight,args.NBDefVal)
   2.106  
   2.107      # Initializing model
   2.108      if args.init:
   2.109 @@ -124,7 +132,7 @@
   2.110  
   2.111          # Load all data
   2.112          dicts = Dictionaries()
   2.113 -        dicts.init_all(args.inputDir,depFileName=args.depFile)
   2.114 +        dicts.init_all(args)
   2.115          
   2.116          # Create Model
   2.117          trainData = dicts.featureDict.keys()
   2.118 @@ -134,10 +142,10 @@
   2.119              depFile = os.path.join(args.inputDir,args.depFile)
   2.120              theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight)
   2.121              theoryModels.init(depFile,dicts)
   2.122 -            theoryModels.save(theoryFile)
   2.123 +            theoryModels.save(args.theoryFile)
   2.124              
   2.125 -        model.save(modelFile)
   2.126 -        dicts.save(dictsFile)
   2.127 +        model.save(args.modelFile)
   2.128 +        dicts.save(args.dictsFile)
   2.129  
   2.130          logger.info('All Done. %s seconds needed.',round(time()-startTime,2))
   2.131          return 0
   2.132 @@ -149,12 +157,22 @@
   2.133          dicts = Dictionaries()
   2.134          theoryModels = TheoryModels(args.theoryDefValPos,args.theoryDefValNeg,args.theoryPosWeight)
   2.135          # Load Files
   2.136 -        if os.path.isfile(dictsFile):
   2.137 -            dicts.load(dictsFile)
   2.138 -        if os.path.isfile(modelFile):
   2.139 -            model.load(modelFile)
   2.140 -        if os.path.isfile(theoryFile) and args.learnTheories:
   2.141 -            theoryModels.load(theoryFile)
   2.142 +        if os.path.isfile(args.dictsFile):
   2.143 +            #logger.info('Loading Dictionaries')
   2.144 +            #startTime = time()
   2.145 +            dicts.load(args.dictsFile)            
   2.146 +            #logger.info('Done %s',time()-startTime)
   2.147 +        if os.path.isfile(args.modelFile):
   2.148 +            #logger.info('Loading Model')
   2.149 +            #startTime = time()
   2.150 +            model.load(args.modelFile)            
   2.151 +            #logger.info('Done %s',time()-startTime)
   2.152 +        if os.path.isfile(args.theoryFile) and args.learnTheories:
   2.153 +            #logger.info('Loading Theory Models')
   2.154 +            #startTime = time()
   2.155 +            theoryModels.load(args.theoryFile)
   2.156 +            #logger.info('Done %s',time()-startTime)
   2.157 +        logger.info('All loading completed')
   2.158  
   2.159          # IO Streams
   2.160          OS = open(args.predictions,'w')
   2.161 @@ -173,7 +191,7 @@
   2.162  #           try:
   2.163              if True:
   2.164                  if line.startswith('!'):
   2.165 -                    problemId = dicts.parse_fact(line)                        
   2.166 +                    problemId = dicts.parse_fact(line)    
   2.167                      # Statistics
   2.168                      if args.statistics and computeStats:
   2.169                          computeStats = False
   2.170 @@ -183,7 +201,7 @@
   2.171                          if args.learnTheories:
   2.172                              tmp = [dicts.idNameDict[x] for x in dicts.dependenciesDict[problemId]]
   2.173                              usedTheories = set([x.split('.')[0] for x in tmp]) 
   2.174 -                            theoryStats.update((dicts.idNameDict[problemId]).split('.')[0],predictedTheories,usedTheories)                        
   2.175 +                            theoryStats.update((dicts.idNameDict[problemId]).split('.')[0],predictedTheories,usedTheories,len(theoryModels.accessibleTheories))                        
   2.176                          stats.update(predictions,dicts.dependenciesDict[problemId],statementCounter)
   2.177                          if not stats.badPreds == []:
   2.178                              bp = string.join([str(dicts.idNameDict[x]) for x in stats.badPreds], ',')
   2.179 @@ -211,7 +229,8 @@
   2.180                      computeStats = True
   2.181                      if args.predef:
   2.182                          continue
   2.183 -                    name,features,accessibles,hints = dicts.parse_problem(line)    
   2.184 +                    name,features,accessibles,hints = dicts.parse_problem(line)  
   2.185 +                        
   2.186                      # Create predictions
   2.187                      logger.info('Starting computation for problem on line %s',lineCounter)
   2.188                      # Update Models with hints
   2.189 @@ -223,11 +242,29 @@
   2.190                              pass
   2.191                          else:
   2.192                              model.update('hints',features,hints)
   2.193 -                    
   2.194 +
   2.195                      # Predict premises
   2.196                      if args.learnTheories:
   2.197                          predictedTheories,accessibles = theoryModels.predict(features,accessibles,dicts)
   2.198 -                    predictions,predictionValues = model.predict(features,accessibles,dicts)
   2.199 +
   2.200 +                    # Add additional features on premise lvl if sine is enabled
   2.201 +                    if args.sineFeatures:
   2.202 +                        origFeatures = [f for f,_w in features]
   2.203 +                        secondaryFeatures = []
   2.204 +                        for f in origFeatures:
   2.205 +                            if dicts.featureCountDict[f] == 1:
   2.206 +                                continue
   2.207 +                            triggeredFormulas = dicts.featureTriggeredFormulasDict[f]                                
   2.208 +                            for formula in triggeredFormulas: 
   2.209 +                                tFeatures = dicts.triggerFeaturesDict[formula]                                
   2.210 +                                #tFeatures = [ff for ff,_fw in dicts.featureDict[formula]]
   2.211 +                                newFeatures = set(tFeatures).difference(secondaryFeatures+origFeatures)
   2.212 +                            for fNew in newFeatures:
   2.213 +                                secondaryFeatures.append((fNew,args.sineWeight))
   2.214 +                        predictionsFeatures = features+secondaryFeatures
   2.215 +                    else:
   2.216 +                        predictionsFeatures = features                    
   2.217 +                    predictions,predictionValues = model.predict(predictionsFeatures,accessibles,dicts)
   2.218                      assert len(predictions) == len(predictionValues)
   2.219                      
   2.220                      # Delete hints
   2.221 @@ -268,10 +305,10 @@
   2.222  
   2.223          # Save
   2.224          if args.saveModel:
   2.225 -            model.save(modelFile)
   2.226 +            model.save(args.modelFile)
   2.227              if args.learnTheories:
   2.228 -                theoryModels.save(theoryFile)
   2.229 -        dicts.save(dictsFile)
   2.230 +                theoryModels.save(args.theoryFile)
   2.231 +        dicts.save(args.dictsFile)
   2.232          if not args.saveStats == None:
   2.233              if args.learnTheories:
   2.234                  theoryStatsFile = os.path.join(args.outputDir,'theoryStats')
   2.235 @@ -282,25 +319,37 @@
   2.236  
   2.237  if __name__ == '__main__':
   2.238      # Example:
   2.239 +    #List
   2.240 +    # ISAR Theories
   2.241 +    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20130110/List/','--learnTheories']    
   2.242 +    #args = ['-i', '../data/20130110/List/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
   2.243 +    # ISAR predef mesh 
   2.244 +    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20130110/List/','--predef','../data/20130110/List/mesh_suggestions']
   2.245 +    #args = ['-i', '../data/20130110/List/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','../data/20130110/List/mesh_suggestions','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
   2.246 + 
   2.247 +    
   2.248      # Auth
   2.249      # ISAR Theories
   2.250 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--learnTheories']    
   2.251 +    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--learnTheories','--sineFeatures']    
   2.252      #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
   2.253 -    # ISAR MePo 
   2.254 -    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--predef']
   2.255 -    #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
   2.256 +    # ISAR predef mesh 
   2.257 +    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Auth/','--predef','../data/20121227b/Auth/mesh_suggestions']
   2.258 +    #args = ['-i', '../data/20121227b/Auth/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','../data/20121227b/Auth/mesh_suggestions','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
   2.259  
   2.260      
   2.261      # Jinja
   2.262      # ISAR Theories
   2.263 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121227b/Jinja/','--learnTheories']    
   2.264 -    #args = ['-i', '../data/20121227b/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
   2.265 +    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20130111/Jinja/','--learnTheories']    
   2.266 +    #args = ['-i', '../data/20130111/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--cutOff','500','--learnTheories']
   2.267 +    # ISAR Theories SinePrior
   2.268 +    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20130111/Jinja/','--learnTheories','--sineFeatures']    
   2.269 +    #args = ['-i', '../data/20130111/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories','--sineFeatures']    
   2.270      # ISAR NB
   2.271 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121221/Jinja/']    
   2.272 -    #args = ['-i', '../data/20121221/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500']
   2.273 -    # ISAR MePo
   2.274 -    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121212/Jinja/','--predef']
   2.275 -    #args = ['-i', '../data/20121212/Jinja/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
   2.276 +    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20130111/Jinja/']    
   2.277 +    #args = ['-i', '../data/20130111/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500']
   2.278 +    # ISAR predef mesh
   2.279 +    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20130111/Jinja/','--predef','../data/20130111/Jinja/mesh_suggestions']
   2.280 +    #args = ['-i', '../data/20130111/Jinja/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','../data/20130111/Jinja/mesh_suggestions','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
   2.281      # ISAR NB ATP
   2.282      #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121212/Jinja/','--depFile','mash_atp_dependencies']    
   2.283      #args = ['-i', '../data/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--depFile','mash_atp_dependencies']
   2.284 @@ -313,28 +362,5 @@
   2.285      #args = ['-i', '../data/20121212/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--snow','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500']
   2.286   
   2.287  
   2.288 -
   2.289 -    # Probability
   2.290 -    # ISAR Theories
   2.291 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/','--learnTheories']    
   2.292 -    #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--learnTheories']
   2.293 -    # ISAR NB
   2.294 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/']    
   2.295 -    #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/ProbIsarNB.stats','--cutOff','500']
   2.296 -    # ISAR MePo
   2.297 -    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121213/Probability/','--predef']
   2.298 -    #args = ['-i', '../data/20121213/Probability/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats']
   2.299 -    # ISAR NB ATP
   2.300 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/20121212/Jinja/','--depFile','mash_atp_dependencies']    
   2.301 -    #args = ['-i', '../data/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--nb','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--depFile','mash_atp_dependencies']
   2.302 -    #args = ['-l','testIsabelle.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Jinja/','--predef','--depFile','mash_atp_dependencies']
   2.303 -    #args = ['-i', '../data/Jinja/mash_commands','-p','../tmp/JinjaMePo.pred','-l','testIsabelle.log','--predef','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaMePo.stats','--depFile','mash_atp_dependencies']
   2.304 -    #args = ['-l','testNB.log','-o','../tmp/','--statistics','--init','--inputDir','../data/Jinja/','--depFile','mash_atp_dependencies','--snow']    
   2.305 -    #args = ['-i', '../data/Jinja/mash_commands','-p','../tmp/testNB.pred','-l','../tmp/testNB.log','--snow','-o','../tmp/','--statistics','--saveStats','../tmp/JinjaIsarNB.stats','--cutOff','500','--depFile','mash_atp_dependencies']
   2.306 -
   2.307 -
   2.308 -    
   2.309 -    #startTime = time()
   2.310 -    #sys.exit(main(args))
   2.311 -    #print 'New ' + str(round(time()-startTime,2))
   2.312 -    sys.exit(main())
   2.313 +    #sys.exit(mash(args))
   2.314 +    sys.exit(mash())
     3.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Thu Jan 17 17:55:02 2013 +0100
     3.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/readData.py	Thu Jan 17 17:55:03 2013 +0100
     3.3 @@ -14,7 +14,8 @@
     3.4  
     3.5  import sys,logging
     3.6  
     3.7 -def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,triggerFeatures,inputFile):
     3.8 +def create_feature_dict(nameIdDict,idNameDict,maxNameId,featureIdDict,maxFeatureId,featureCountDict,\
     3.9 +                        triggerFeaturesDict,featureTriggeredFormulasDict,sineFeatures,inputFile):
    3.10      logger = logging.getLogger('create_feature_dict')
    3.11      featureDict = {}
    3.12      IS = open(inputFile,'r')
    3.13 @@ -33,7 +34,7 @@
    3.14          # Feature Ids
    3.15          featureNames = [f.strip() for f in line[1].split()]
    3.16          features = []     
    3.17 -        minFeatureCount = 0   
    3.18 +        minFeatureCount = 9999999   
    3.19          for fn in featureNames:
    3.20              weight = 1.0
    3.21              tmp = fn.split('=')
    3.22 @@ -46,13 +47,21 @@
    3.23                  maxFeatureId += 1
    3.24              fId = featureIdDict[fn]
    3.25              features.append((fId,weight))
    3.26 -            featureCountDict[fId] += 1
    3.27 -            minFeatureCount = min(triggerFeatures,featureCountDict[fId])
    3.28 +            if sineFeatures:
    3.29 +                featureCountDict[fId] += 1
    3.30 +                minFeatureCount = min(minFeatureCount,featureCountDict[fId])
    3.31          # Store results
    3.32          featureDict[nameId] = features
    3.33 -        triggerFeatures[nameId] = [f for f,_w in features if featureCountDict[f] == minFeatureCount]
    3.34 +        if sineFeatures:
    3.35 +            triggerFeatures = [f for f,_w in features if featureCountDict[f] == minFeatureCount]
    3.36 +            triggerFeaturesDict[nameId] = triggerFeatures
    3.37 +            for f in triggerFeatures:
    3.38 +                if featureTriggeredFormulasDict.has_key(f): 
    3.39 +                    featureTriggeredFormulasDict[f].append(nameId)
    3.40 +                else:
    3.41 +                    featureTriggeredFormulasDict[f] = [nameId]
    3.42      IS.close()
    3.43 -    return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeatures
    3.44 +    return featureDict,maxNameId,maxFeatureId,featureCountDict,triggerFeaturesDict,featureTriggeredFormulasDict
    3.45  
    3.46  def create_dependencies_dict(nameIdDict,inputFile):
    3.47      logger = logging.getLogger('create_dependencies_dict')
     4.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Thu Jan 17 17:55:02 2013 +0100
     4.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/singleNaiveBayes.py	Thu Jan 17 17:55:03 2013 +0100
     4.3 @@ -44,9 +44,12 @@
     4.4          
     4.5          for f,_w in features:
     4.6              if not self.counts.has_key(f):
     4.7 -                fPosCount = 0.0
     4.8 -                fNegCount = 0.0
     4.9 -                self.counts[f] = [fPosCount,fNegCount]
    4.10 +                if label:
    4.11 +                    fPosCount = 0.0
    4.12 +                    fNegCount = 0.0
    4.13 +                    self.counts[f] = [fPosCount,fNegCount]
    4.14 +                else:
    4.15 +                    continue
    4.16              posCount,negCount = self.counts[f]
    4.17              if label:
    4.18                  posCount += 1
    4.19 @@ -89,8 +92,9 @@
    4.20          elif self.pos ==0:
    4.21              return 0
    4.22          logneg = log(self.neg)
    4.23 +        lognegprior=log(float(self.neg)/5)
    4.24          logpos = log(self.pos)
    4.25 -        prob = logpos - logneg
    4.26 +        prob = logpos - lognegprior
    4.27          
    4.28          for f,_w in features:
    4.29              if self.counts.has_key(f):
     5.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Thu Jan 17 17:55:02 2013 +0100
     5.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/sparseNaiveBayes.py	Thu Jan 17 17:55:03 2013 +0100
     5.3 @@ -19,13 +19,11 @@
     5.4      An updateable naive Bayes classifier.
     5.5      '''
     5.6  
     5.7 -    def __init__(self,defaultPriorWeight = 20.0,posWeight = 20.0,defVal = -15.0,useSinePrior = False,sineWeight = 100.0):
     5.8 +    def __init__(self,defaultPriorWeight = 20.0,posWeight = 20.0,defVal = -15.0):
     5.9          '''
    5.10          Constructor
    5.11          '''
    5.12          self.counts = {}
    5.13 -        self.sinePrior = useSinePrior
    5.14 -        self.sineWeight = sineWeight
    5.15          self.defaultPriorWeight = defaultPriorWeight
    5.16          self.posWeight = posWeight
    5.17          self.defVal = defVal
    5.18 @@ -100,19 +98,11 @@
    5.19          Returns a ranking of the accessibles.
    5.20          """
    5.21          predictions = []
    5.22 -        fSet = set([f for f,_w in features])
    5.23          for a in accessibles:
    5.24              posA = self.counts[a][0]
    5.25              fA = set(self.counts[a][1].keys())
    5.26              fWeightsA = self.counts[a][1]
    5.27 -            prior = posA
    5.28 -            if self.sinePrior:
    5.29 -                triggerFeatures = dicts.triggerFeatures[a]
    5.30 -                triggeredFeatures = fSet.intersection(triggerFeatures)
    5.31 -                for f in triggeredFeatures:
    5.32 -                    posW = dicts.featureCountDict[f]
    5.33 -                    prior += self.sineWeight /  posW 
    5.34 -            resultA = log(prior)
    5.35 +            resultA = log(posA)
    5.36              for f,w in features:
    5.37                  # DEBUG
    5.38                  #w = 1
    5.39 @@ -131,12 +121,12 @@
    5.40  
    5.41      def save(self,fileName):
    5.42          OStream = open(fileName, 'wb')
    5.43 -        dump((self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight),OStream)
    5.44 +        dump((self.counts,self.defaultPriorWeight,self.posWeight,self.defVal),OStream)
    5.45          OStream.close()
    5.46  
    5.47      def load(self,fileName):
    5.48          OStream = open(fileName, 'rb')
    5.49 -        self.counts,self.defaultPriorWeight,self.posWeight,self.defVal,self.sinePrior,self.sineWeight = load(OStream)
    5.50 +        self.counts,self.defaultPriorWeight,self.posWeight,self.defVal = load(OStream)
    5.51          OStream.close()
    5.52  
    5.53  
     6.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py	Thu Jan 17 17:55:02 2013 +0100
     6.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/stats.py	Thu Jan 17 17:55:03 2013 +0100
     6.3 @@ -120,7 +120,8 @@
     6.4                           round(100*self.avgAUC/self.problems,2),round(self.avgDepNr/self.problems,2),round(self.avgRecall100/self.problems,2),self.cutOff)
     6.5  
     6.6          #try:
     6.7 -        if True:
     6.8 +        #if True:
     6.9 +        if False:
    6.10              from matplotlib.pyplot import plot,figure,show,xlabel,ylabel,axis,hist
    6.11              avgRecall = [float(x)/self.problems for x in self.recallData]
    6.12              figure('Recall')
     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/tester.py	Thu Jan 17 17:55:03 2013 +0100
     7.3 @@ -0,0 +1,182 @@
     7.4 +'''
     7.5 +Created on Jan 11, 2013
     7.6 +
     7.7 +Searches for the best parameters.
     7.8 +
     7.9 +@author: Daniel Kuehlwein
    7.10 +'''
    7.11 +
    7.12 +import logging,sys,os
    7.13 +from multiprocessing import Process,Queue,current_process,cpu_count
    7.14 +from mash import mash
    7.15 +
    7.16 +def worker(inQueue, outQueue):
    7.17 +    for func, args in iter(inQueue.get, 'STOP'):        
    7.18 +        result = func(*args)
    7.19 +        #print '%s says that %s%s = %s' % (current_process().name, func.__name__, args, result)
    7.20 +        outQueue.put(result)
    7.21 +
    7.22 +def run_mash(runId,inputDir,logFile,predictionFile,\
    7.23 +             learnTheories,theoryDefValPos,theoryDefValNeg,theoryPosWeight,\
    7.24 +             NBDefaultPriorWeight,NBDefVal,NBPosWeight,\
    7.25 +             sineFeatures,sineWeight):
    7.26 +    # Init
    7.27 +    runId = str(runId)
    7.28 +    predictionFile = predictionFile + runId
    7.29 +    args = ['--statistics','--init','--inputDir',inputDir,'-q','--log',logFile,'--theoryFile','../tmp/t'+runId,'--modelFile','../tmp/m'+runId,'--dictsFile','../tmp/d'+runId,
    7.30 +            '--theoryDefValPos',str(theoryDefValPos),'--theoryDefValNeg',str(theoryDefValNeg),'--theoryPosWeight',str(theoryPosWeight),\
    7.31 +            '--NBDefaultPriorWeight',str(NBDefaultPriorWeight),'--NBDefVal',str(NBDefVal),'--NBPosWeight',str(NBPosWeight)]    
    7.32 +    if learnTheories:
    7.33 +        args = args + ['--learnTheories']    
    7.34 +    if sineFeatures:
    7.35 +        args += ['--sineFeatures','--sineWeight',str(sineWeight)]
    7.36 +    mash(args)
    7.37 +    # Run
    7.38 +    args = ['-q','-i',inputFile,'-p',predictionFile,'--statistics','--cutOff','500','--log',logFile,'--theoryFile','../tmp/t'+runId,'--modelFile','../tmp/m'+runId,'--dictsFile','../tmp/d'+runId,\
    7.39 +            '--theoryDefValPos',str(theoryDefValPos),'--theoryDefValNeg',str(theoryDefValNeg),'--theoryPosWeight',str(theoryPosWeight),\
    7.40 +            '--NBDefaultPriorWeight',str(NBDefaultPriorWeight),'--NBDefVal',str(NBDefVal),'--NBPosWeight',str(NBPosWeight)]
    7.41 +    if learnTheories:
    7.42 +        args = args + ['--learnTheories']    
    7.43 +    if sineFeatures:
    7.44 +        args += ['--sineFeatures','--sineWeight',str(sineWeight)]
    7.45 +    mash(args)
    7.46 +
    7.47 +    # Get Results
    7.48 +    IS = open(logFile,'r')
    7.49 +    lines =  IS.readlines()
    7.50 +    tmpRes = lines[-1].split()
    7.51 +    avgAuc = tmpRes[5]
    7.52 +    avgRecall100 = tmpRes[9]
    7.53 +    tmpTheoryRes = lines[-3].split()
    7.54 +    avgTheoryPrecision = tmpTheoryRes[5] 
    7.55 +    avgTheoryRecall100 = tmpTheoryRes[7]
    7.56 +    avgTheoryRecall = tmpTheoryRes[9]
    7.57 +    avgTheoryPredictedPercent = tmpTheoryRes[11]
    7.58 +    IS.close()
    7.59 +    
    7.60 +    # Delete old models
    7.61 +    os.remove(logFile)
    7.62 +    os.remove(predictionFile)
    7.63 +    os.remove('../tmp/t'+runId)
    7.64 +    os.remove('../tmp/m'+runId)
    7.65 +    os.remove('../tmp/d'+runId)
    7.66 +    
    7.67 +    outFile = open('tester','a')
    7.68 +    #print 'avgAuc %s avgRecall100 %s avgTheoryPrecision %s avgTheoryRecall100 %s avgTheoryRecall %s avgTheoryPredictedPercent %s'
    7.69 +    outFile.write('\t'.join([str(learnTheories),str(theoryDefValPos),str(theoryDefValNeg),str(theoryPosWeight),str(NBDefaultPriorWeight),str(NBDefVal),str(NBPosWeight),str(sineFeatures),str(sineWeight),str(avgAuc),str(avgRecall100),str(avgTheoryPrecision),str(avgTheoryRecall100),str(avgTheoryRecall),str(avgTheoryPredictedPercent)])+'\n')
    7.70 +    outFile.close()
    7.71 +    print learnTheories,'\t',theoryDefValPos,'\t',theoryDefValNeg,'\t',theoryPosWeight,'\t',\
    7.72 +        NBDefaultPriorWeight,'\t',NBDefVal,'\t',NBPosWeight,'\t',\
    7.73 +        sineFeatures,'\t',sineWeight,'\t',\
    7.74 +        avgAuc,'\t',avgRecall100,'\t',avgTheoryPrecision,'\t',avgTheoryRecall100,'\t',avgTheoryRecall,'\t',avgTheoryPredictedPercent    
    7.75 +    return learnTheories,theoryDefValPos,theoryDefValNeg,theoryPosWeight,\
    7.76 +             NBDefaultPriorWeight,NBDefVal,NBPosWeight,\
    7.77 +             sineFeatures,sineWeight,\
    7.78 +             avgAuc,avgRecall100,avgTheoryPrecision,avgTheoryRecall100,avgTheoryRecall,avgTheoryPredictedPercent 
    7.79 +
    7.80 +def update_best_params(avgRecall100,bestAvgRecall100,\
    7.81 +                       bestNBDefaultPriorWeight,bestNBDefVal,bestNBPosWeight,bestSineFeatures,bestSineWeight,\
    7.82 +                       bestlearnTheories,besttheoryDefValPos,besttheoryDefValNeg,besttheoryPosWeight,\
    7.83 +                       NBDefaultPriorWeight,NBDefVal,NBPosWeight,sineFeatures,sineWeight,\
    7.84 +                       learnTheories,theoryDefValPos,theoryDefValNeg,theoryPosWeight):
    7.85 +                        if avgRecall100 > bestAvgRecall100:
    7.86 +                            bestAvgRecall100 = avgRecall100
    7.87 +                            bestNBDefaultPriorWeight = NBDefaultPriorWeight
    7.88 +                            bestNBDefVal = NBDefVal
    7.89 +                            bestNBPosWeight = NBPosWeight
    7.90 +                            bestSineFeatures = sineFeatures
    7.91 +                            bestSineWeight = sineWeight  
    7.92 +                        return bestlearnTheories,besttheoryDefValPos,besttheoryDefValNeg,besttheoryPosWeight,bestNBDefaultPriorWeight,bestNBDefVal,bestNBPosWeight,bestSineFeatures,bestSineWeight
    7.93 +
    7.94 +if __name__ == '__main__':
    7.95 +    cores = cpu_count()
    7.96 +    #cores = 1
    7.97 +    # Options
    7.98 +    depFile = 'mash_dependencies'
    7.99 +    outputDir = '../tmp/'
   7.100 +    numberOfPredictions = 500
   7.101 +    
   7.102 +    learnTheoriesRange = [True,False]
   7.103 +    theoryDefValPosRange = [-x for x in range(1,20)]
   7.104 +    theoryDefValNegRange = [-x for x in range(1,20)]
   7.105 +    theoryPosWeightRange = [x for x in range(1,10)]
   7.106 +    
   7.107 +    NBDefaultPriorWeightRange = [10*x for x in range(10)]
   7.108 +    NBDefValRange = [-x for x in range(1,20)]
   7.109 +    NBPosWeightRange = [10*x for x in range(1,10)]
   7.110 +    sineFeaturesRange = [True,False]    
   7.111 +    sineWeightRange = [0.1,0.25,0.5,0.75,1.0]
   7.112 +    
   7.113 +    # Test 1
   7.114 +    inputFile = '../data/20121227b/Auth/mash_commands'
   7.115 +    inputDir = '../data/20121227b/Auth/'
   7.116 +    predictionFile = '../tmp/auth.pred'
   7.117 +    logFile = '../tmp/auth.log'
   7.118 +    learnTheories = True
   7.119 +    theoryDefValPos = -7.5
   7.120 +    theoryDefValNeg = -15.0
   7.121 +    theoryPosWeight = 10.0
   7.122 +    NBDefaultPriorWeight = 20.0
   7.123 +    NBDefVal =- 15.0
   7.124 +    NBPosWeight = 10.0
   7.125 +    sineFeatures = True
   7.126 +    sineWeight =  0.5
   7.127 +
   7.128 +    task_queue = Queue()
   7.129 +    done_queue = Queue()
   7.130 +
   7.131 +    runs = 0
   7.132 +    for inputDir in ['../data/20121227b/Auth/']:
   7.133 +        problemId = inputDir.split('/')[-2]
   7.134 +        inputFile = os.path.join(inputDir,'mash_commands')
   7.135 +        predictionFile = os.path.join('../tmp/',problemId+'.pred')
   7.136 +        logFile = os.path.join('../tmp/',problemId+'.log')
   7.137 +        learnTheories = True
   7.138 +        theoryDefValPos = -7.5
   7.139 +        theoryDefValNeg = -15.0
   7.140 +        theoryPosWeight = 10.0
   7.141 +        
   7.142 +        bestAvgRecall100 = 0.0
   7.143 +        bestNBDefaultPriorWeight = 1.0
   7.144 +        bestNBDefVal = 1.0
   7.145 +        bestNBPosWeight = 1.0
   7.146 +        bestSineFeatures = False
   7.147 +        bestSineWeight = 0.0
   7.148 +        bestlearnTheories = True
   7.149 +        besttheoryDefValPos = 1.0 
   7.150 +        besttheoryDefValNeg = -15.0
   7.151 +        besttheoryPosWeight = 5.0
   7.152 +        for theoryPosWeight in theoryPosWeightRange:
   7.153 +            for theoryDefValNeg in theoryDefValNegRange:
   7.154 +                for NBDefaultPriorWeight in NBDefaultPriorWeightRange:
   7.155 +                    for NBDefVal in NBDefValRange:
   7.156 +                        for NBPosWeight in NBPosWeightRange:
   7.157 +                            for sineFeatures in sineFeaturesRange:
   7.158 +                                if sineFeatures:
   7.159 +                                    for sineWeight in sineWeightRange:  
   7.160 +                                        localLogFile = logFile+str(runs)                           
   7.161 +                                        task_queue.put((run_mash,(runs,inputDir, localLogFile, predictionFile, learnTheories, theoryDefValPos, theoryDefValNeg, theoryPosWeight, NBDefaultPriorWeight, NBDefVal, NBPosWeight, sineFeatures, sineWeight)))
   7.162 +                                        runs += 1
   7.163 +                                else:
   7.164 +                                    localLogFile = logFile+str(runs)
   7.165 +                                    task_queue.put((run_mash,(runs,inputDir, localLogFile, predictionFile, learnTheories, theoryDefValPos, theoryDefValNeg, theoryPosWeight, NBDefaultPriorWeight, NBDefVal, NBPosWeight, sineFeatures, sineWeight)))
   7.166 +                                    runs += 1
   7.167 +        # Start worker processes
   7.168 +        processes = []
   7.169 +        for _i in range(cores):
   7.170 +            process = Process(target=worker, args=(task_queue, done_queue))
   7.171 +            process.start()
   7.172 +            processes.append(process)
   7.173 +    
   7.174 +        for _i in range(runs):      
   7.175 +            learnTheories,theoryDefValPos,theoryDefValNeg,theoryPosWeight,\
   7.176 +             NBDefaultPriorWeight,NBDefVal,NBPosWeight,\
   7.177 +             sineFeatures,sineWeight,\
   7.178 +             avgAuc,avgRecall100,avgTheoryPrecision,avgTheoryRecall100,avgTheoryRecall,avgTheoryPredictedPercent  = done_queue.get()
   7.179 +            bestlearnTheories,besttheoryDefValPos,besttheoryDefValNeg,besttheoryPosWeight,bestNBDefaultPriorWeight,bestNBDefVal,bestNBPosWeight,bestSineFeatures,bestSineWeight = update_best_params(avgRecall100,bestAvgRecall100,\
   7.180 +                       bestNBDefaultPriorWeight,bestNBDefVal,bestNBPosWeight,bestSineFeatures,bestSineWeight,\
   7.181 +                       bestlearnTheories,besttheoryDefValPos,besttheoryDefValNeg,besttheoryPosWeight,\
   7.182 +                       NBDefaultPriorWeight,NBDefVal,NBPosWeight,sineFeatures,sineWeight,\
   7.183 +                       learnTheories,theoryDefValPos,theoryDefValNeg,theoryPosWeight)              
   7.184 +        print 'bestAvgRecall100 %s bestNBDefaultPriorWeight %s bestNBDefVal %s bestNBPosWeight %s bestSineFeatures %s bestSineWeight %s',bestAvgRecall100,bestNBDefaultPriorWeight,bestNBDefVal,bestNBPosWeight,bestSineFeatures,bestSineWeight
   7.185 +    
   7.186 \ No newline at end of file
     8.1 --- a/src/HOL/Tools/Sledgehammer/MaSh/src/theoryStats.py	Thu Jan 17 17:55:02 2013 +0100
     8.2 +++ b/src/HOL/Tools/Sledgehammer/MaSh/src/theoryStats.py	Thu Jan 17 17:55:03 2013 +0100
     8.3 @@ -29,28 +29,35 @@
     8.4          self.recall100 = 0
     8.5          self.recall = 0.0
     8.6          self.predicted = 0.0
     8.7 +        self.predictedPercent = 0.0
     8.8      
     8.9 -    def update(self,currentTheory,predictedTheories,usedTheories):
    8.10 +    def update(self,currentTheory,predictedTheories,usedTheories,nrAvailableTheories):
    8.11          self.count += 1
    8.12          allPredTheories = predictedTheories.union([currentTheory])
    8.13          if set(usedTheories).issubset(allPredTheories):
    8.14              self.recall100 += 1
    8.15          localPredicted = len(allPredTheories)
    8.16          self.predicted += localPredicted 
    8.17 +        localPredictedPercent = float(localPredicted)/nrAvailableTheories
    8.18 +        self.predictedPercent += localPredictedPercent 
    8.19          localPrec = float(len(set(usedTheories).intersection(allPredTheories))) / localPredicted
    8.20          self.precision += localPrec
    8.21 -        localRecall = float(len(set(usedTheories).intersection(allPredTheories))) / len(set(usedTheories))
    8.22 +        if len(set(usedTheories)) == 0:
    8.23 +            localRecall = 1.0
    8.24 +        else:
    8.25 +            localRecall = float(len(set(usedTheories).intersection(allPredTheories))) / len(set(usedTheories))
    8.26          self.recall += localRecall
    8.27          self.logger.info('Theory prediction results:')
    8.28 -        self.logger.info('Problem: %s \t Recall100: %s \t Precision: %s \t Recall: %s \t PredictedTeories: %s',\
    8.29 -                         self.count,self.recall100,round(localPrec,2),round(localRecall,2),localPredicted)
    8.30 +        self.logger.info('Problem: %s \t Recall100: %s \t Precision: %s \t Recall: %s \t PredictedTeoriesPercent: %s PredictedTeories: %s',\
    8.31 +                         self.count,self.recall100,round(localPrec,2),round(localRecall,2),round(localPredictedPercent,2),localPredicted)
    8.32          
    8.33      def printAvg(self):
    8.34          self.logger.info('Average theory results:')
    8.35 -        self.logger.info('avgPrecision: %s \t avgRecall100: %s \t avgRecall: %s \t avgPredicted:%s', \
    8.36 +        self.logger.info('avgPrecision: %s \t avgRecall100: %s \t avgRecall: %s \t avgPredictedPercent: %s \t avgPredicted: %s', \
    8.37                           round(self.precision/self.count,2),\
    8.38                           round(float(self.recall100)/self.count,2),\
    8.39                           round(self.recall/self.count,2),\
    8.40 +                         round(self.predictedPercent /self.count,2),\
    8.41                           round(self.predicted /self.count,2))
    8.42          
    8.43      def save(self,fileName):
     9.1 --- a/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML	Thu Jan 17 17:55:02 2013 +0100
     9.2 +++ b/src/HOL/Tools/Sledgehammer/sledgehammer_mash.ML	Thu Jan 17 17:55:03 2013 +0100
     9.3 @@ -150,8 +150,7 @@
     9.4      val core =
     9.5        "--inputFile " ^ cmd_file ^ " --predictions " ^ sugg_file ^
     9.6        " --numberOfPredictions " ^ string_of_int max_suggs ^
     9.7 -      " --NBSinePrior" (* --learnTheories *) ^
     9.8 -      (if save then " --saveModel" else "")
     9.9 +      (* " --learnTheories" ^ *) (if save then " --saveModel" else "")
    9.10      val command =
    9.11        "\"$ISABELLE_SLEDGEHAMMER_MASH/src/mash.py\" --quiet --outputDir " ^
    9.12        File.shell_path (mash_model_dir ()) ^ " --log " ^ log_file ^ " " ^ core ^