diff --git a/.gitignore b/.gitignore index 54f3916..d50b674 100644 --- a/.gitignore +++ b/.gitignore @@ -12,3 +12,7 @@ __pycache__ *.png *.eps *.bak +*.h5 +*.csv +*.root +*.json diff --git a/Analysis/test.pdf b/Analysis/test.pdf deleted file mode 100644 index 34d5151..0000000 Binary files a/Analysis/test.pdf and /dev/null differ diff --git a/Train/Plotting/Jan/compareTwoModels.py b/Train/Plotting/Jan/compareTwoModels.py new file mode 100755 index 0000000..06bb04c --- /dev/null +++ b/Train/Plotting/Jan/compareTwoModels.py @@ -0,0 +1,196 @@ + + +from argparse import ArgumentParser + +parser = ArgumentParser('make a set of ROC curves, comparing two training') +parser.add_argument('inputDirA') +parser.add_argument('inputDirB') +parser.add_argument('outputDir') +args = parser.parse_args() + + +outdir=args.outputDir+'/' + + +from testing import makeROCs_async, makePlots_async, testDescriptor +#from keras.models import load_model +from DataCollection import DataCollection + +import os +os.system('mkdir -p '+outdir) + +trainings=[args.inputDirA, + args.inputDirB] + + +trainings.extend(trainings) + +filesttbar=[] +for t in trainings: + filesttbar.append(t+'/ttbar/tree_association.txt') + +filesqcd=[] +for t in trainings: + filesqcd.append(t+'/qcd_600_800/tree_association.txt') + + +legend=['standard','p_{T} cut'] + +btruth='isB+isBB+isGBB+isLeptonicB+isLeptonicB_C' +ctruth='isC+isCC+isGCC' + + +bprob=[ 'prob_isB+prob_isBB+prob_isLeptB', + 'prob_isB+prob_isBB+prob_isLeptB', + + 'prob_isB+prob_isBB+prob_isLeptB', + 'prob_isB+prob_isBB+prob_isLeptB', + ] + +cprob=[ 'prob_isC', + 'prob_isC', + + 'prob_isC', + 'prob_isC'] + +usdprob=['prob_isUDS', + 'prob_isUDS', + + 'prob_isUDS', + 'prob_isUDS',] + + + + +print('creating ROCs') + +#makeROCs_async(intextfile, +# name_list, +# probabilities_list, +# truths_list, +# vetos_list, +# colors_list, +# outpdffile, +# cuts, +# cmsstyle, +# firstcomment, +# secondcomment, +# extralegend+None, +# logY=True) + + + + +for ptcut in ['30','150']: + + makeROCs_async(intextfile=filesttbar, + name_list=legend, + probabilities_list=bprob, + truths_list=btruth, + vetos_list=len(legend)*['isUD+isS+isG']+len(legend)*['isC'], + colors_list='auto', + outpdffile=outdir+"btag_pt"+ptcut+".pdf", + cuts='jet_pt>'+ptcut, + cmsstyle=True, + firstcomment='t#bar{t} events', + secondcomment='jet p_{T} > '+ptcut+' GeV', + extralegend=None, + logY=True, + individual=True + ) + + makeROCs_async(intextfile=filesttbar, + name_list=legend, + probabilities_list=cprob, + truths_list=ctruth, + vetos_list=len(legend)*['isUD+isS+isG']+len(legend)*[btruth], + colors_list='auto', + outpdffile=outdir+"ctag_pt"+ptcut+".pdf", + cuts='jet_pt>'+ptcut, + cmsstyle=True, + firstcomment='t#bar{t} events', + secondcomment='jet p_{T} > '+ptcut+' GeV', + extralegend=['solid?udsg','dashed?b'], + logY=True, + individual=True) + + makeROCs_async(intextfile=filesttbar, + name_list=legend, + probabilities_list=usdprob, + truths_list='isUD+isS', + vetos_list=len(legend)*['isG']+len(legend)*['isB+isLeptonicB+isLeptonicB_C+isC'], + colors_list='auto', + outpdffile=outdir+"gtag_pt"+ptcut+".pdf", + cuts='jet_pt>'+ptcut, + cmsstyle=True, + firstcomment='t#bar{t} events', + secondcomment='jet p_{T} > '+ptcut+' GeV', + extralegend=['solid?g','dashed?bc'], + logY=True, + individual=True) + + +makeROCs_async(intextfile=filesqcd, + name_list=legend, + probabilities_list=bprob, + truths_list=btruth, + vetos_list=len(legend)*['isUD+isS+isG']+len(legend)*['isC'], + colors_list='auto', + outpdffile=outdir+"btag_qcd_pt400.pdf", + 
cuts='jet_pt>400', + cmsstyle=True, + firstcomment='QCD, 600 < p_{T} < 800 GeV', + secondcomment='jet p_{T} > 400 GeV', + extralegend=None, + logY=True, + individual=True) + +makeROCs_async(intextfile=filesqcd, + name_list=legend, + probabilities_list=cprob, + truths_list=ctruth, + vetos_list=len(legend)*['isUD+isS+isG']+len(legend)*[btruth], + colors_list='auto', + outpdffile=outdir+"ctag_qcd_pt400.pdf", + cuts='jet_pt>400', + cmsstyle=True, + firstcomment='QCD, 600 < p_{T} < 800 GeV', + secondcomment='jet p_{T} > 400 GeV', + extralegend=['solid?udsg','dashed?b'], + logY=True, + individual=True) + +makeROCs_async(intextfile=filesqcd, + name_list=legend, + probabilities_list=usdprob, + truths_list='isUD+isS', + vetos_list=len(legend)*['isG']+len(legend)*['isB+isLeptonicB+isLeptonicB_C+isC'], + colors_list='auto', + outpdffile=outdir+"gtag_qcd_pt400.pdf", + cuts='jet_pt>400', + cmsstyle=True, + firstcomment='QCD, 600 < p_{T} < 800 GeV', + secondcomment='jet p_{T} > 400 GeV', + extralegend=['solid?g','dashed?bc'], + logY=False, + individual=True) + + +#individual plot for top/ttbar + + +makeROCs_async(intextfile=[filesttbar[1]], + name_list=['DeepFlavour'], + probabilities_list='prob_isUDS+prob_isC', + truths_list='isUD+isS+isC', + vetos_list=1*['isG']+1*['isB+isLeptonicB+isLeptonicB_C'], + colors_list='auto', + outpdffile=outdir+"lightQuarkJets_pt30.pdf", + cuts='jet_pt>400', + cmsstyle=True, + firstcomment='t#bar{t} events', + secondcomment='jet p_{T} > 30 GeV', + extralegend=['solid?g','dashed?b'], + logY=False, + individual=True) + diff --git a/Train/QGRegression_recurrent.py b/Train/QGRegression_recurrent.py index a1fb480..a638b31 100644 --- a/Train/QGRegression_recurrent.py +++ b/Train/QGRegression_recurrent.py @@ -75,7 +75,7 @@ ) from DataCollection import DataCollection -from TrainData_PT_recur import TrainData_PT_recur +from TrainData_PT_recur import TrainData_PT_recur, TrainData_recurrent_fullTruth traind = DataCollection(args.inputfile) traind.useweights = config_args['useweights'] @@ -147,6 +147,13 @@ def identity(generator): traind.writeToFile(outputDir+'trainsamples.dc') testd.writeToFile( outputDir+'valsamples.dc') +#make sure tokens don't expire +from tokenTools import checkTokens, renew_token_process +from thread import start_new_thread + +checkTokens() +start_new_thread(renew_token_process,()) + print 'training' try: model.fit_generator( diff --git a/Train/QGRegression_simple.py b/Train/QGRegression_simple.py index e6ae55d..553f69c 100644 --- a/Train/QGRegression_simple.py +++ b/Train/QGRegression_simple.py @@ -104,6 +104,13 @@ traind.writeToFile(outputDir+'trainsamples.dc') testd.writeToFile( outputDir+'valsamples.dc') +#make sure tokens don't expire +from tokenTools import checkTokens, renew_token_process +from thread import start_new_thread + +checkTokens() +start_new_thread(renew_token_process,()) + print 'training' try: model.fit_generator( diff --git a/Train/QG_Class_Regr_image.py b/Train/QG_Class_Regr_image.py index a5ed095..3af1761 100644 --- a/Train/QG_Class_Regr_image.py +++ b/Train/QG_Class_Regr_image.py @@ -48,7 +48,7 @@ # configure the in/out/split etc config_args = { #we might want to move it to an external file 'testrun' : False, - 'nepochs' : 2, + 'nepochs' : 100, 'batchsize' : 2000, 'startlearnrate' : 0.0005, 'useweights' : False, @@ -93,6 +93,13 @@ def identity(generator): for i in generator: yield i +#make sure tokens don't expire +from tokenTools import checkTokens, renew_token_process +from thread import start_new_thread + +checkTokens() 
+start_new_thread(renew_token_process,()) + if args.mode == 'class': model = TrainData_image.classification_model(input_shapes, output_shapes[0]) model.compile( @@ -163,26 +170,26 @@ def identity(generator): plt.clf() #plt.show() -import json -def normalize(inmap): - ret = {} - for i in inmap: - ret[i] = [float(j) for j in inmap[i]] - return ret - -with open(outputDir+'history.json', 'w') as history: - history.write(json.dumps(normalize(callbacks.history.history))) - -plt.plot(*callbacks.timer.points) -plt.title('model loss') -plt.ylabel('loss') -plt.xlabel('time [s]') -plt.savefig(outputDir+'loss_vs_time.pdf') -plt.clf() - -with open(outputDir+'loss_vs_time.json', 'w') as timeloss: - jmap = { - 'elapsed' : callbacks.timer.points[0], - 'loss' : callbacks.timer.points[1] - } - timeloss.write(json.dumps(normalize(jmap))) +## import json +## def normalize(inmap): +## ret = {} +## for i in inmap: +## ret[i] = [float(j) for j in inmap[i]] +## return ret +## +## with open(outputDir+'history.json', 'w') as history: +## history.write(json.dumps(normalize(callbacks.history.history))) +## +## plt.plot(*callbacks.timer.points) +## plt.title('model loss') +## plt.ylabel('loss') +## plt.xlabel('time [s]') +## plt.savefig(outputDir+'loss_vs_time.pdf') +## plt.clf() +## +## with open(outputDir+'loss_vs_time.json', 'w') as timeloss: +## jmap = { +## 'elapsed' : callbacks.timer.points[0], +## 'loss' : callbacks.timer.points[1] +## } +## timeloss.write(json.dumps(normalize(jmap))) diff --git a/Train/deepFlavour_noneutral.py b/Train/deepFlavour_noneutral.py new file mode 100644 index 0000000..4a9d749 --- /dev/null +++ b/Train/deepFlavour_noneutral.py @@ -0,0 +1,56 @@ + + +from training_base import training_base +from Losses import loss_NLL +from modelTools import fixLayersContaining,printLayerInfosAndWeights + +#also does all the parsing +train=training_base(testrun=False) + +newtraining= not train.modelSet() +#for recovering a training +if newtraining: + from models import model_deepFlavourNoNeutralReference + + train.setModel(model_deepFlavourNoNeutralReference,dropoutRate=0.1) + + #train.keras_model=fixLayersContaining(train.keras_model, 'regression', invert=False) + + train.compileModel(learningrate=0.001, + loss=['categorical_crossentropy',loss_NLL], + metrics=['accuracy'], + loss_weights=[1., 0.000000000001]) + + +print(train.keras_model.summary()) +model,history = train.trainModel(nepochs=1, + batchsize=10000, + stop_patience=300, + lr_factor=0.5, + lr_patience=3, + lr_epsilon=0.0001, + lr_cooldown=6, + lr_minimum=0.0001, + maxqsize=100) + + +print('fixing input norms...') +train.keras_model=fixLayersContaining(train.keras_model, 'input_batchnorm') +train.compileModel(learningrate=0.001, + loss=['categorical_crossentropy',loss_NLL], + metrics=['accuracy'], + loss_weights=[1., 0.000000000001]) + + +print(train.keras_model.summary()) +#printLayerInfosAndWeights(train.keras_model) + +model,history = train.trainModel(nepochs=60, + batchsize=10000, + stop_patience=300, + lr_factor=0.5, + lr_patience=3, + lr_epsilon=0.0001, + lr_cooldown=6, + lr_minimum=0.0001, + maxqsize=100) diff --git a/Train/deepFlavour_reference.py b/Train/deepFlavour_reference.py index a79fdf4..b69c684 100644 --- a/Train/deepFlavour_reference.py +++ b/Train/deepFlavour_reference.py @@ -12,7 +12,7 @@ if newtraining: from models import model_deepFlavourReference - train.setModel(model_deepFlavourReference,dropoutRate=0.1) + train.setModel(model_deepFlavourReference,dropoutRate=0.1,momentum=0.3) 
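For context on the compile pattern used in this training script: the two-output model keeps its regression head wired into the graph but multiplies that loss by ~1e-12, and after a warm-up epoch the layers whose names contain 'input_batchnorm' are frozen before recompiling. A minimal, hypothetical Keras sketch of that pattern follows; the layer names, the 16-feature input and the plain 'adam'/'mean_squared_error' stand-ins are illustrative and not taken from this repository.

# --- editorial sketch, not part of the patch ----------------------------------
# Hypothetical two-output Keras model: classification dominates, the regression
# head is effectively silenced by a near-zero loss weight, and the input
# batch-norm layer is frozen ("fixed") before a recompile.
from keras.layers import Input, Dense, BatchNormalization
from keras.models import Model

inputs = Input(shape=(16,))                      # illustrative input size
x = BatchNormalization(name='input_batchnorm')(inputs)
x = Dense(64, activation='relu')(x)
flavour = Dense(6, activation='softmax', name='flavour_pred')(x)   # illustrative names
regression = Dense(2, name='regression_pred')(x)
model = Model(inputs=inputs, outputs=[flavour, regression])

# classification loss weight 1, regression weight ~1e-12 as in the script above
model.compile(optimizer='adam',
              loss=['categorical_crossentropy', 'mean_squared_error'],
              loss_weights=[1., 1e-12],
              metrics=['accuracy'])

# after the warm-up epoch: freeze the input normalisation (cf. fixLayersContaining)
for layer in model.layers:
    if 'input_batchnorm' in layer.name:
        layer.trainable = False
model.compile(optimizer='adam',
              loss=['categorical_crossentropy', 'mean_squared_error'],
              loss_weights=[1., 1e-12],
              metrics=['accuracy'])
# -------------------------------------------------------------------------------

The near-zero weight keeps the auxiliary head and its outputs available for later fine-tuning without letting its gradients influence the shared layers.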
#train.keras_model=fixLayersContaining(train.keras_model, 'regression', invert=False) @@ -22,37 +22,36 @@ loss_weights=[1., 0.000000000001]) -train.train_data.maxFilesOpen=5 - -print(train.keras_model.summary()) -model,history = train.trainModel(nepochs=1, - batchsize=10000, - stop_patience=300, - lr_factor=0.5, - lr_patience=3, - lr_epsilon=0.0001, - lr_cooldown=6, - lr_minimum=0.0001, - maxqsize=400) - - -print('fixing input norms...') -train.keras_model=fixLayersContaining(train.keras_model, 'input_batchnorm') -train.compileModel(learningrate=0.001, - loss=['categorical_crossentropy',loss_meansquared], - metrics=['accuracy'], - loss_weights=[1., 0.000000000001]) - - + train.train_data.maxFilesOpen=5 + + print(train.keras_model.summary()) + model,history = train.trainModel(nepochs=1, + batchsize=10000, + stop_patience=300, + lr_factor=0.5, + lr_patience=3, + lr_epsilon=0.0001, + lr_cooldown=6, + lr_minimum=0.0001, + maxqsize=5) + + + print('fixing input norms...') + train.keras_model=fixLayersContaining(train.keras_model, 'input_batchnorm') + train.compileModel(learningrate=0.0003, + loss=['categorical_crossentropy',loss_meansquared], + metrics=['accuracy'], + loss_weights=[1., 0.000000000001]) + print(train.keras_model.summary()) #printLayerInfosAndWeights(train.keras_model) model,history = train.trainModel(nepochs=63, #sweet spot from looking at the testing plots batchsize=10000, stop_patience=300, - lr_factor=0.5, - lr_patience=3, + lr_factor=0.8, + lr_patience=-3, lr_epsilon=0.0001, - lr_cooldown=6, - lr_minimum=0.0001, - maxqsize=100) + lr_cooldown=8, + lr_minimum=0.00001, + maxqsize=5,verbose=1) diff --git a/Train/train_DeepCSV.py b/Train/train_DeepCSV.py new file mode 100644 index 0000000..9545c40 --- /dev/null +++ b/Train/train_DeepCSV.py @@ -0,0 +1,28 @@ + + +from training_base import training_base +from Losses import loss_NLL + +#also does all the parsing +train=training_base(testrun=False) +print 'Inited' + +if not train.modelSet(): + from models import dense_model + print 'Setting model' + train.setModel(dense_model,dropoutRate=0.1) + + train.compileModel(learningrate=0.003, + loss='categorical_crossentropy', + metrics=['accuracy']) + + +model,history = train.trainModel(nepochs=50, + batchsize=5000, + stop_patience=300, + lr_factor=0.5, + lr_patience=10, + lr_epsilon=0.0001, + lr_cooldown=2, + lr_minimum=0.0001, + maxqsize=100) diff --git a/Train/train_DeepCSV_gradientAda.py b/Train/train_DeepCSV_gradientAda.py new file mode 100644 index 0000000..c67ca2d --- /dev/null +++ b/Train/train_DeepCSV_gradientAda.py @@ -0,0 +1,42 @@ +from training_base import training_base +from MultiDataCollection import MultiDataCollection +from pdb import set_trace + +#also does all the parsing +train=training_base(testrun=False, collection_class=MultiDataCollection) +print 'Inited' +sizes = train.train_data.sizes +norm = float(sizes[2])/sizes[1] #normalization because samples have different sizes +train.train_data.setFlags([[1,0], [0,norm], [0,1]]) +train.train_data.addYs([[0], [1], [0]]) + +evt = train.train_data.generator().next() +set_trace() +train.val_data.setFlags([[1,0], [0,norm], [0,1]]) +train.val_data.addYs([[0], [1], [0]]) + +if not train.modelSet(): + from models import dense_model_gradientReversal + print 'Setting model' + train.setModel(dense_model_gradientReversal,dropoutRate=0.1) + + train.compileModel( + learningrate=0.003, + loss=['categorical_crossentropy', 'binary_crossentropy'], + #loss_weights=[1., 0.000000000001], + metrics=['accuracy'] + ) + + +model,history = 
train.trainModel( + nepochs=50, + batchsize=5000, + stop_patience=300, + lr_factor=0.5, + lr_patience=10, + lr_epsilon=0.0001, + lr_cooldown=2, + lr_minimum=0.0001, + maxqsize=100 +) + diff --git a/Train/train_DeepCSV_momentAda.py b/Train/train_DeepCSV_momentAda.py new file mode 100644 index 0000000..e74369b --- /dev/null +++ b/Train/train_DeepCSV_momentAda.py @@ -0,0 +1,43 @@ +from training_base import training_base +from Losses import nd_moment_loss +from MultiDataCollection import MultiDataCollection +from pdb import set_trace + +#also does all the parsing +train=training_base(testrun=False, collection_class=MultiDataCollection) +print 'Inited' +sizes = train.train_data.sizes +norm = float(sizes[2])/sizes[1] #normalization because samples have different sizes +train.train_data.setFlags([[1,0], [0,norm], [0,1]]) +train.train_data.addYs([[[0,0]], [[1,0]], [[0,1]]]) + +evt = train.train_data.generator().next() +set_trace() + +train.val_data.setFlags([[1,0], [0,norm], [0,1]]) +train.val_data.addYs([[[0,0]], [[1,0]], [[0,1]]]) + +if not train.modelSet(): + from models import dense_model_moments + print 'Setting model' + train.setModel(dense_model_moments, dropoutRate=0.1) + + train.compileModel( + learningrate=0.003, + loss=['categorical_crossentropy', nd_moment_loss], + #loss_weights=[1., 0.000000000001], + metrics=['accuracy'], + ) + + +model,history = train.trainModel( + nepochs=50, + batchsize=5000, + stop_patience=300, + lr_factor=0.5, + lr_patience=10, + lr_epsilon=0.0001, + lr_cooldown=2, + lr_minimum=0.0001, + maxqsize=100 +) diff --git a/convertFromRoot/convertFromRoot.py b/convertFromRoot/convertFromRoot.py index 6b82ddc..2a1e05b 100755 --- a/convertFromRoot/convertFromRoot.py +++ b/convertFromRoot/convertFromRoot.py @@ -41,21 +41,27 @@ def main(argv=None): #try: # setup option parser + from TrainData import TrainData - from TrainData_deepCSV import TrainData_deepCSV + from TrainData_test import TrainData_test + from TrainData_deepCSV import TrainData_deepCSV, TrainData_deepCSV_RNN, TrainData_deepCSV_RNN_Deeper from TrainData_deepConvCSV import TrainData_deepConvCSV from TrainData_deepCMVA import TrainData_deepCMVA from TrainData_deepCSV_PF import TrainData_deepCSV_PF,TrainData_deepCSV_miniPF,TrainData_deepCSV_microPF,TrainData_deepCSV_softL_PF, TrainData_deepCSV_PF_rec from TrainData_deepCSV_PF_Reg import TrainData_deepCSV_PF_Reg from TrainData_deepJet_Reg import TrainData_deepJet_Reg, TrainData_PF_Reg from TrainData_deepCSV_PF_binned import TrainData_deepCSV_PF_Binned - from TrainData_deepFlavour import TrainData_deepFlavour_FT_reg_noScale,TrainData_deepFlavour_QGOnly_reg,TrainData_deepFlavour_FT,TrainData_deepFlavour_FT_reg,TrainData_deepFlavour_FT_map,TrainData_deepFlavour_FT_map_reg,TrainData_image + from TrainData_deepFlavour import TrainData_deepFlavour_FT_reg_noScale,TrainData_deepFlavour_QGOnly_reg,TrainData_deepFlavour_FT,TrainData_deepFlavour_FT_reg,TrainData_deepFlavour_FT_map,TrainData_deepFlavour_FT_map_reg,TrainData_image, TrainData_deepFlavour_cleaninput, TrainData_deepFlavour_cleanBTVOnly, TrainData_deepFlavour_nopuppi from TrainData_FatJet import TrainData_FatJet_Test from TrainData_PT_recur import TrainData_PT_recur, TrainData_QG_simple, TrainData_recurrent_fullTruth from TrainData_deepCSV_int import TrainData_deepCSV_int,TrainData_deepCSV_conv from TrainData_deepAK8 import TrainData_AK8Jet_init + from TrainData_domainAda import TrainData_sampleCheck class_options = [ + TrainData_test, + TrainData_deepCSV_RNN, + TrainData_deepCSV_RNN_Deeper, TrainData_deepCSV, 
TrainData_deepConvCSV, TrainData_deepCMVA, @@ -82,6 +88,10 @@ def main(argv=None): TrainData_deepCSV_int, TrainData_deepCSV_conv, TrainData_AK8Jet_init, + TrainData_deepFlavour_cleaninput, + TrainData_deepFlavour_cleanBTVOnly, + TrainData_deepFlavour_nopuppi, + TrainData_sampleCheck, ] class_options = dict((str(i).split("'")[1].split('.')[-1], i) for i in class_options) diff --git a/modules/DataCollection.py b/modules/DataCollection.py index 0e7b9f5..5e25f7b 100644 --- a/modules/DataCollection.py +++ b/modules/DataCollection.py @@ -14,6 +14,8 @@ from pdb import set_trace import copy +usenewformat=True + class DataCollection(object): ''' classdocs @@ -44,10 +46,13 @@ def clear(self): self.weighter=Weighter() self.weightsfraction=0.05 self.maxConvertThreads=2 - self.maxFilesOpen=5 + self.maxFilesOpen=2 self.means=None self.classweights={} + def __len__(self): + return self.nsamples + def __iadd__(self, other): 'A += B' if not isinstance(other, DataCollection): @@ -111,7 +116,7 @@ def removeLast(self): def getClassWeights(self): if not len(self.classweights): - self.__computeClassWeights() + self.__computeClassWeights(self.dataclass.getUsedTruth()) return self.classweights def __computeClassWeights(self,truthclassesarray): @@ -143,7 +148,7 @@ def getInputShapes(self): return [] self.dataclass.filelock=None td=copy.deepcopy(self.dataclass) - td.readIn(self.getSamplePath(self.samples[0])) + td.readIn(self.getSamplePath(self.samples[0]),shapesOnly=True) shapes=td.getInputShapes() td.clear() return shapes @@ -164,7 +169,14 @@ def setBatchSize(self,bsize): if bsize > self.nsamples: raise Exception('Batch size must not be bigger than total sample size') self.__batchsize=bsize + + def getBatchSize(self): + return self.__batchsize + @property + def batch_size(self): + return self.__batchsize + def getSamplesPerEpoch(self): #modify by batch split count=self.getNBatchesPerEpoch() @@ -173,6 +185,9 @@ def getSamplesPerEpoch(self): else: return self.nsamples + def getAvEntriesPerFile(self): + return float(self.nsamples)/float(len(self.samples)) + def getNBatchesPerEpoch(self): if self.__batchsize <= 1: @@ -395,7 +410,7 @@ def createDataFromRoot( - def __writeData(self,sample,means, weighter,outputDir,dataclass): + def __writeData(self,sample,means, weighter,outputDir,dataclass,number=-1): import os import copy from stopwatch import stopwatch @@ -415,10 +430,17 @@ def removefile(): try: td.readFromRootFile(ramdisksample,means, weighter) newname=os.path.basename(sample).rsplit('.', 1)[0] - newpath=os.path.abspath(outputDir+newname+'.z') + if number>0: + newname+=str(number) + + if usenewformat: + newname+='.meta' + else: + newname+='.z' + newpath=os.path.abspath(outputDir+newname) td.writeOut(newpath) - print('converted and written '+newname+'.z in ',sw.getAndReset(),' sec') - self.samples.append(newname+'.z') + print('converted and written '+newname+' in ',sw.getAndReset(),' sec') + self.samples.append(newname) self.nsamples+=td.nsamples self.sampleentries.append(td.nsamples) td.clear() @@ -431,8 +453,9 @@ def removefile(): def __writeData_async_andCollect(self, startindex, outputDir): - from multiprocessing import Process, Queue, cpu_count + from multiprocessing import Process, Queue, cpu_count, Lock wo_queue = Queue() + writelock=Lock() import os thispid=str(os.getpid()) if not os.path.isfile(outputDir+'/snapshot.dc'): @@ -443,14 +466,13 @@ def __writeData_async_andCollect(self, startindex, outputDir): print('creating dir '+tempstoragepath) os.system('mkdir -p '+tempstoragepath) - def 
writeData_async(index,woq): + def writeData_async(index,woq,wrlck): import copy from stopwatch import stopwatch sw=stopwatch() td=copy.deepcopy(self.dataclass) sample=self.originRoots[index] - fileTimeOut(sample,120) #once available copy to ram ramdisksample= tempstoragepath+'/'+str(os.getpid())+os.path.basename(sample) def removefile(): @@ -462,17 +484,26 @@ def removefile(): out_samplename='' out_sampleentries=0 newname=os.path.basename(sample).rsplit('.', 1)[0] - newpath=os.path.abspath(outputDir+newname+'.z') + newname+=str(index) + + if usenewformat: + newname+='.meta' + else: + newname+='.z' + newpath=os.path.abspath(outputDir+newname) try: + fileTimeOut(sample,120) #once available copy to ram os.system('cp '+sample+' '+ramdisksample) td.readFromRootFile(ramdisksample,self.means, self.weighter) + #wrlck.acquire() td.writeOut(newpath) - print('converted and written '+newname+'.z in ',sw.getAndReset(),' sec -', index) + #wrlck.release() + print('converted and written '+newname+' in ',sw.getAndReset(),' sec -', index) - out_samplename=newname+'.z' + out_samplename=newname out_sampleentries=td.nsamples success=True td.clear() @@ -492,15 +523,20 @@ def removefile(): def __collectWriteInfo(successful,samplename,sampleentries,outputDir): if not successful: raise Exception("write not successful, stopping") - + import os self.samples.append(samplename) self.nsamples+=sampleentries self.sampleentries.append(sampleentries) - self.writeToFile(outputDir+'/snapshot.dc') + self.writeToFile(outputDir+'/snapshot_tmp.dc')#avoid to overwrite directly + os.system('mv '+outputDir+'/snapshot_tmp.dc '+outputDir+'/snapshot.dc') processes=[] + processrunning=[] + processfinished=[] for i in range(startindex,len(self.originRoots)): - processes.append(Process(target=writeData_async, args=(i,wo_queue) ) ) + processes.append(Process(target=writeData_async, args=(i,wo_queue,writelock) ) ) + processrunning.append(False) + processfinished.append(False) nchilds = int(cpu_count()/2)-2 if self.nprocs <= 0 else self.nprocs #import os @@ -511,46 +547,55 @@ def __collectWriteInfo(successful,samplename,sampleentries,outputDir): #nchilds=10 - index=0 + + + lastindex=startindex-1 alldone=False + results=[] + import time try: while not alldone: - if index+nchilds >= len(processes): - nchilds=len(processes)-index - alldone=True + nrunning=0 + for runs in processrunning: + if runs: nrunning+=1 - - logging.info('starting %d child processes' % nchilds) - for i in range(nchilds): - logging.info('starting %s...' % self.originRoots[startindex+i+index]) - processes[i+index].start() - - results=[] - import time - time.sleep(1) - - while 1: - running = len(results)=nchilds: break + if processrunning[i]:continue + if processfinished[i]:continue time.sleep(0.1) - - logging.info('joining') - for i in range(nchilds): - processes[i+index].join(5) + logging.info('starting %s...' % self.originRoots[startindex+i]) + processes[i].start() + processrunning[i]=True + nrunning+=1 - results.sort() - results = [r[1] for r in results] - for i in range(nchilds): - logging.info(results[i]) - __collectWriteInfo(results[i][0],results[i][1],results[i][2],outputDir) - index+=nchilds - + + if not wo_queue.empty(): + res=wo_queue.get() + results.append(res) + originrootindex=res[0] + logging.info('finished %s...' 
% self.originRoots[originrootindex]) + processfinished[originrootindex-startindex]=True + processes [originrootindex-startindex].join(5) + processrunning [originrootindex-startindex]=False + #immediately send the next + continue + + + for r in results: + thisidx=r[0] + if thisidx==lastindex+1: + logging.info('>>>> collected result %d of %d' % (thisidx,len(self.originRoots))) + __collectWriteInfo(r[1][0],r[1][1],r[1][2],outputDir) + lastindex=thisidx + + if nrunning==0: + alldone=True + continue + time.sleep(0.1) + except: os.system('rm -rf '+tempstoragepath) raise @@ -623,55 +668,85 @@ def generator(self): import numpy import copy from sklearn.utils import shuffle - + import shutil + import uuid + import os + import copy + import threading + import time + print('start generator') #helper class class tdreader(object): def __init__(self,filelist,maxopen,tdclass): - #print('init reader for '+str(len(filelist))+' files:') - #print(filelist) - self.filelist=filelist - self.max=maxopen self.nfiles=len(filelist) + self.max=min(maxopen,len(filelist)) self.tdlist=[] self.tdopen=[] self.tdclass=copy.deepcopy(tdclass) self.tdclass.clear()#only use the format, no data - for i in range(maxopen): + #self.copylock=thread.allocate_lock() + for i in range(self.nfiles): self.tdlist.append(copy.deepcopy(tdclass)) self.tdopen.append(False) self.closeAll() #reset state + self.shuffleseed=0 def start(self): + for i in range(self.max): self.__readNext() + time.sleep(1) + + def __readNext(self): + #make sure this fast function has exited before getLast tries to read the file import copy readfilename=self.filelist[self.filecounter] + if len(filelist)>1: + self.tdlist[self.nextcounter].clear() self.tdlist[self.nextcounter]=copy.deepcopy(self.tdclass) - self.tdlist[self.nextcounter].readIn_async(readfilename) - - #print('reading file '+readfilename)#DEBUG - + self.tdlist[self.nextcounter].readthread=None + + def startRead(counter,filename,shuffleseed): + excounter=0 + while excounter<10: + try: + self.tdlist[counter].readIn_async(filename,ramdiskpath='/dev/shm/', + randomseed=shuffleseed) + break + except Exception as d: + print(self.filelist[counter]+' read error, retry...') + self.tdlist[counter].readIn_abort() + excounter=excounter+1 + if excounter<10: + time.sleep(5) + continue + traceback.print_exc(file=sys.stdout) + raise d + + t=threading.Thread(target=startRead, args=(self.nextcounter,readfilename,self.shuffleseed)) + t.start() + self.shuffleseed+=1 + if self.shuffleseed>1e5: + self.shuffleseed=0 + #startRead(self.nextcounter,readfilename,self.shuffleseed) self.tdopen[self.nextcounter]=True self.filecounter=self.__increment(self.filecounter,self.nfiles) + self.nextcounter=self.__increment(self.nextcounter,self.nfiles) - #if self.filecounter==0: - # print('file counter reset to 0 after file '+readfilename) - self.nextcounter=self.__increment(self.nextcounter,self.max) def __getLast(self): + self.tdlist[self.lastcounter].readIn_join(wasasync=True,waitforStart=True) td=self.tdlist[self.lastcounter] - td.readIn_join() - - #print('got '+td.samplename+' with '+str(len(td.x[0]))+' samples') + #print('got ',self.lastcounter) self.tdopen[self.lastcounter]=False - self.lastcounter=self.__increment(self.lastcounter,self.max) + self.lastcounter=self.__increment(self.lastcounter,self.nfiles) return td def __increment(self,counter,maxval): @@ -679,25 +754,26 @@ def __increment(self,counter,maxval): if counter>=maxval: counter=0 return counter - def NOT__del__(self): - #print('del') + + def __del__(self): self.closeAll() 
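The tdreader class being rewritten in this hunk is essentially a round-robin prefetcher: up to maxFilesOpen files are read in background threads while the generator consumes the oldest finished one and immediately queues the next, cycling endlessly over the file list with circular counters. A stripped-down, hypothetical sketch of that idea follows; the class name, the load callable and the slot bookkeeping are illustrative, not the repository's API, and the real code additionally handles retries, ramdisk copies and shared-memory arrays.

# --- editorial sketch, not part of the patch ----------------------------------
import threading

class RoundRobinPrefetcher(object):
    '''Keeps up to `maxopen` files loading in background threads and hands
    them back in file order; a new read is queued every time one is consumed.'''
    def __init__(self, filelist, maxopen, load):
        self.filelist = filelist
        self.load = load                          # callable: filename -> data
        self.max = min(maxopen, len(filelist))
        self.slots = [None] * len(filelist)       # (thread, result holder) per file slot
        self.next_to_read = 0
        self.next_to_yield = 0

    def _start_read(self):
        idx = self.next_to_read % len(self.filelist)
        holder = {}
        def work(i=idx, h=holder):                # bind current index/holder
            h['data'] = self.load(self.filelist[i])
        t = threading.Thread(target=work)
        t.start()
        self.slots[idx] = (t, holder)
        self.next_to_read += 1

    def start(self):
        for _ in range(self.max):                 # fill the pipeline
            self._start_read()

    def get(self):
        idx = self.next_to_yield % len(self.filelist)
        thread, holder = self.slots[idx]
        thread.join()                             # wait only for the oldest read
        self.next_to_yield += 1
        self._start_read()                        # immediately queue the next file
        return holder['data']
# -------------------------------------------------------------------------------

Usage would look like p = RoundRobinPrefetcher(files, maxopen=2, load=some_reader); p.start(); data = p.get() inside the batch loop, mirroring the start()/get() calls made on tdreader by the generator below.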
def closeAll(self): - #close all - for i in range(self.max): - if self.tdopen[i]: - self.tdlist[i].readIn_abort() - self.tdlist[i].clear() - self.tdopen[i]=False + for i in range(len(self.tdopen)): + try: + if self.tdopen[i]: + self.tdlist[i].readIn_abort() + self.tdlist[i].clear() + self.tdopen[i]=False + except: pass + self.tdlist[i].removeRamDiskFile() self.nextcounter=0 self.lastcounter=0 self.filecounter=0 - #print('closed') def get(self): - #returns last reads next circular + td=self.__getLast() self.__readNext() return td @@ -730,7 +806,7 @@ def get(self): TDReader=tdreader(filelist, self.maxFilesOpen, self.dataclass) #print('generator: total batches '+str(totalbatches)) - + print('start file buffering...') TDReader.start() #### # @@ -740,9 +816,11 @@ def get(self): # check if really the right ones are read.... # psamples=0 #for random shuffling + nepoch=0 while 1: if processedbatches == totalbatches: processedbatches=0 + nepoch+=1 lastbatchrest=0 if processedbatches == 0: #reset buffer and start new @@ -774,6 +852,10 @@ def get(self): td=TDReader.get() except: traceback.print_exc(file=sys.stdout) + + if td.x[0].shape[0] == 0: + print('Found empty (corrupted?) file, skipping') + continue if xstored[0].shape[0] ==0: #print('dc:read direct') #DEBUG @@ -796,10 +878,18 @@ def get(self): wout.append([]) else: - if td.x[0].shape == 0: - print('Found empty (corrupted?) file, skipping') - continue - #print('dc:append read sample') #DEBUG + + #randomly^2 shuffle - not needed every time + if psamples%2==0 and nepoch%2==1: + for i in range(0,dimx): + td.x[i]=shuffle(td.x[i], random_state=psamples) + for i in range(0,dimy): + td.y[i]=shuffle(td.y[i], random_state=psamples) + for i in range(0,dimw): + td.w[i]=shuffle(td.w[i], random_state=psamples) + + + for i in range(0,dimx): if(xstored[i].ndim==1): xstored[i] = numpy.append(xstored[i],td.x[i]) @@ -821,17 +911,6 @@ def get(self): if xstored[0].shape[0] >= self.__batchsize: batchcomplete = True - #random shuffle each time - for i in range(0,dimx): - xstored[i]=shuffle(xstored[i], random_state=psamples) - for i in range(0,dimy): - ystored[i]=shuffle(ystored[i], random_state=psamples) - for i in range(0,dimw): - wstored[i]=shuffle(wstored[i], random_state=psamples) - - - #randomize elements - #limit of the random generator number psamples+= td.x[0].shape[0] if psamples > 4e8: diff --git a/modules/DeepJet_callbacks.py b/modules/DeepJet_callbacks.py index 228db4e..19cad27 100644 --- a/modules/DeepJet_callbacks.py +++ b/modules/DeepJet_callbacks.py @@ -11,14 +11,16 @@ from time import time from pdb import set_trace import json +from keras import backend as K class newline_callbacks_begin(Callback): - def __init__(self,outputDir): + def __init__(self,outputDir,plotLoss=False): self.outputDir=outputDir self.loss=[] self.val_loss=[] self.full_logs=[] + self.plotLoss=plotLoss def on_epoch_end(self,epoch, epoch_logs={}): import os @@ -26,13 +28,16 @@ def on_epoch_end(self,epoch, epoch_logs={}): print('\n***callbacks***\nsaving losses to '+lossfile) self.loss.append(epoch_logs.get('loss')) self.val_loss.append(epoch_logs.get('val_loss')) - f = open(lossfile, 'w') - for i in range(len(self.loss)): - f.write(str(self.loss[i])) - f.write(" ") - f.write(str(self.val_loss[i])) - f.write("\n") + f = open(lossfile, 'a') + f.write(str(epoch_logs.get('loss'))) + f.write(" ") + f.write(str(epoch_logs.get('val_loss'))) + f.write("\n") f.close() + learnfile=os.path.join( self.outputDir, 'learn.log') + with open(learnfile, 'a') as f: + 
f.write(str(float(K.get_value(self.model.optimizer.lr)))+'\n') + normed = {} for vv in epoch_logs: normed[vv] = float(epoch_logs[vv]) @@ -40,6 +45,10 @@ def on_epoch_end(self,epoch, epoch_logs={}): lossfile=os.path.join( self.outputDir, 'full_info.log') with open(lossfile, 'w') as out: out.write(json.dumps(self.full_logs)) + + if self.plotLoss: + from testing import plotLoss + plotLoss(self.outputDir+'/losses.log',self.outputDir+'/losses.pdf',[]) class newline_callbacks_end(Callback): def on_epoch_end(self,epoch, epoch_logs={}): @@ -73,9 +82,23 @@ def on_epoch_begin(self, epoch, logs=None): from tokenTools import checkTokens checkTokens(self.cutofftime_hours) +class saveCheckPointDeepJet(Callback): + ''' + this seems obvious, however for some reason the keras model checkpoint fails + to save the optimizer state, needed for resuming a training. Therefore this explicit + implementation. + ''' + + def __init__(self,outputDir,model): + self.outputDir=outputDir + self.djmodel=model + def on_epoch_end(self,epoch, epoch_logs={}): + self.djmodel.save(self.outputDir+"/KERAS_check_model_last.h5") + class DeepJet_callbacks(object): def __init__(self, + model, stop_patience=10, lr_factor=0.5, lr_patience=1, @@ -83,11 +106,13 @@ def __init__(self, lr_cooldown=4, lr_minimum=1e-5, outputDir='', - minTokenLifetime=5): + minTokenLifetime=5, + checkperiod=10, + plotLossEachEpoch=True): - self.nl_begin=newline_callbacks_begin(outputDir) + self.nl_begin=newline_callbacks_begin(outputDir,plotLossEachEpoch) self.nl_end=newline_callbacks_end() self.stopping = EarlyStopping(monitor='val_loss', @@ -100,11 +125,11 @@ def __init__(self, self.modelbestcheck=ModelCheckpoint(outputDir+"/KERAS_check_best_model.h5", monitor='val_loss', verbose=1, - save_best_only=True) + save_best_only=True, save_weights_only=False) - self.modelcheckperiod=ModelCheckpoint(outputDir+"/KERAS_check_model_epoch{epoch:02d}.h5", verbose=1,period=10) + self.modelcheckperiod=ModelCheckpoint(outputDir+"/KERAS_check_model_epoch{epoch:02d}.h5", verbose=1,period=checkperiod, save_weights_only=False) - self.modelcheck=ModelCheckpoint(outputDir+"/KERAS_check_model_last.h5", verbose=1) + self.modelcheck=saveCheckPointDeepJet(outputDir,model) diff --git a/modules/Layers.py b/modules/Layers.py new file mode 100644 index 0000000..0be3edf --- /dev/null +++ b/modules/Layers.py @@ -0,0 +1,41 @@ +import tensorflow as tf +from keras.engine import Layer +import keras.backend as K + +def reverse_gradient(X, hp_lambda): + '''Flips the sign of the incoming gradient during training.''' + try: + reverse_gradient.num_calls += 1 + except AttributeError: + reverse_gradient.num_calls = 1 + + grad_name = "GradientReversal%d" % reverse_gradient.num_calls + @tf.RegisterGradient(grad_name) + def _flip_gradients(op, grad): + return [tf.negative(grad) * hp_lambda] + + g = K.get_session().graph + with g.gradient_override_map({'Identity': grad_name}): + y = tf.identity(X) + return y + +class GradientReversal(Layer): + '''Flip the sign of gradient during training.''' + def __init__(self, hp_lambda=1., **kwargs): + super(GradientReversal, self).__init__(**kwargs) + self.supports_masking = False + self.hp_lambda = hp_lambda + + def build(self, input_shape): + self.trainable_weights = [] + + def call(self, x, mask=None): + return reverse_gradient(x, self.hp_lambda) + + def get_output_shape_for(self, input_shape): + return input_shape + + def get_config(self): + config = {} + base_config = super(GradientReversal, self).get_config() + return dict(list(base_config.items()) + 
list(config.items())) diff --git a/modules/Losses.py b/modules/Losses.py index 370c7e6..3299bfe 100644 --- a/modules/Losses.py +++ b/modules/Losses.py @@ -1,11 +1,37 @@ from keras import backend as K +from tensorflow import where, greater, abs, zeros_like, exp +import tensorflow as tf global_loss_list={} #whenever a new loss function is created, please add it to the global_loss_list dictionary! +def huberishLoss_noUnc(y_true, x_pred): + + + dxrel=(x_pred - y_true)/1#(K.clip(K.abs(y_true+0.1),K.epsilon(),None)) + dxrel=K.clip(dxrel,-1e6,1e6) + + #defines the inverse of starting point of the linear behaviour + scaler=2 + + dxabs=K.abs(scaler* dxrel) + dxsq=K.square(scaler * dxrel) + dxp4=K.square(dxsq) + + lossval=dxsq / (1+dxp4) + (2*dxabs -1)/(1 + 1/dxp4) + #K.clip(lossval,-1e6,1e6) + + return K.mean( lossval , axis=-1) + + + +global_loss_list['huberishLoss_noUnc']=huberishLoss_noUnc + + + def loss_NLL(y_true, x): """ This loss is the negative log likelyhood for gaussian pdf. @@ -22,10 +48,8 @@ def loss_NLL(y_true, x): def loss_meansquared(y_true, x): """ - This loss is the negative log likelyhood for gaussian pdf. - See e.g. http://bayesiandeeplearning.org/papers/BDL_29.pdf for details - Generally it might be better to even use Mixture density networks (i.e. more complex pdf than single gauss, see: - https://publications.aston.ac.uk/373/1/NCRG_94_004.pdf + This loss is a standard mean squared error loss with a dummy for the uncertainty, + which will just get minimised to 0. """ x_pred = x[:,1:] x_sig = x[:,:1] @@ -35,6 +59,52 @@ def loss_meansquared(y_true, x): global_loss_list['loss_meansquared']=loss_meansquared +def loss_logcosh(y_true, x): + """ + This loss implements a logcosh loss with a dummy for the uncertainty. + It approximates a mean-squared loss for small differences and a linear one for + large differences, therefore it is conceptually similar to the Huber loss. + This loss here is scaled, such that it start becoming linear around 4-5 sigma + """ + scalefactor_a=30 + scalefactor_b=0.4 + + from tensorflow import where, greater, abs, zeros_like, exp + + x_pred = x[:,1:] + x_sig = x[:,:1] + def cosh(y): + return (K.exp(y) + K.exp(-y)) / 2 + + return K.mean(0.5*K.square(x_sig)) + K.mean(scalefactor_a* K.log(cosh( scalefactor_b*(x_pred - y_true))), axis=-1) + + + +global_loss_list['loss_logcosh']=loss_logcosh + + +def loss_logcosh_noUnc(y_true, x_pred): + """ + This loss implements a logcosh loss without a dummy for the uncertainty. + It approximates a mean-squared loss for small differences and a linear one for + large differences, therefore it is conceptually similar to the Huber loss. + This loss here is scaled, such that it start becoming linear around 4-5 sigma + """ + scalefactor_a=1. + scalefactor_b=3. + + from tensorflow import where, greater, abs, zeros_like, exp + + dxrel=(x_pred - y_true)/(y_true+0.0001) + def cosh(x): + return (K.exp(x) + K.exp(-x)) / 2 + + return scalefactor_a*K.mean( K.log(cosh(scalefactor_b*dxrel)), axis=-1) + + + +global_loss_list['loss_logcosh_noUnc']=loss_logcosh_noUnc + # The below is to use multiple gaussians for regression ## https://github.com/axelbrando/Mixture-Density-Networks-for-distribution-and-uncertainty-estimation/blob/master/MDN-DNN-Regression.ipynb @@ -97,3 +167,138 @@ def mean_log_LaPlace_like(y_true, parameters): global_loss_list['mean_log_LaPlace_like']=mean_log_LaPlace_like + +def moment_loss(y_true, y_pred): + '''this methods calculates moments for different "bins", e.g. 
two; 1 data and 2 mc, and returns the difference between these moments of the different bins. The bins are passed in the true labels. This loss supports only one prediction output (the prediction is a single value per element)''' + # The below counts the entries in the histogram vector, i.e. the actual mini batch size + h_entr = K.sum(y_true,axis=0) + + ## first moment ## + # Multiply the histogram vectors with estimated propability y_pred + h_fill = y_true * y_pred + + # Sum each histogram vector + Sum =K.sum(h_fill,axis=0) + + # Devide sum by entries (batch size) (i.e. mean, first moment) + Sum = Sum/h_entr + + # Devide per vector mean by average mean, i.e. now SUM is a vector of the relative deviations from the absolute mean + Sum = Sum/K.mean(y_pred) + + ## second moment, same logic as before, just squared + y_pred2 = y_pred-K.mean(y_pred) + h_fill2 = y_true * y_pred2*y_pred2 + Sum2 =K.sum(h_fill2,axis=0) + Sum2 = Sum2/h_entr + Sum2 = Sum2/K.mean(y_pred2*y_pred2) + + ## third moment + + y_pred3 = y_pred-K.mean(y_pred) + h_fill3 = y_true * y_pred3*y_pred3*y_pred3 + Sum3 =K.sum(h_fill3,axis=0) + Sum3 = Sum3/h_entr + Sum3 = Sum3/K.mean(y_pred2*y_pred2*y_pred2) + + ## fourth moment + + y_pred4 = y_pred-K.mean(y_pred) + h_fill4 = y_true * y_pred4*y_pred4*y_pred4*y_pred4 + Sum4 =K.sum(h_fill4,axis=0) + Sum4 = Sum4/h_entr + Sum4 = Sum4/K.mean(y_pred4*y_pred4*y_pred4*y_pred4) + + return K.mean(K.square(Sum-1)) + K.mean(K.square(Sum2-1)) + K.mean(K.square(Sum3-1)) + K.mean(K.square(Sum4-1)) + +global_loss_list['moment_loss'] = moment_loss + + + +def nd_moment_loss(y_true, y_pred): + '''Extension of the moment_loss to the case where the prediction in multi-dimensional. +This methods calculates moments for different "bins", e.g. two; 1 data and 2 mc, and returns the difference between these moments of the different bins. The bins are passed in the true labels.''' + # The below counts the entries in the histogram vector, i.e. the actual mini batch size + h_entr = K.sum(y_true,axis=0) + + ## first moment ## + # Multiply the histogram vectors with estimated propability y_pred + # and sum each histogram vector + #Rows: predition classes, Colums: bins + Sum = tf.transpose(tf.matmul( + tf.transpose(y_true), y_pred + )) + + # Devide sum by entries (batch size) (i.e. mean, first moment) + Sum /= h_entr + + # Devide per vector mean by average mean in each class, i.e. 
now SUM is a vector of the relative deviations from the absolute mean + #Rows: bins, Columns: prediction classes + Sum = tf.transpose(Sum) / K.mean(y_pred, axis=0) + + #higer moments: common var + y_pred_deviation = y_pred - K.mean(y_pred, axis=0) + + ## second moment, same logic as before, just squared + #Rows: predition classes, Colums: bins + Sum2 = tf.transpose(tf.matmul( + tf.transpose(y_true), y_pred_deviation**2 + )) + Sum2 /= h_entr + Sum2 = tf.transpose(Sum2)/K.mean(y_pred_deviation**2, axis=0) + + ## third moment + Sum3 = tf.transpose(tf.matmul( + tf.transpose(y_true), y_pred_deviation**3 + )) + Sum3 /= h_entr + Sum3 = tf.transpose(Sum3)/K.mean(y_pred_deviation**3, axis=0) + + ## fourth moment + Sum4 = tf.transpose(tf.matmul( + tf.transpose(y_true), y_pred_deviation**4 + )) + Sum4 /= h_entr + Sum4 = tf.transpose(Sum4)/K.mean(y_pred_deviation**4, axis=0) + + return K.mean(K.square(Sum-1)) + K.mean(K.square(Sum2-1)) + K.mean(K.square(Sum3-1)) + K.mean(K.square(Sum4-1)) + +global_loss_list['nd_moment_loss'] = nd_moment_loss + +def nd_moment_factory(nmoments): + if not isinstance(nmoments, int) and nmoments < 1: + raise ValueError('The number of moments used must be integer and > 1') + def nd_moments_(y_true, y_pred): + # The below counts the entries in the histogram vector, i.e. the actual mini batch size + h_entr = K.sum(y_true,axis=0) + + ## first moment, it's always there ## + # Multiply the histogram vectors with estimated propability y_pred + # and sum each histogram vector + #Rows: predition classes, Colums: bins + Sum = tf.transpose(tf.matmul( + tf.transpose(y_true), y_pred + )) + + # Devide sum by entries (batch size) (i.e. mean, first moment) + Sum /= h_entr + + # Devide per vector mean by average mean in each class, i.e. now SUM is a vector of the relative deviations from the absolute mean + #Rows: bins, Columns: prediction classes + Sum = tf.transpose(Sum) / K.mean(y_pred, axis=0) + + #higer moments: common var + y_pred_deviation = y_pred - K.mean(y_pred, axis=0) + nsums = [K.mean(K.square(Sum-1))] + for idx in range(2, nmoments+1): #from 2 to N (included) + isum = tf.transpose(tf.matmul( + tf.transpose(y_true), y_pred_deviation**idx + )) + isum /= h_entr + isum = tf.transpose(isum)/K.mean(y_pred_deviation**idx, axis=0) + nsums.append(K.mean(K.square(isum-1))) + return tf.add_n(nsums) + return nd_moments_ + +for i in range(1, 5): + global_loss_list['nd_%dmoment_loss' % i] = nd_moment_factory(i) diff --git a/modules/Makefile b/modules/Makefile index 93a1bab..fe68c72 100644 --- a/modules/Makefile +++ b/modules/Makefile @@ -24,6 +24,8 @@ MODULES_SHARED_LIBS := $(addprefix ./,$(notdir $(MODULES:.C=.so))) all: $(MODULES_SHARED_LIBS) #helpers +libquicklz.so: + gcc -shared -O2 -fPIC src/quicklz.c -o libquicklz.so obj/%.o: src/%.cpp g++ $(CFLAGS) $(ROOTSTUFF) -I./interface -O2 -fPIC -c -o $@ $< @@ -33,15 +35,16 @@ libdeepjethelpers.so: $(OBJ_FILES) g++ -shared $(LINUXADD) $(ROOTSTUFF) obj/*.o -o $@ -%.so: %.o libdeepjethelpers.so - g++ -shared $(LINUXADD) $(ROOTSTUFF) -L./ -ldeepjethelpers -L$(BOOST_LIB) -lboost_python -L${CONDA_PREFIX}/lib/python$(PYTHON_VERSION)/config -lpython2.7 $< -o $(@) +%.so: %.o libdeepjethelpers.so libquicklz.so + g++ -shared $(LINUXADD) $(ROOTSTUFF) -lquicklz -L./ -ldeepjethelpers -L$(BOOST_LIB) -lboost_python -L${CONDA_PREFIX}/lib/python$(PYTHON_VERSION)/config -lpython2.7 $< -o $(@) %.o: src/%.C g++ $(ROOTSTUFF) -O2 -I./interface -I$(PYTHON_INCLUDE) -I$(BOOST_INC) -fPIC -c -o $(@) $< - - clean: - rm -f $(OBJ_FILES) $(SHARED_LIBS) 
$(MODULES_SHARED_LIBS) $(MODULES_OBJ_FILES) libdeepjethelpers.so \ No newline at end of file + rm -f $(OBJ_FILES) $(SHARED_LIBS) $(MODULES_SHARED_LIBS) $(MODULES_OBJ_FILES) libdeepjethelpers.so + + + \ No newline at end of file diff --git a/modules/MultiDataCollection.py b/modules/MultiDataCollection.py new file mode 100644 index 0000000..6e4e2e9 --- /dev/null +++ b/modules/MultiDataCollection.py @@ -0,0 +1,192 @@ +from DataCollection import DataCollection +from multiprocessing import cpu_count +from itertools import izip +from pdb import set_trace +import numpy as np +from copy import deepcopy + +class MultiDataCollection(object): + '''This class allows the simultaneous use of multiple +DataCollections for a training, it provides the same interface +and adds the functionality of adding Y targets as well as flags returned instead of the weights. +In case of weights the flag is multiplied by the weight value +Constructor ([infiles = None[, nprocs = -1[, add_ys = [][, flags=[]]]]]) +optional parameters: +infiles: list of input dataCollection files to be opened +nprocs: number of processors to use +add_ys: list of additional Y targets to be added at generator time, must be the same length of the input collections. The list content must be iterable, each iteration produces a new target, only scalar type supprted for now +flags: like add_ys, same rules apply. The flags gets multiplied by the event weight is case weights are used. The lenght of the flags must be the same as the TOTAL number of Y targets. Flags are returned instead of the event weights +''' + def __init__(self, infiles = None, nprocs = -1, add_ys = [] ,flags=[]): + '''Constructor''' + self.collections = [] + self.nprocs = nprocs + self.meansnormslimit=500000 + self.flags = [] + self.generator_modifier = lambda x: x + self.additional_ys = [] + if infiles: + self.collections = [ + DataCollection( + i, + cpu_count()/len(infiles) if nprocs == -1 else nprocs/len(infiles) + ) for i in infiles] + if flags: + self.setFlags(flags) + if add_ys: + self.addYs(add_ys) + + @property + def useweights(self): + return all(i.useweights for i in self.collections) + + @useweights.setter + def useweights(self, val): + for i in self.collections: + i.useweights = val + + def addYs(self, add_ys): + 'adds Ys that will be appended on the fly to the generator, Ys are a list of iterables' + if len(add_ys) != len(self.collections): + raise ValueError('The Ys must be the same lenght of the input collections') + self.additional_ys = add_ys + + def readFromFile(self, infiles): + self.collections = [ + DataCollection( + i, cpu_count()/len(infiles) if self.nprocs == -1 else self.nprocs/len(infiles) + ) for i in infiles] + + def setFlags(self, flags): + 'adds flags that will be added on the fly to the generator, flags are a list of iterables' + if len(flags) != len(self.collections): + raise ValueError('The flags must be the same lenght of the input collections') + self.flags = flags + + def getInputShapes(self): + 'Gets the input shapes from the data class description' + shapes = [i.getInputShapes() for i in self.collections] + if not all(i == shapes[0] for i in shapes): + raise ValueError('Input collections have different input shapes!') + return shapes[0] + + def getTruthShape(self): + shapes = [i.getTruthShape() for i in self.collections] + if not all(i == shapes[0] for i in shapes): + raise ValueError('Input collections have different input shapes!') + return shapes[0] + + def getNRegressionTargets(self): + shapes = [i.getNRegressionTargets() for i in 
self.collections] + if not all(i == shapes[0] for i in shapes): + raise ValueError('Input collections have different input shapes!') + return shapes[0] + + def getNClassificationTargets(self): + shapes = [i.getNClassificationTargets() for i in self.collections] + if not all(i == shapes[0] for i in shapes): + raise ValueError('Input collections have different input shapes!') + return shapes[0] + + def getUsedTruth(self): + shapes = [i.getUsedTruth() for i in self.collections] + if not all(i == shapes[0] for i in shapes): + raise ValueError('Input collections have different input shapes!') + return shapes[0] + + def split(self,ratio): + 'splits the sample into two parts, one is kept as the new collection, the other is returned' + out = [i.split(ratio) for i in self.collections] + retval = deepcopy(self) + retval.collections = out + return retval + + def writeToFile(self, fname): + for idx, i in enumerate(self.collections): + i.writeToFile(fname.replace('.dc', '%d.dc' % idx)) + + def generator(self): + '''Batch generator. Heavily based on the DataCollection one. +Adds flags on the fly at the end of each Y''' + generators = [i.generator() for i in self.collections] + flags = self.flags if self.flags else [None for i in self.collections] + add_ys = self.additional_ys if self.additional_ys else [[] for i in self.collections] + for zipped in izip(*generators): + xtot, wtot, ytot = None, None, None + for xyw, flag, add_y in zip(zipped, flags, add_ys): + if len(xyw) == 3: + x, y, w = deepcopy(xyw) + else: #len(xyw) == 3: + x, y = deepcopy(xyw) + w = [np.ones((x[0].shape[0]))] if self.flags else None + + batch_size = x[0].shape[0] + ones = np.ones((batch_size, 1)) + for template in add_y: + y_to_add = np.hstack([ones*i for i in template]) \ + if hasattr(template, '__iter__') else \ + ones*template + y.append(y_to_add) + + #create the flags + if self.flags: + if len(flag) != len(y): + raise ValueError( + 'Flags (if any) and total Y number MUST' + ' be the same! 
Got: %d and %d' % (len(flag), len(y))) + w = [w[0]*i for i in flag] + + if xtot is None: + xtot = x + ytot = y + wtot = w + else: + xtot = [np.vstack([itot, ix]) for itot, ix in zip(xtot, x)] + ytot = [np.vstack([itot, iy]) for itot, iy in zip(ytot, y)] + if w is not None: + wtot = [np.concatenate([itot, iw]) for itot, iw in zip(wtot, w)] + + if wtot is None: + yield self.generator_modifier((xtot, ytot)) + else: + yield self.generator_modifier((xtot, ytot, wtot)) + + def __len__(self): + return sum(len(i) for i in self.collections) + + @property + def sizes(self): + return [len(i) for i in self.collections] + + @property + def nsamples(self): + return len(self) + + def setBatchSize(self,bsize): + if bsize > len(self): + raise Exception('Batch size must not be bigger than total sample size') + for i in self.collections: + batch = bsize*len(i)/len(self) + i.setBatchSize(batch) + + @property + def batches(self): + return [i.batch_size for i in self.collections] + + def getAvEntriesPerFile(self): + return min(i.getAvEntriesPerFile() for i in self.collections) + + @property + def maxFilesOpen(self): + return max(i.maxFilesOpen for i in self.collections) + + @maxFilesOpen.setter + def maxFilesOpen(self, val): + for i in self.collections: + i.maxFilesOpen = val + + def getNBatchesPerEpoch(self): + return sum(i.getNBatchesPerEpoch() for i in self.collections)/len(self.collections) + + + diff --git a/modules/ReduceLROnPlateau.py b/modules/ReduceLROnPlateau.py index 298a4ea..f276587 100644 --- a/modules/ReduceLROnPlateau.py +++ b/modules/ReduceLROnPlateau.py @@ -1,7 +1,8 @@ ''' Created on 16 Mar 2017 -backported from original keras source to current version +backported from original keras source to current version. +Added more functionality (e.g. intervals) ''' from __future__ import absolute_import from __future__ import print_function @@ -52,7 +53,8 @@ class ReduceLROnPlateau(Callback): """ def __init__(self, monitor='val_loss', factor=0.1, patience=10, - verbose=0, mode='auto', epsilon=1e-4, cooldown=0, min_lr=0): + verbose=0, mode='auto', epsilon=1e-4, cooldown=0, min_lr=0, + nexecuted=0): super(ReduceLROnPlateau, self).__init__() self.monitor = monitor @@ -68,6 +70,7 @@ def __init__(self, monitor='val_loss', factor=0.1, patience=10, self.cooldown_counter = 0 # Cooldown counter. self.wait = 0 self.best = 0 + self.nexecuted=nexecuted self.mode = mode self.monitor_op = None self._reset() @@ -95,6 +98,20 @@ def on_train_begin(self, logs=None): self._reset() def on_epoch_end(self, epoch, logs=None): + + + + thisfactor=self.factor + #factors is list + if (not hasattr(self.factor, "strip") and + hasattr(self.factor, "__getitem__") or + hasattr(self.factor, "__iter__")): + if self.nexecuted0: self.best = current self.wait = 0 elif not self.in_cooldown(): @@ -120,6 +137,7 @@ def on_epoch_end(self, epoch, logs=None): print('\nEpoch %05d: reducing learning rate to %s.' 
% (epoch, new_lr)) self.cooldown_counter = self.cooldown self.wait = 0 + self.nexecuted+=1 self.wait += 1 def in_cooldown(self): diff --git a/modules/TrainData.py b/modules/TrainData.py index 26f9876..31df7be 100644 --- a/modules/TrainData.py +++ b/modules/TrainData.py @@ -6,11 +6,19 @@ from __future__ import print_function + from Weighter import Weighter from pdb import set_trace import numpy import logging +import threading +import multiprocessing + +threadingfileandmem_lock=threading.Lock() +#threadingfileandmem_lock.release() +#multiproc_fileandmem_lock=multiprocessing.Lock() + def fileTimeOut(fileName, timeOut): ''' simple wait function in case the file system has a glitch. @@ -32,20 +40,35 @@ def fileTimeOut(fileName, timeOut): counter+=1 time.sleep(1) -def _read_arrs_(arrwl,arrxl,arryl,doneVal,fileprefix): + +def _read_arrs_(arrwl,arrxl,arryl,doneVal,fileprefix,tdref=None,randomSeed=None): + import gc + gc.collect() + import h5py - idstrs=['w','x','y'] - h5f = h5py.File(fileprefix,'r') - alllists=[arrwl,arrxl,arryl] - for j in range(len(idstrs)): - fidstr=idstrs[j] - arl=alllists[j] - for i in range(len(arl)): - idstr=fidstr+str(i) - h5f[idstr].read_direct(arl[i]) - doneVal.value=True - h5f.close() - + from sklearn.utils import shuffle + try: + idstrs=['w','x','y'] + h5f = h5py.File(fileprefix,'r') + alllists=[arrwl,arrxl,arryl] + for j in range(len(idstrs)): + fidstr=idstrs[j] + arl=alllists[j] + for i in range(len(arl)): + idstr=fidstr+str(i) + h5f[idstr].read_direct(arl[i]) + #shuffle each read-in, but each array with the same seed (keeps right asso) + if randomSeed: + arl[i]=shuffle(arl[i], random_state=randomSeed) + + doneVal.value=True + h5f.close() + del h5f + except Exception as d: + raise d + finally: + if tdref: + tdref.removeRamDiskFile() class ShowProgress(object): @@ -131,17 +154,27 @@ def __init__(self): def __del__(self): self.readIn_abort() + self.clear() def clear(self): self.samplename='' + self.readIn_abort() self.readthread=None self.readdone=None + if hasattr(self, 'x'): + del self.x + del self.y + del self.w + if hasattr(self, 'w_list'): + del self.w_list + del self.x_list + del self.y_list + self.x=[numpy.array([])] self.y=[numpy.array([])] self.w=[numpy.array([])] - self.nsamples=None @@ -206,6 +239,7 @@ def reduceTruth(self, tuple_in=None): return numpy.array(tuple_in.tolist()) def writeOut(self,fileprefix): + import h5py fileTimeOut(fileprefix,120) h5f = h5py.File(fileprefix, 'w') @@ -222,7 +256,13 @@ def _writeoutArrays(arrlist,fidstr,h5F): for i in range(len(arrlist)): idstr=fidstr+str(i) arr=arrlist[i] - h5F.create_dataset(idstr, data=arr, compression="lzf") + if "meta" in fileprefix[-4:]: + from c_readArrThreaded import writeArray + if arr.dtype!='float32': + arr=arr.astype('float32') + writeArray(arr.ctypes.data,fileprefix[:-4]+fidstr+'.'+str(i),list(arr.shape)) + else: + h5F.create_dataset(idstr, data=arr, compression="lzf") arr=numpy.array([self.nsamples],dtype='int') @@ -238,6 +278,8 @@ def _writeoutArrays(arrlist,fidstr,h5F): h5f.close() + + def __createArr(self,shapeinfo): import ctypes @@ -254,14 +296,28 @@ def __createArr(self,shapeinfo): shared_array = shared_array.reshape(shapeinfo) #print('gave shape',shapeinfo) return shared_array - - def readIn_async(self,fileprefix,read_async=True): + + def removeRamDiskFile(self): + if hasattr(self, 'ramdiskfile'): + import os + try: + if self.ramdiskfile and os.path.exists(self.ramdiskfile): + if "meta" in self.ramdiskfile[-4:]: + os.system('rm -f '+self.ramdiskfile[:-4]+"*") + else: + 
os.remove(self.ramdiskfile) + except OSError: + pass + self.ramdiskfile=None + + def readIn_async(self,fileprefix,read_async=True,shapesOnly=False,ramdiskpath='',randomseed=None): if self.readthread and read_async: print('\nTrainData::readIn_async: started new read before old was finished. Intended? Waiting for first to finish...\n') self.readIn_join() #print('read') + import h5py import multiprocessing @@ -286,103 +342,214 @@ def _readListInfo_(idstr): return sharedlist, shapeinfos - try: - self.h5f = h5py.File(fileprefix,'r') - except: - raise IOError('File %s could not be opened properly, it may be corrupted' % fileprefix) - self.nsamples=self.h5f['n'] - self.nsamples=self.nsamples[0] - if True or not hasattr(self, 'w_shapes'): - self.w_list,self.w_shapes=_readListInfo_('w') - self.x_list,self.x_shapes=_readListInfo_('x') - self.y_list,self.y_shapes=_readListInfo_('y') - else: - print('\nshape known\n') - self.w_list,_=_readListInfo_('w') - self.x_list,_=_readListInfo_('x') - self.y_list,_=_readListInfo_('y') - - self.h5f.close() - - #create shared mem in sync mode - for i in range(len(self.w_list)): - self.w_list[i]=self.__createArr(self.w_shapes[i]) + with threadingfileandmem_lock: + try: + self.h5f = h5py.File(fileprefix,'r') + except: + raise IOError('File %s could not be opened properly, it may be corrupted' % fileprefix) + self.nsamples=self.h5f['n'] + self.nsamples=self.nsamples[0] + if True or not hasattr(self, 'w_shapes'): + self.w_list,self.w_shapes=_readListInfo_('w') + self.x_list,self.x_shapes=_readListInfo_('x') + self.y_list,self.y_shapes=_readListInfo_('y') + else: + print('\nshape known\n') + self.w_list,_=_readListInfo_('w') + self.x_list,_=_readListInfo_('x') + self.y_list,_=_readListInfo_('y') + + self.h5f.close() + del self.h5f + self.h5f=None + if shapesOnly: + return - for i in range(len(self.x_list)): - self.x_list[i]=self.__createArr(self.x_shapes[i]) + readfile=fileprefix - for i in range(len(self.y_list)): - self.y_list[i]=self.__createArr(self.y_shapes[i]) + isRamDisk=len(ramdiskpath)>0 + if isRamDisk: + import shutil + import uuid + import os + import copy + unique_filename='' + + unique_filename = ramdiskpath+'/'+str(uuid.uuid4())+'.z' + if "meta" in readfile[-4:]: + filebase=readfile[:-4] + unique_filename = ramdiskpath+'/'+str(uuid.uuid4()) + shutil.copyfile(filebase+'meta',unique_filename+'.meta') + for i in range(len(self.w_list)): + shutil.copyfile(filebase+'w.'+str(i),unique_filename+'.w.'+str(i)) + for i in range(len(self.x_list)): + shutil.copyfile(filebase+'x.'+str(i),unique_filename+'.x.'+str(i)) + for i in range(len(self.y_list)): + shutil.copyfile(filebase+'y.'+str(i),unique_filename+'.y.'+str(i)) + unique_filename+='.meta' + + else: + unique_filename = ramdiskpath+'/'+str(uuid.uuid4())+'.z' + shutil.copyfile(fileprefix, unique_filename) + readfile=unique_filename + self.ramdiskfile=readfile + #create shared mem in sync mode + for i in range(len(self.w_list)): + self.w_list[i]=self.__createArr(self.w_shapes[i]) + + for i in range(len(self.x_list)): + self.x_list[i]=self.__createArr(self.x_shapes[i]) + + for i in range(len(self.y_list)): + self.y_list[i]=self.__createArr(self.y_shapes[i]) + + if read_async: + self.readdone=multiprocessing.Value('b',False) + if read_async: - self.readdone=multiprocessing.Value('b',False) - self.readthread=multiprocessing.Process(target=_read_arrs_, args=(self.w_list,self.x_list,self.y_list,self.readdone,fileprefix)) - self.readthread.start() + if "meta" in readfile[-4:]: + #new format + from c_readArrThreaded 
import startReading + self.readthreadids=[] + filebase=readfile[:-4] + for i in range(len(self.w_list)): + self.readthreadids.append(startReading(self.w_list[i].ctypes.data, + filebase+'w.'+str(i), + list(self.w_list[i].shape), + isRamDisk)) + for i in range(len(self.x_list)): + self.readthreadids.append(startReading(self.x_list[i].ctypes.data, + filebase+'x.'+str(i), + list(self.x_list[i].shape), + isRamDisk)) + for i in range(len(self.y_list)): + self.readthreadids.append(startReading(self.y_list[i].ctypes.data, + filebase+'y.'+str(i), + list(self.y_list[i].shape), + isRamDisk)) + + + else: + self.readthread=multiprocessing.Process(target=_read_arrs_, + args=(self.w_list, + self.x_list, + self.y_list, + self.readdone, + readfile, + self,randomseed)) + self.readthread.start() else: - self.readdone=multiprocessing.Value('b',False) - _read_arrs_(self.w_list,self.x_list,self.y_list,self.readdone,fileprefix) + if "meta" in readfile[-4:]: + from c_readArrThreaded import readBlocking + filebase=readfile[:-4] + self.readthreadids=[] + for i in range(len(self.w_list)): + (readBlocking(self.w_list[i].ctypes.data, + filebase+'w.'+str(i), + list(self.w_list[i].shape), + isRamDisk)) + for i in range(len(self.x_list)): + (readBlocking(self.x_list[i].ctypes.data, + filebase+'x.'+str(i), + list(self.x_list[i].shape), + isRamDisk)) + for i in range(len(self.y_list)): + (readBlocking(self.y_list[i].ctypes.data, + filebase+'y.'+str(i), + list(self.y_list[i].shape), + isRamDisk)) + + else: + self.readdone=multiprocessing.Value('b',False) + _read_arrs_(self.w_list,self.x_list,self.y_list,self.readdone,readfile,self,randomseed) - - def readIn_async_NEW(self,fileprefix): - - #if self.readthread: - # print('\nTrainData::readIn_async: started new read before old was finished. Intended? Waiting for first to finish...\n') - # self.readIn_join() - # - # - #import multiprocessing - # - #self.samplename=fileprefix - #self.readqueue=multiprocessing.Queue() - # - #self.readdone=multiprocessing.Value('b',False) - #self.readthread=multiprocessing.Process(target=_read_arrs, args=(fileprefix,self.readdone,self.readqueue)) - #self.readthread.start() - # - print('\nstarted async thread\n') - - - #def __del__(self): - # self.readIn_abort() - def readIn_abort(self): + self.removeRamDiskFile() if not self.readthread: return self.readthread.terminate() self.readthread=None self.readdone=None - def readIn_join(self,wasasync=True): - #print('joining async read') - if not self.readthread and wasasync: - print('\nreadIn_join:read never started\n') + def readIn_join(self,wasasync=True,waitforStart=True): - counter=0 - while not self.readdone.value and wasasync: - self.readthread.join(1) - counter+=1 - if counter>10: #read failed. 
do synchronous read - print('\nfalling back to sync read\n') - self.readthread.terminate() - self.readthread=None - self.readIn(self.samplename) - return - if self.readdone.value: - self.readthread.join(1) + try: + if not not hasattr(self, 'readthreadids') and not waitforStart and not self.readthread and wasasync: + print('\nreadIn_join:read never started\n') + + import time + if waitforStart: + while (not hasattr(self, 'readthreadids')) and not self.readthread: + time.sleep(0.1) + if hasattr(self, 'readthreadids'): + while not self.readthreadids: + time.sleep(0.1) + + counter=0 + + if hasattr(self, 'readthreadids') and self.readthreadids: + from c_readArrThreaded import isDone + doneids=[] + while True: + for id in self.readthreadids: + if id in doneids: continue + if isDone(id): + doneids.append(id) + if len(self.readthreadids) == len(doneids): + break + time.sleep(0.1) + counter+=1 + if counter>3000: #read failed. do synchronous read, safety option if threads died + print('\nfalling back to sync read\n') + self.readthread.terminate() + self.readthread=None + self.readIn(self.samplename) + return - import copy - #move away from shared memory - #this costs performance but seems necessary - self.w=copy.deepcopy(self.w_list) - del self.w_list - self.x=copy.deepcopy(self.x_list) - del self.x_list - self.y=copy.deepcopy(self.y_list) - del self.y_list - + else: #will be removed at some point + while wasasync and (not self.readdone or not self.readdone.value): + if not self.readthread: + time.sleep(.1) + continue + self.readthread.join(.1) + counter+=1 + if counter>3000: #read failed. do synchronous read, safety option if threads died + print('\nfalling back to sync read\n') + self.readthread.terminate() + self.readthread=None + self.readIn(self.samplename) + return + if self.readdone.value: + self.readthread.join(.1) + + import copy + #move away from shared memory + #this costs performance but seems necessary + direct=False + with threadingfileandmem_lock: + if direct: + self.w=self.w_list + self.x=self.x_list + self.y=self.y_list + else: + self.w=copy.deepcopy(self.w_list) + self.x=copy.deepcopy(self.x_list) + self.y=copy.deepcopy(self.y_list) + + del self.w_list + del self.x_list + del self.y_list + #in case of some errors during read-in + + except Exception as d: + raise d + finally: + self.removeRamDiskFile() + #check if this is really neccessary def reshape_fast(arr,shapeinfo): if len(shapeinfo)<2: shapeinfo=numpy.array([arr.shape[0],1]) @@ -400,24 +567,39 @@ def reshape_fast(arr,shapeinfo): self.w_list=None self.x_list=None self.y_list=None - if wasasync: + if wasasync and self.readthread: self.readthread.terminate() self.readthread=None self.readdone=None - def readIn(self,fileprefix): - self.readIn_async(fileprefix,False) - self.w=(self.w_list) - self.x=(self.x_list) - self.y=(self.y_list) + def readIn(self,fileprefix,shapesOnly=False): + self.readIn_async(fileprefix,False,shapesOnly) + direct=True + if direct: + self.w=self.w_list + self.x=self.x_list + self.y=self.y_list + else: + import copy + self.w=copy.deepcopy(self.w_list) + del self.w_list + self.x=copy.deepcopy(self.x_list) + del self.x_list + self.y=copy.deepcopy(self.y_list) + del self.y_list def reshape_fast(arr,shapeinfo): if len(shapeinfo)<2: shapeinfo=numpy.array([arr.shape[0],1]) - arr=arr.reshape(shapeinfo) + if shapesOnly: + arr=numpy.zeros(shape=shapeinfo) + else: + arr=arr.reshape(shapeinfo) return arr + + for i in range(len(self.w)): self.w[i]=reshape_fast(self.w[i],self.w_shapes[i]) for i in range(len(self.x)): @@ 
-575,6 +757,33 @@ def getFlavourClassificationData(self,filename,TupleMeanStd, weighter): #print('took in total ', swall.getAndReset(),' seconds for conversion') return weights,x_all,alltruth, notremoves + + def _normalize_input_(self, weighter, npy_array): + weights = None + if self.weight: + weights=weighter.getJetWeights(npy_array) + self.w = [weights for _ in self.y] + elif self.remove: + notremoves=weighter.createNotRemoveIndices(npy_array) + undef=npy_array['isUndefined'] + notremoves-=undef + print(' to created remove indices') + weights=notremoves + + print('remove') + self.x = [x[notremoves > 0] for x in self.x] + self.y = [y[notremoves > 0] for y in self.y] + weights=weights[notremoves > 0] + self.w = [weights for _ in self.y] + newnsamp=self.x[0].shape[0] + print('reduced content to ', int(float(newnsamp)/float(self.nsamples)*100),'%') + self.nsamples = newnsamp + else: + print('neither remove nor weight') + weights=numpy.empty(self.nsamples) + weights.fill(1.) + self.w = [weights for _ in self.y] + from preprocessing import MeanNormApply, MeanNormZeroPad @@ -608,20 +817,24 @@ def reduceTruth(self, tuple_in): self.reducedtruthclasses=['isB','isBB','isC','isUDSG'] if tuple_in is not None: b = tuple_in['isB'].view(numpy.ndarray) - bb = tuple_in['isBB'].view(numpy.ndarray) bl = tuple_in['isLeptonicB'].view(numpy.ndarray) blc = tuple_in['isLeptonicB_C'].view(numpy.ndarray) allb = b+bl+blc - + + bb = tuple_in['isBB'].view(numpy.ndarray) + gbb = tuple_in['isGBB'].view(numpy.ndarray) c = tuple_in['isC'].view(numpy.ndarray) + cc = tuple_in['isCC'].view(numpy.ndarray) + gcc = tuple_in['isGCC'].view(numpy.ndarray) ud = tuple_in['isUD'].view(numpy.ndarray) s = tuple_in['isS'].view(numpy.ndarray) uds=ud+s g = tuple_in['isG'].view(numpy.ndarray) l = g + uds - return numpy.vstack((allb,bb,c,l)).transpose() + + return numpy.vstack((allb,bb+gbb,c+cc+gcc,l)).transpose() class TrainData_leptTruth(TrainData): diff --git a/modules/TrainData_PT_recur.py b/modules/TrainData_PT_recur.py index 8cbad03..b15ae56 100644 --- a/modules/TrainData_PT_recur.py +++ b/modules/TrainData_PT_recur.py @@ -7,7 +7,7 @@ class TrainData_QG_simple(TrainData_fullTruth): def __init__(self): super(TrainData_QG_simple, self).__init__() - self.addBranches(['QG_ptD', 'QG_axis2', 'QG_mult']) + self.addBranches(['jet_pt', 'jet_eta', 'rho', 'QG_ptD', 'QG_axis2', 'QG_mult']) def readFromRootFile(self,filename,TupleMeanStd, weighter): from preprocessing import MeanNormZeroPad @@ -95,13 +95,13 @@ def __init__(self): Constructor ''' super(TrainData_PT_recur, self).__init__() - + self.regressiontargetclasses=['uncPt','Pt'] self.addBranches([ #base - 'jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','npv', + 'jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','rho', #q/g enhancements - 'QG_ptD', 'QG_axis2', 'QG_mult', + #'QG_ptD', 'QG_axis2', 'QG_mult', ]) self.addBranches([ @@ -333,13 +333,13 @@ def __init__(self): Constructor ''' super(TrainData_recurrent_fullTruth, self).__init__() - + self.regressiontargetclasses=['uncPt','Pt'] self.addBranches([ #base - 'jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','npv', + 'jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','rho', #q/g enhancements - 'QG_ptD', 'QG_axis2', 'QG_mult', + #'QG_ptD', 'QG_axis2', 'QG_mult', ]) self.addBranches([ @@ -410,7 +410,6 @@ def readFromRootFile(self,filename,TupleMeanStd, weighter): if self.remove: notremoves=weighter.createNotRemoveIndices(nparray) undef=nparray['isUndefined'] - hf = np_slice.any(axis=1) notremoves -= undef print('took ', sw.getAndReset(), ' to create 
remove indices') diff --git a/modules/TrainData_deepCSV.py b/modules/TrainData_deepCSV.py index 60e0cca..5e5ea13 100644 --- a/modules/TrainData_deepCSV.py +++ b/modules/TrainData_deepCSV.py @@ -3,7 +3,7 @@ @author: jkiesele ''' -from TrainData import TrainData_Flavour, TrainData_simpleTruth +from TrainData import TrainData_Flavour, TrainData_simpleTruth, TrainData_fullTruth, fileTimeOut @@ -49,7 +49,139 @@ def __init__(self): 'TagVarCSV_flightDistance3dSig'], 1) + def readFromRootFile(self,filename,TupleMeanStd, weighter): + super(TrainData_deepCSV, self).readFromRootFile(filename, TupleMeanStd, weighter) + ys = self.y[0] + flav_sum = ys.sum(axis=1) + if (flav_sum > 1).any(): + raise ValueError('In file: %s I get a jet with multiple flavours assigned!' % filename) + mask = (flav_sum == 1) + self.x = [self.x[0][mask]] + self.y = [self.y[0][mask]] + self.w = [self.w[0][mask]] + +class TrainData_deepCSV_RNN(TrainData_fullTruth): + ''' + same as TrainData_deepCSV but with 4 truth labels: B BB C UDSG + ''' + def __init__(self): + ''' + Constructor + ''' + super(TrainData_deepCSV_RNN, self).__init__() + + self.addBranches([ + 'jet_pt', 'jet_eta', + 'TagVarCSV_jetNSecondaryVertices', + 'TagVarCSV_trackSumJetEtRatio', 'TagVarCSV_trackSumJetDeltaR', + 'TagVarCSV_vertexCategory', 'TagVarCSV_trackSip2dValAboveCharm', + 'TagVarCSV_trackSip2dSigAboveCharm', 'TagVarCSV_trackSip3dValAboveCharm', + 'TagVarCSV_trackSip3dSigAboveCharm', 'TagVarCSV_jetNSelectedTracks', + 'TagVarCSV_jetNTracksEtaRel']) + + self.addBranches([ + 'TagVarCSVTrk_trackJetDistVal', + 'TagVarCSVTrk_trackPtRel', + 'TagVarCSVTrk_trackDeltaR', + 'TagVarCSVTrk_trackPtRatio', + 'TagVarCSVTrk_trackSip3dSig', + 'TagVarCSVTrk_trackSip2dSig', + 'TagVarCSVTrk_trackDecayLenVal' + ], 6) + + + self.addBranches(['TagVarCSV_trackEtaRel'],4) + + self.addBranches([ + 'TagVarCSV_vertexMass', + 'TagVarCSV_vertexNTracks', + 'TagVarCSV_vertexEnergyRatio', + 'TagVarCSV_vertexJetDeltaR', + 'TagVarCSV_flightDistance2dVal', + 'TagVarCSV_flightDistance2dSig', + 'TagVarCSV_flightDistance3dVal', + 'TagVarCSV_flightDistance3dSig' + ], 1) + + self.addBranches(['jet_corr_pt']) + self.registerBranches(['gen_pt_WithNu']) + self.regressiontargetclasses=['uncPt','Pt'] + + + def readFromRootFile(self,filename,TupleMeanStd, weighter): + from preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles + import numpy + from stopwatch import stopwatch + + sw=stopwatch() + swall=stopwatch() + + import ROOT + + fileTimeOut(filename,120) #give eos a minute to recover + rfile = ROOT.TFile(filename) + tree = rfile.Get("deepntuplizer/tree") + self.nsamples=tree.GetEntries() + + print('took ', sw.getAndReset(), ' seconds for getting tree entries') + + + # split for convolutional network + + x_global = MeanNormZeroPad( + filename,None, + [self.branches[0]], + [self.branchcutoffs[0]],self.nsamples + ) + + x_cpf = MeanNormZeroPadParticles( + filename,None, + self.branches[1], + self.branchcutoffs[1],self.nsamples + ) + + x_etarel = MeanNormZeroPadParticles( + filename,None, + self.branches[2], + self.branchcutoffs[2],self.nsamples + ) + + x_sv = MeanNormZeroPadParticles( + filename,None, + self.branches[3], + self.branchcutoffs[3],self.nsamples + ) + + print('took ', sw.getAndReset(), ' seconds for mean norm and zero padding (C module)') + + npy_array = self.readTreeFromRootToTuple(filename) + + reg_truth=npy_array['gen_pt_WithNu'].view(numpy.ndarray) + reco_pt=npy_array['jet_corr_pt'].view(numpy.ndarray) + + correctionfactor=numpy.zeros(self.nsamples) + for i in 
range(self.nsamples): + correctionfactor[i]=reg_truth[i]/reco_pt[i] + + truthtuple = npy_array[self.truthclasses] + alltruth=self.reduceTruth(truthtuple) + + self.x=[x_global, x_cpf, x_etarel, x_sv, reco_pt] + self.y=[alltruth,correctionfactor] + self._normalize_input_(weighter, npy_array) + +class TrainData_deepCSV_RNN_Deeper(TrainData_deepCSV_RNN): + ''' + same as TrainData_deepCSV but with 4 truth labels: B BB C UDSG + ''' + def __init__(self): + ''' + Constructor + ''' + super(TrainData_deepCSV_RNN_Deeper, self).__init__() + self.branchcutoffs = [1, 20, 13, 4, 1] + diff --git a/modules/TrainData_deepFlavour.py b/modules/TrainData_deepFlavour.py index b7eff83..db354eb 100644 --- a/modules/TrainData_deepFlavour.py +++ b/modules/TrainData_deepFlavour.py @@ -962,8 +962,9 @@ def __init__(self): Constructor ''' super(TrainData_image,self).__init__() + self.regressiontargetclasses=['uncPt','Pt'] - self.addBranches(['jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','npv']) + self.addBranches(['jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','rho']) self.registerBranches(['Cpfcan_ptrel','Cpfcan_eta','Cpfcan_phi', 'Npfcan_ptrel','Npfcan_eta','Npfcan_phi', 'nCpfcand','nNpfcand', @@ -1003,7 +1004,7 @@ def readFromRootFile(self,filename,TupleMeanStd, weighter): self.nsamples, ['Cpfcan_eta','jet_eta',20,0.5], ['Cpfcan_phi','jet_phi',20,0.5], - 'nCpfcand',-1) + 'nCpfcand',-1, weightbranch='Cpfcan_puppiw') x_chcount = createCountMap(filename,TupleMeanStd, self.nsamples, @@ -1016,7 +1017,7 @@ def readFromRootFile(self,filename,TupleMeanStd, weighter): self.nsamples, ['Npfcan_eta','jet_eta',20,0.5], ['Npfcan_phi','jet_phi',20,0.5], - 'nNpfcand',-1) + 'nNpfcand',-1, weightbranch='Npfcan_puppiw') x_neucount = createCountMap(filename,TupleMeanStd, self.nsamples, @@ -1123,3 +1124,160 @@ def model(input_shapes, nclasses): ] return Model(inputs=inputs, outputs=predictions) + + +class TrainData_deepFlavour_cleaninput(TrainData_deepFlavour_FT_reg_noScale): + def __init__(self): + ''' + Constructor + ''' + super(TrainData_deepFlavour_cleaninput, self).__init__() + self.branches[1].remove('Cpfcan_quality') + self.branches[1].append('Cpfcan_lostInnerHits')#Cpfcan_numberOfPixelHits + +class TrainData_deepFlavour_cleanBTVOnly(TrainData_fullTruth): + def __init__(self): + ''' + Constructor + ''' + super(TrainData_deepFlavour_cleanBTVOnly, self).__init__() + ## ---- GLOBAL ---- + ## 'jetPt': False, #$ + ## 'jetAbsEta': False, #$ + ## 'jetNSecondaryVertices': False, #$ + ## 'jetNSelectedTracks': False, #$ + ## 'jetNTracksEtaRel': False, #$ + ## 'vertexCategory': False, #$ + ## 'trackSip3dValAboveCharm': False, #$ + ## 'trackSip2dSigAboveCharm': False, #$ + ## 'trackSip2dValAboveCharm': False, #$ + ## 'trackSip3dSigAboveCharm': False #$ + + ## 'trackSumJetEtRatio': False, #$ + ## 'trackSumJetDeltaR': False, #$ + + ## ---- VERTEX ---- + ## 'vertexJetDeltaR': False, #$ + ## 'vertexMass': False, #$ + ## 'vertexNTracks': False, #$ + ## 'vertexEnergyRatio': False, #$ + ## 'flightDistance3dSig': False, #$ + ## 'flightDistance3dVal': False, #$ + ## 'flightDistance2dVal': False, #$ + ## 'flightDistance2dSig': False, #$ + + ## ---- TRACKS ---- + ## 'trackEtaRel': True, #$ + ## 'trackDecayLenVal': True, + ## 'trackJetDist': True, #$ + ## 'trackPtRatio': True, #$ + ## 'trackDeltaR': True, #$ + ## 'trackSip2dSig': True, #$ + ## 'trackPtRel': True, #$ + ## 'trackSip3dSig': True, #$ + self.addBranches([ + 'jet_pt', 'jet_eta', #$ + 'TagVarCSV_jetNSecondaryVertices', #$ + 'TagVarCSV_trackSumJetEtRatio', #$ + 'TagVarCSV_trackSumJetDeltaR', 
+ 'TagVarCSV_vertexCategory', #$ + 'TagVarCSV_trackSip2dValAboveCharm', #$ + 'TagVarCSV_trackSip2dSigAboveCharm', #$ + 'TagVarCSV_trackSip3dValAboveCharm', #$ + 'TagVarCSV_trackSip3dSigAboveCharm', #$ + 'TagVarCSV_jetNSelectedTracks', #$ + 'TagVarCSV_jetNTracksEtaRel' #$ + ]) + + self.addBranches([ + 'Cpfcan_BtagPf_trackEtaRel', #$ + 'Cpfcan_BtagPf_trackPtRel', #$ + 'Cpfcan_BtagPf_trackDeltaR', #$ + 'Cpfcan_BtagPf_trackSip2dSig', #$ + 'Cpfcan_BtagPf_trackSip3dSig', #$ + 'Cpfcan_BtagPf_trackJetDistVal', #$ + 'Cpfcan_ptrel', #$ + #'Cpfcan_BtagPf_trackJetDistSig', #? + #trackDecayLenVal #? + ], 20) + + self.addBranches([ + 'sv_deltaR', #$ + 'sv_mass', #$ + 'sv_ntracks', #$ + 'sv_dxy', #$ + 'sv_dxysig', #$ + 'sv_d3d', #$ + 'sv_d3dsig', #$ + 'sv_enratio', #$ + ], 4) + + self.addBranches(['jet_corr_pt']) + self.registerBranches(['gen_pt_WithNu']) + self.regressiontargetclasses=['uncPt','Pt'] + + def readFromRootFile(self,filename,TupleMeanStd, weighter): + from preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles + import numpy + from stopwatch import stopwatch + + sw=stopwatch() + swall=stopwatch() + + import ROOT + + fileTimeOut(filename,120) #give eos a minute to recover + rfile = ROOT.TFile(filename) + tree = rfile.Get("deepntuplizer/tree") + self.nsamples=tree.GetEntries() + + print('took ', sw.getAndReset(), ' seconds for getting tree entries') + + + # split for convolutional network + + x_global = MeanNormZeroPad( + filename,None, + [self.branches[0]], + [self.branchcutoffs[0]],self.nsamples + ) + + x_cpf = MeanNormZeroPadParticles( + filename,None, + self.branches[1], + self.branchcutoffs[1],self.nsamples + ) + + x_sv = MeanNormZeroPadParticles( + filename,None, + self.branches[2], + self.branchcutoffs[2],self.nsamples + ) + + print('took ', sw.getAndReset(), ' seconds for mean norm and zero padding (C module)') + + npy_array = self.readTreeFromRootToTuple(filename) + + reg_truth=npy_array['gen_pt_WithNu'].view(numpy.ndarray) + reco_pt=npy_array['jet_corr_pt'].view(numpy.ndarray) + + correctionfactor=numpy.zeros(self.nsamples) + for i in range(self.nsamples): + correctionfactor[i]=reg_truth[i]/reco_pt[i] + + truthtuple = npy_array[self.truthclasses] + alltruth=self.reduceTruth(truthtuple) + + self.x=[x_global, x_cpf, x_sv, reco_pt] + self.y=[alltruth,correctionfactor] + self._normalize_input_(weighter, npy_array) + + +class TrainData_deepFlavour_nopuppi(TrainData_deepFlavour_FT_reg_noScale): + def __init__(self): + ''' + Constructor FIXME + ''' + super(TrainData_deepFlavour_nopuppi, self).__init__() + self.branches[1].remove('Cpfcan_puppiw') + self.branches[2].remove('Npfcan_puppiw') diff --git a/modules/TrainData_domainAda.py b/modules/TrainData_domainAda.py new file mode 100644 index 0000000..5785130 --- /dev/null +++ b/modules/TrainData_domainAda.py @@ -0,0 +1,23 @@ +''' +Created on 21 Feb 2017 + +@author: jkiesele +''' +from TrainData import TrainData_Flavour, TrainData_simpleTruth, TrainData_fullTruth, fileTimeOut + + + +class TrainData_sampleCheck(TrainData_Flavour, TrainData_simpleTruth): + ''' + Simple train data to check we are not introducing and bias due to sample + differences + ''' + + def __init__(self): + ''' + Constructor + ''' + TrainData_Flavour.__init__(self) + self.addBranches(['jet_pt', 'jet_eta', 'jet_phi']) + + diff --git a/modules/TrainData_test.py b/modules/TrainData_test.py index 02c744c..1364740 100644 --- a/modules/TrainData_test.py +++ b/modules/TrainData_test.py @@ -1,61 +1,77 @@ from TrainData import TrainData, fileTimeOut -from 
TrainData import TrainData_simpleTruth import numpy -class TrainData_test(TrainData_simpleTruth): +class TrainData_test(TrainData): ''' - class to show and test the density map + class to make tests. + Generates random data regardless of the input files + Can be useful for testing technical things ''' def __init__(self): + import numpy TrainData.__init__(self) - self.addBranches(['jet_pt', 'jet_eta','nCpfcand','nNpfcand','nsv','npv']) - + self.remove=False + + #define truth: + self.undefTruth=[''] + + self.truthclasses=['isClassA', + 'isClassB'] + + self.regressiontargetclasses=['reg'] + + self.reduceTruth() + def produceBinWeighter(self,filenames): + return self.make_empty_weighter() + + def produceMeansFromRootFile(self,files,limit): + return numpy.array([1]) + + def reduceTruth(self, tuple_in=None): + self.reducedtruthclasses=['isClassA', + 'isClassB'] + def readFromRootFile(self,filename,TupleMeanStd, weighter): - from preprocessing import MeanNormApply, MeanNormZeroPad, MeanNormZeroPadParticles,createDensityMap - import numpy - from stopwatch import stopwatch - - sw=stopwatch() - swall=stopwatch() - - import ROOT - - fileTimeOut(filename,120) #give eos a minute to recover - rfile = ROOT.TFile(filename) - tree = rfile.Get("deepntuplizer/tree") - self.nsamples=tree.GetEntries() - - - x_ch = createDensityMap(filename,TupleMeanStd, - 'Cpfcan_erel', - self.nsamples, - ['Cpfcan_eta','jet_eta',7,0.5], - ['Cpfcan_phi','jet_phi',7,0.5], - 'nCpfcand',-1) - x_neu = createDensityMap(filename,TupleMeanStd, - 'Npfcan_erel', - self.nsamples, - ['Npfcan_eta','jet_eta',7,0.5], - ['Npfcan_phi','jet_phi',7,0.5], - 'nNpfcand',-1) - x_sv = createDensityMap(filename,TupleMeanStd, - 'LooseIVF_sv_enratio', - self.nsamples, - ['LooseIVF_sv_eta','jet_eta',5,0.3], - ['LooseIVF_sv_phi','jet_phi',5,0.3], - 'LooseIVF_nsv') - - for i in range(20): - print(x_sv[i]) - - self.w=[numpy.zeros(10)] - self.x=[x_ch,x_neu] - self.y=[numpy.zeros(10)] + + import hashlib + m = hashlib.md5() + seed=int(int(m.hexdigest(),16)/1e36) + + + + numpy.random.seed(seed) + ya=numpy.random.random_integers(0,1,10000) + yb=numpy.zeros(10000) + yb=yb+1 + yb=yb-ya + y=numpy.vstack((ya,yb)) + yclass=y.transpose() + + numpy.random.seed(seed) + xreg=numpy.random.randn(10000) + numpy.random.seed(seed) + y=numpy.random.randn(10000)/2 + y=y+1 + w=numpy.zeros(10000) + w=w+1 + + xclass=y+ya + + print(xclass.shape) + print(xreg.shape) + print(yclass.shape) + print(y.shape) + + self.nsamples=10000 + + self.w=[w,w] + self.x=[xclass,xreg] + self.y=[yclass,y] diff --git a/modules/interface/quicklz.h b/modules/interface/quicklz.h new file mode 100644 index 0000000..1c153c8 --- /dev/null +++ b/modules/interface/quicklz.h @@ -0,0 +1,150 @@ +#ifndef QLZ_HEADER +#define QLZ_HEADER + +// Fast data compression library +// Copyright (C) 2006-2011 Lasse Mikkel Reinhold +// lar@quicklz.com +// +// QuickLZ can be used for free under the GPL 1, 2 or 3 license (where anything +// released into public must be open source) or under a commercial license if such +// has been acquired (see http://www.quicklz.com/order.html). The commercial license +// does not cover derived or ported versions created by third parties under GPL. + +// You can edit following user settings. Data must be decompressed with the same +// setting of QLZ_COMPRESSION_LEVEL and QLZ_STREAMING_BUFFER as it was compressed +// (see manual). If QLZ_STREAMING_BUFFER > 0, scratch buffers must be initially +// zeroed out (see manual). 
First #ifndef makes it possible to define settings from +// the outside like the compiler command line. + +// 1.5.0 final + +#ifndef QLZ_COMPRESSION_LEVEL + + // 1 gives fastest compression speed. 3 gives fastest decompression speed and best + // compression ratio. + //#define QLZ_COMPRESSION_LEVEL 1 + //#define QLZ_COMPRESSION_LEVEL 2 + #define QLZ_COMPRESSION_LEVEL 3 + + // If > 0, zero out both states prior to first call to qlz_compress() or qlz_decompress() + // and decompress packets in the same order as they were compressed + //#define QLZ_STREAMING_BUFFER 0 + //#define QLZ_STREAMING_BUFFER 100000 + #define QLZ_STREAMING_BUFFER 1000000 + + // Guarantees that decompression of corrupted data cannot crash. Decreases decompression + // speed 10-20%. Compression speed not affected. + //#define QLZ_MEMORY_SAFE +#endif + +#define QLZ_VERSION_MAJOR 1 +#define QLZ_VERSION_MINOR 5 +#define QLZ_VERSION_REVISION 0 + +// Using size_t, memset() and memcpy() +#include + +// Verify compression level +#if QLZ_COMPRESSION_LEVEL != 1 && QLZ_COMPRESSION_LEVEL != 2 && QLZ_COMPRESSION_LEVEL != 3 +#error QLZ_COMPRESSION_LEVEL must be 1, 2 or 3 +#endif + +typedef unsigned int ui32; +typedef unsigned short int ui16; + +// Decrease QLZ_POINTERS for level 3 to increase compression speed. Do not touch any other values! +#if QLZ_COMPRESSION_LEVEL == 1 +#define QLZ_POINTERS 1 +#define QLZ_HASH_VALUES 4096 +#elif QLZ_COMPRESSION_LEVEL == 2 +#define QLZ_POINTERS 4 +#define QLZ_HASH_VALUES 2048 +#elif QLZ_COMPRESSION_LEVEL == 3 +#define QLZ_POINTERS 16 +#define QLZ_HASH_VALUES 4096 +#endif + +// Detect if pointer size is 64-bit. It's not fatal if some 64-bit target is not detected because this is only for adding an optional 64-bit optimization. +#if defined _LP64 || defined __LP64__ || defined __64BIT__ || _ADDR64 || defined _WIN64 || defined __arch64__ || __WORDSIZE == 64 || (defined __sparc && defined __sparcv9) || defined __x86_64 || defined __amd64 || defined __x86_64__ || defined _M_X64 || defined _M_IA64 || defined __ia64 || defined __IA64__ + #define QLZ_PTR_64 +#endif + +// hash entry +typedef struct +{ +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 cache; +#if defined QLZ_PTR_64 && QLZ_STREAMING_BUFFER == 0 + unsigned int offset; +#else + const unsigned char *offset; +#endif +#else + const unsigned char *offset[QLZ_POINTERS]; +#endif + +} qlz_hash_compress; + +typedef struct +{ +#if QLZ_COMPRESSION_LEVEL == 1 + const unsigned char *offset; +#else + const unsigned char *offset[QLZ_POINTERS]; +#endif +} qlz_hash_decompress; + + +// states +typedef struct +{ + #if QLZ_STREAMING_BUFFER > 0 + unsigned char stream_buffer[QLZ_STREAMING_BUFFER]; + #endif + size_t stream_counter; + qlz_hash_compress hash[QLZ_HASH_VALUES]; + unsigned char hash_counter[QLZ_HASH_VALUES]; +} qlz_state_compress; + + +#if QLZ_COMPRESSION_LEVEL == 1 || QLZ_COMPRESSION_LEVEL == 2 + typedef struct + { +#if QLZ_STREAMING_BUFFER > 0 + unsigned char stream_buffer[QLZ_STREAMING_BUFFER]; +#endif + qlz_hash_decompress hash[QLZ_HASH_VALUES]; + unsigned char hash_counter[QLZ_HASH_VALUES]; + size_t stream_counter; + } qlz_state_decompress; +#elif QLZ_COMPRESSION_LEVEL == 3 + typedef struct + { +#if QLZ_STREAMING_BUFFER > 0 + unsigned char stream_buffer[QLZ_STREAMING_BUFFER]; +#endif +#if QLZ_COMPRESSION_LEVEL <= 2 + qlz_hash_decompress hash[QLZ_HASH_VALUES]; +#endif + size_t stream_counter; + } qlz_state_decompress; +#endif + + +#if defined (__cplusplus) +extern "C" { +#endif + +// Public functions of QuickLZ +size_t qlz_size_decompressed(const char 
*source); +size_t qlz_size_compressed(const char *source); +size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state); +size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state); +int qlz_get_setting(int setting); + +#if defined (__cplusplus) +} +#endif + +#endif + diff --git a/modules/interface/rocCurveCollection.h b/modules/interface/rocCurveCollection.h index 2b02f0d..8c7dd78 100644 --- a/modules/interface/rocCurveCollection.h +++ b/modules/interface/rocCurveCollection.h @@ -15,10 +15,11 @@ #include "TFile.h" #include "TCanvas.h" #include +class TLatex; class rocCurveCollection{ public: - rocCurveCollection():leg_(0),linewidth_(2),cmsstyle_(false),logy_(true){} + rocCurveCollection():leg_(0),linewidth_(2),cmsstyle_(false),logy_(true),nbins_(100){} ~rocCurveCollection(){ if(leg_) delete leg_; @@ -37,11 +38,16 @@ class rocCurveCollection{ comment1_=l; } + void setNBins(size_t nbins){ + nbins_=nbins; + } + void addExtraLegendEntry(const TString& entr); void setCMSStyle(bool cmsst){cmsstyle_=cmsst;} void setLogY(bool logy){logy_=logy;} void setXaxis(TString axis){xaxis_=axis;} + void setYaxis(TString axis){yaxis_=axis;} // void addROC(const TString& name, const TString& probability, const TString& truth, // const TString& vetotruth, int linecolstyle, const TString& cuts="",int linestyle=1); @@ -49,9 +55,10 @@ class rocCurveCollection{ void addROC(const TString& name, const TString& probability, const TString& truth, const TString& vetotruth, const TString& linecolstyle, const TString& cuts="",const TString& invalidateif=""); + void addText(TLatex *l){additionaltext_.push_back(l);} void printRocs(TChain* c, const TString& outpdf,const TString&outfile="",TCanvas* cv=0, TFile * f=0, - std::vector* chainvec=0); + std::vector* chainvec=0,double xmin_in=-1); private: TLegend * leg_; @@ -62,7 +69,9 @@ class rocCurveCollection{ bool cmsstyle_; TString comment0_, comment1_; bool logy_; - TString xaxis_; + TString xaxis_,yaxis_; + size_t nbins_; + std::vector additionaltext_; }; diff --git a/modules/modelTools.py b/modules/modelTools.py index 50a69cf..9ed4575 100644 --- a/modules/modelTools.py +++ b/modules/modelTools.py @@ -1,9 +1,10 @@ -def printLayerInfosAndWeights(model): +def printLayerInfosAndWeights(model, noweights=False): for layer in model.layers: g=layer.get_config() h=layer.get_weights() print (g) + if noweights: continue print (h) diff --git a/modules/models/buildingBlocks.py b/modules/models/buildingBlocks.py index 131f768..3f791c0 100644 --- a/modules/models/buildingBlocks.py +++ b/modules/models/buildingBlocks.py @@ -6,6 +6,49 @@ from keras.layers.pooling import MaxPooling2D from keras.layers.normalization import BatchNormalization +def block_deepFlavourBTVConvolutions(charged,vertices,dropoutRate,active=True,batchnorm=False,batchmomentum=0.6): + ''' + deep Flavour convolution part. 
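+    The charged-candidate and secondary-vertex sequences are compressed per particle by stacks of
+    1x1 Convolution1D layers (64-32-32-8 filters), interleaved with (optional) BatchNormalization
+    and Dropout; with active=False each branch is replaced by a frozen, zero-initialised 1x1
+    convolution so the model graph keeps its shape.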
+ ''' + cpf=charged + if active: + cpf = Convolution1D(64, 1, kernel_initializer='lecun_uniform', activation='relu', name='cpf_conv0')(cpf) + if batchnorm: + cpf = BatchNormalization(momentum=batchmomentum ,name='cpf_batchnorm0')(cpf) + cpf = Dropout(dropoutRate,name='cpf_dropout0')(cpf) + cpf = Convolution1D(32, 1, kernel_initializer='lecun_uniform', activation='relu', name='cpf_conv1')(cpf) + if batchnorm: + cpf = BatchNormalization(momentum=batchmomentum,name='cpf_batchnorm1')(cpf) + cpf = Dropout(dropoutRate,name='cpf_dropout1')(cpf) + cpf = Convolution1D(32, 1, kernel_initializer='lecun_uniform', activation='relu', name='cpf_conv2')(cpf) + if batchnorm: + cpf = BatchNormalization(momentum=batchmomentum,name='cpf_batchnorm2')(cpf) + cpf = Dropout(dropoutRate,name='cpf_dropout2')(cpf) + cpf = Convolution1D(8, 1, kernel_initializer='lecun_uniform', activation='relu' , name='cpf_conv3')(cpf) + else: + cpf = Convolution1D(1,1, kernel_initializer='zeros',trainable=False)(cpf) + + vtx = vertices + if active: + vtx = Convolution1D(64, 1, kernel_initializer='lecun_uniform', activation='relu', name='vtx_conv0')(vtx) + if batchnorm: + vtx = BatchNormalization(momentum=batchmomentum,name='vtx_batchnorm0')(vtx) + vtx = Dropout(dropoutRate,name='vtx_dropout0')(vtx) + vtx = Convolution1D(32, 1, kernel_initializer='lecun_uniform', activation='relu', name='vtx_conv1')(vtx) + if batchnorm: + vtx = BatchNormalization(momentum=batchmomentum,name='vtx_batchnorm1')(vtx) + vtx = Dropout(dropoutRate,name='vtx_dropout1')(vtx) + vtx = Convolution1D(32, 1, kernel_initializer='lecun_uniform', activation='relu', name='vtx_conv2')(vtx) + if batchnorm: + vtx = BatchNormalization(momentum=batchmomentum,name='vtx_batchnorm2')(vtx) + vtx = Dropout(dropoutRate,name='vtx_dropout2')(vtx) + vtx = Convolution1D(8, 1, kernel_initializer='lecun_uniform', activation='relu', name='vtx_conv3')(vtx) + else: + vtx = Convolution1D(1,1, kernel_initializer='zeros',trainable=False)(vtx) + + return cpf,vtx + + def block_deepFlavourConvolutions(charged,neutrals,vertices,dropoutRate,active=True,batchnorm=False,batchmomentum=0.6): ''' deep Flavour convolution part. 
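The new block_deepFlavourBTVConvolutions mirrors block_deepFlavourConvolutions but drops the neutral-candidate branch; the model added in the next file wires it in front of the recurrent layers. A minimal usage sketch, not part of the committed code: the input names cpf_in/vtx_in, the 20x7 and 4x8 shapes and the 0.1 dropout rate are assumptions for illustration.

    from keras.layers import Input, LSTM
    from buildingBlocks import block_deepFlavourBTVConvolutions

    # assumed per-jet sequences: 20 charged PF candidates x 7 features, 4 secondary vertices x 8 features
    cpf_in = Input(shape=(20, 7))
    vtx_in = Input(shape=(4, 8))

    # per-particle 1x1 convolutions; the sequence axis is kept, so the outputs can feed LSTMs directly
    cpf, vtx = block_deepFlavourBTVConvolutions(charged=cpf_in,
                                                vertices=vtx_in,
                                                dropoutRate=0.1,
                                                active=True,
                                                batchnorm=True)

    cpf = LSTM(150, go_backwards=True, implementation=2)(cpf)
    vtx = LSTM(50, go_backwards=True, implementation=2)(vtx)

With active=False the block degrades to a frozen 1x1 convolution per branch, which keeps the model definition unchanged when a branch is switched off.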
diff --git a/modules/models/convolutional.py b/modules/models/convolutional.py index 3b010e7..7442d19 100644 --- a/modules/models/convolutional.py +++ b/modules/models/convolutional.py @@ -2,10 +2,9 @@ from keras.models import Model from keras.layers.normalization import BatchNormalization from keras.layers.merge import Add, Multiply -from buildingBlocks import block_deepFlavourConvolutions, block_deepFlavourDense, block_SchwartzImage +from buildingBlocks import block_deepFlavourConvolutions, block_deepFlavourDense, block_SchwartzImage, block_deepFlavourBTVConvolutions - -def model_deepFlavourReference(Inputs,nclasses,nregclasses,dropoutRate=0.1): +def model_deepFlavourNoNeutralReference(Inputs,nclasses,nregclasses,dropoutRate=0.1): """ reference 1x1 convolutional model for 'deepFlavour' with recurrent layers and batch normalisation @@ -16,35 +15,83 @@ def model_deepFlavourReference(Inputs,nclasses,nregclasses,dropoutRate=0.1): """ globalvars = BatchNormalization(momentum=0.6,name='globals_input_batchnorm') (Inputs[0]) cpf = BatchNormalization(momentum=0.6,name='cpf_input_batchnorm') (Inputs[1]) - npf = BatchNormalization(momentum=0.6,name='npf_input_batchnorm') (Inputs[2]) - vtx = BatchNormalization(momentum=0.6,name='vtx_input_batchnorm') (Inputs[3]) - ptreginput = BatchNormalization(momentum=0.6,name='reg_input_batchnorm') (Inputs[4]) + vtx = BatchNormalization(momentum=0.6,name='vtx_input_batchnorm') (Inputs[2]) + ptreginput = BatchNormalization(momentum=0.6,name='reg_input_batchnorm') (Inputs[3]) + + cpf, vtx = block_deepFlavourBTVConvolutions( + charged=cpf, + vertices=vtx, + dropoutRate=dropoutRate, + active=True, + batchnorm=True + ) + + + # + cpf = LSTM(150,go_backwards=True,implementation=2, name='cpf_lstm')(cpf) + cpf=BatchNormalization(momentum=0.6,name='cpflstm_batchnorm')(cpf) + cpf = Dropout(dropoutRate)(cpf) + + vtx = LSTM(50,go_backwards=True,implementation=2, name='vtx_lstm')(vtx) + vtx=BatchNormalization(momentum=0.6,name='vtxlstm_batchnorm')(vtx) + vtx = Dropout(dropoutRate)(vtx) + + + x = Concatenate()( [globalvars,cpf,vtx ]) + + x = block_deepFlavourDense(x,dropoutRate,active=True,batchnorm=True,batchmomentum=0.6) + + flavour_pred=Dense(nclasses, activation='softmax',kernel_initializer='lecun_uniform',name='ID_pred')(x) + + reg = Concatenate()( [flavour_pred, ptreginput ] ) + + reg_pred=Dense(nregclasses, activation='linear',kernel_initializer='ones',name='regression_pred',trainable=True)(reg) + + predictions = [flavour_pred,reg_pred] + model = Model(inputs=Inputs, outputs=predictions) + return model + + +def model_deepFlavourReference(Inputs,nclasses,nregclasses,dropoutRate=0.1,momentum=0.6): + """ + reference 1x1 convolutional model for 'deepFlavour' + with recurrent layers and batch normalisation + standard dropout rate it 0.1 + should be trained for flavour prediction first. 
afterwards, all layers can be fixed + that do not include 'regression' and the training can be repeated focusing on the regression part + (check function fixLayersContaining with invert=True) + """ + globalvars = BatchNormalization(momentum=momentum,name='globals_input_batchnorm') (Inputs[0]) + cpf = BatchNormalization(momentum=momentum,name='cpf_input_batchnorm') (Inputs[1]) + npf = BatchNormalization(momentum=momentum,name='npf_input_batchnorm') (Inputs[2]) + vtx = BatchNormalization(momentum=momentum,name='vtx_input_batchnorm') (Inputs[3]) + ptreginput = BatchNormalization(momentum=momentum,name='reg_input_batchnorm') (Inputs[4]) cpf,npf,vtx = block_deepFlavourConvolutions(charged=cpf, neutrals=npf, vertices=vtx, dropoutRate=dropoutRate, active=True, - batchnorm=True) + batchnorm=True, batchmomentum=momentum) # cpf = LSTM(150,go_backwards=True,implementation=2, name='cpf_lstm')(cpf) - cpf=BatchNormalization(momentum=0.6,name='cpflstm_batchnorm')(cpf) + cpf=BatchNormalization(momentum=momentum,name='cpflstm_batchnorm')(cpf) cpf = Dropout(dropoutRate)(cpf) npf = LSTM(50,go_backwards=True,implementation=2, name='npf_lstm')(npf) - npf=BatchNormalization(momentum=0.6,name='npflstm_batchnorm')(npf) + npf=BatchNormalization(momentum=momentum,name='npflstm_batchnorm')(npf) npf = Dropout(dropoutRate)(npf) vtx = LSTM(50,go_backwards=True,implementation=2, name='vtx_lstm')(vtx) - vtx=BatchNormalization(momentum=0.6,name='vtxlstm_batchnorm')(vtx) + vtx=BatchNormalization(momentum=momentum,name='vtxlstm_batchnorm')(vtx) vtx = Dropout(dropoutRate)(vtx) x = Concatenate()( [globalvars,cpf,npf,vtx ]) - x = block_deepFlavourDense(x,dropoutRate,active=True,batchnorm=True,batchmomentum=0.6) + x = block_deepFlavourDense(x,dropoutRate,active=True,batchnorm=True,batchmomentum=momentum) flavour_pred=Dense(nclasses, activation='softmax',kernel_initializer='lecun_uniform',name='ID_pred')(x) diff --git a/modules/models/dense.py b/modules/models/dense.py index fd55675..cb91552 100644 --- a/modules/models/dense.py +++ b/modules/models/dense.py @@ -1,12 +1,16 @@ from keras.layers import Dense, Dropout, Flatten, Convolution2D, merge, Convolution1D, Conv2D from keras.models import Model +from pdb import set_trace +from Layers import GradientReversal -def dense_model(Inputs,nclasses,Inputshape,dropoutRate=0.25): +def dense_model(Inputs,nclasses,nregressions,dropoutRate=0.25): """ Dense matrix, defaults similat to 2016 training """ + if nregressions: raise ValueError('The dense model does not support regression, only classification') + #set_trace() # Here add e.g. the normal dense stuff from DeepCSV - x = Dense(100, activation='relu',kernel_initializer='lecun_uniform',input_shape=Inputshape)(Inputs) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(Inputs[0]) x = Dropout(dropoutRate)(x) x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) x = Dropout(dropoutRate)(x) @@ -19,9 +23,61 @@ def dense_model(Inputs,nclasses,Inputshape,dropoutRate=0.25): model = Model(inputs=Inputs, outputs=predictions) return model + +def dense_model_moments(Inputs,nclasses,nregressions,dropoutRate=0.25): + """ + Dense matrix, defaults similat to 2016 training. Adaptation for moment-based domain adaptation + """ + if nregressions: raise ValueError( + 'The dense model does not support regression, only classification') + #set_trace() + # Here add e.g. 
the normal dense stuff from DeepCSV + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(Inputs[0]) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x= Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + predictions = Dense(nclasses, activation='softmax',kernel_initializer='lecun_uniform')(x) + model = Model(inputs=Inputs, outputs=[predictions, predictions]) + return model + +def dense_model_gradientReversal(Inputs,nclasses,nregressions,dropoutRate=0.25): + """ + Dense matrix, defaults similat to 2016 training. Adaptation for gradient reversal + domain adaptation + """ + if nregressions: raise ValueError( + 'The dense model does not support regression, only classification') + #set_trace() + # Here add e.g. the normal dense stuff from DeepCSV + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(Inputs[0]) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + x_mid = Dropout(dropoutRate)(x) + x = Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x_mid) + x = Dropout(dropoutRate)(x) + x= Dense(100, activation='relu',kernel_initializer='lecun_uniform')(x) + predictions = Dense(nclasses, activation='softmax',kernel_initializer='lecun_uniform')(x) + + dom_ada = GradientReversal()(x_mid) + dom_ada = Dense(100, activation='relu', kernel_initializer='lecun_uniform')(dom_ada) + dom_ada = Dropout(dropoutRate)(dom_ada) + dom_ada = Dense(100, activation='relu', kernel_initializer='lecun_uniform')(dom_ada) + dom_ada = Dense(1, activation='sigmoid', kernel_initializer='lecun_uniform')(dom_ada) + + model = Model(inputs=Inputs, outputs=[predictions, dom_ada]) + return model + + def dense_model_reg_fake(Inputs,nclasses,Inputshape,dropoutRate=0.25): """ - Somewhat of a fake to test how much the BTV variables helped, only give REC PT and genPT. BTV and reco do not get merged! You need to set BTV loss to weight 0! + Somewhat of a fake to test how much the BTV variables helped, only give REC PT and genPT. BTV and recvo do not get merged! You need to set BTV loss to weight 0! """ x = Dense(1, activation='relu',kernel_initializer='lecun_uniform')(Inputs[0]) diff --git a/modules/preprocessing.py b/modules/preprocessing.py index 68015fe..fc79da7 100644 --- a/modules/preprocessing.py +++ b/modules/preprocessing.py @@ -419,12 +419,19 @@ def createDensityLayers(Filename_in, nevents, dimension1, dimension2, - counterbranch): + counterbranch, + scales=None): import c_meanNormZeroPad + if not scales: + norms = [1 for x in range(len(inbranches))] + else: + norms=scales + if not len(scales) == len(inbranches): + raise ValueError('Scales length must match number of branches') + - norms = [1 for x in range(len(inbranches))] means = [0 for x in range(len(inbranches))] x_branch, x_center, x_bins, x_width = dimension1 diff --git a/modules/src/c_makePlots.C b/modules/src/c_makePlots.C index dc923a0..7c942fe 100644 --- a/modules/src/c_makePlots.C +++ b/modules/src/c_makePlots.C @@ -26,6 +26,10 @@ using namespace boost::python; //for some reason.... 
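+// fold the underflow and overflow bins into the first and last visible bins so their entries
+// are not silently dropped from the drawn histograms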
+static void mergeOverflow(TH1F*h){ + h->SetBinContent(h->GetNbinsX(),h->GetBinContent(h->GetNbinsX())+h->GetBinContent(h->GetNbinsX()+1)); + h->SetBinContent(1,h->GetBinContent(1)+h->GetBinContent(0)); +} void makePlots( @@ -40,9 +44,12 @@ void makePlots( bool normalized, bool makeProfile=false, bool makeWidthProfile=false, - float OverrideMin=-1e100, - float OverrideMax=1e100, - std::string sourcetreename="deepntuplizer/tree") { + float OverrideMin=1e100, + float OverrideMax=-1e100, + std::string sourcetreename="deepntuplizer/tree", + size_t nbins=0, + float xmin=0, + float xmax=0) { std::vector s_intextfiles=toSTLVector(intextfiles); @@ -137,8 +144,16 @@ void makePlots( for(size_t i=0;iDraw(s_vars.at(i)+">>"+tmpname,s_cuts.at(i),addstr); - TH1F *histo = (TH1F*) gROOT->FindObject(tmpname); + if(nbins<1){ + histo = (TH1F*) gROOT->FindObject(tmpname); + } + mergeOverflow(histo); histo->SetLineColor(colorToTColor(s_colors.at(i))); histo->SetLineStyle(lineToTLineStyle(s_colors.at(i))); histo->SetTitle(s_names.at(i)); @@ -158,7 +173,7 @@ void makePlots( float tmin=histo->GetMinimum(); if(tmax>max)max=tmax; if(tminGetMinimum(); if(tmax>max)max=tmax; if(tmin-.5e100){ + if(OverrideMinGetYaxis()->SetRangeUser(min,1.3*max); //space for legend on top allhistos.at(0)->GetXaxis()->SetTitle(xaxis.data()); - if(Xmin!=-1) allhistos.at(0)->GetXaxis()->SetRangeUser(Xmin,Xmax); + if(XminGetXaxis()->SetRangeUser(Xmin,Xmax); allhistos.at(0)->GetYaxis()->SetTitle(yaxis.data()); allhistos.at(0)->Draw("AXIS"); diff --git a/modules/src/c_makeROCs.C b/modules/src/c_makeROCs.C index 4d2b2e3..caf7270 100644 --- a/modules/src/c_makeROCs.C +++ b/modules/src/c_makeROCs.C @@ -35,7 +35,8 @@ void makeROCs( const boost::python::list extralegend, bool logy, bool individual, - std::string xaxis + std::string xaxis, + int nbins ) { @@ -110,6 +111,7 @@ void makeROCs( TString xaxisstr=xaxis; rocCurveCollection rocs; + rocs.setNBins(nbins); rocs.setXaxis(xaxisstr); rocs.setCommentLine0(firstcomment.data()); diff --git a/modules/src/c_meanNormZeroPad.C b/modules/src/c_meanNormZeroPad.C index 3e88544..833a248 100644 --- a/modules/src/c_meanNormZeroPad.C +++ b/modules/src/c_meanNormZeroPad.C @@ -513,7 +513,7 @@ void fillDensityLayers(boost::python::numeric::array numpyarray, float featval=0; if(maskedlayerbranch>=0 && (size_t)maskedlayerbranch==i_feat) - featval=layer; + featval=(float)layer/s_norms.at(i_feat); else featval=branch.getData(i_feat, elem); diff --git a/modules/src/c_randomSelect.C b/modules/src/c_randomSelect.C new file mode 100644 index 0000000..71641ca --- /dev/null +++ b/modules/src/c_randomSelect.C @@ -0,0 +1,81 @@ +/* + * c_randomSelect.C + * + * Created on: 14 Aug 2017 + * Author: jkiesele + */ + + + + +#include +#include "boost/python/extract.hpp" +#include "boost/python/numeric.hpp" +#include "boost/python/list.hpp" +#include "boost/python/str.hpp" +#include + + +#include + +#include "TRandom3.h" + +using namespace boost::python; + +class randomSelector{ +public: + randomSelector(){ + rand_=new TRandom3(); + } + ~randomSelector(){ + delete rand_; + } + void select(boost::python::numeric::array probs , boost::python::numeric::array indices, const size_t nselect); +private: + TRandom3* rand_; +} sel; + + +void randomSelector::select(boost::python::numeric::array probs, boost::python::numeric::array selects, const size_t nselect){ + + const size_t size = len(probs); + if(nselect>size){ + throw std::logic_error("randomSelector::select: can't select more than given"); + } + std::vector indices(size,0) ; + 
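+    // visit entries in a random order: fill 0..size-1, shuffle with a hardware-seeded engine,
+    // then accept entries against their per-entry probability until nselect are kept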
std::iota(std::begin(indices), std::end(indices), 0); + std::shuffle(indices.begin(), indices.end(), std::random_device());//hardware random device + + size_t nselected=0; + size_t i_it=0; + while(true){ + size_t i=indices.at(i_it); + + float prob=rand_->Uniform(0,1); + if(probs[i]=nselect)break; + i_it++; + if(i_it>=size)i_it=0; + } +} + +//indices are initialised to 0, probs describe the remove probabilities +void randSelect(boost::python::numeric::array probs, + boost::python::numeric::array indices, + int nentries){ + + sel.select(probs,indices,nentries); + +} + +// Expose classes and methods to Python +BOOST_PYTHON_MODULE(c_randomSelect) { + boost::python::numeric::array::set_module_and_type("numpy", "ndarray"); + + def("randSelect", &randSelect); +} + diff --git a/modules/src/c_readArrThreaded.C b/modules/src/c_readArrThreaded.C new file mode 100644 index 0000000..aedb9bc --- /dev/null +++ b/modules/src/c_readArrThreaded.C @@ -0,0 +1,326 @@ + + +#define BOOST_PYTHON_MAX_ARITY 20 +#include +#include "boost/python/extract.hpp" +#include "boost/python/numeric.hpp" +#include "boost/python/list.hpp" +#include "boost/python/str.hpp" +#include +#include +#include "../interface/pythonToSTL.h" +#include "../interface/helper.h" + +#include + +#include + + +#include +#include + +#include "../interface/quicklz.h" + +#define MAXCHUNK (0xffffffff - 400) + +bool debug=false; + +class readThread{ +public: + readThread(long arrpointer, + const std::string& filenamein, + long size,bool rmwhendone){ + + arrbuf=(float*)(void*)arrpointer; + infile=filenamein; + length=size; + length*=sizeof(float); + pthread=new pthread_t(); + done=0; + id=lastid; + lastid++; + if(lastid>0xFFFE) + lastid=0; + removewhendone=rmwhendone; + state_decompress = new qlz_state_decompress(); + } + + ~readThread(){ + if(pthread)delete pthread; + if(state_decompress)delete state_decompress; + } + void start(){ + int iret= pthread_create( pthread, NULL, readArrThread, (void*) this); + if(iret){ + std::cerr << "Error - pthread_create() return code: "<infile <infile.data(), "rb"); + fseek(ifile, 0, SEEK_END); + unsigned int filelength = ftell(ifile); + fseek(ifile, 0, SEEK_SET); + fread(&nchunks, 1, 1, ifile); + std::vector chunksizes(nchunks,0); + size_t vecbytesize=nchunks*sizeof(size_t); + fread(&chunksizes[0], 1, vecbytesize, ifile); + + if(debug){ + std::cout << "file has "<< (int)nchunks << " chunks"<length << std::endl; + while(chunkarrbuf[writepos/sizeof(float)]), thisthread->state_decompress); + if(debug) + std::cout << "allread: " << allread << std::endl; + writepos=totalsize; + chunk++; + delete src; + } + if(debug) + std::cout << "allread "<< allread << " totalsize "<< totalsize <thisthread->length throw + + //totalsize compare to vector + + if(allread!=thisthread->length){ + fclose(ifile); + throw std::runtime_error("readArrThread:target array size does not match "); + } + fclose(ifile); + thisthread->done=1;//atomic + + if(thisthread->removewhendone){//thisthread->infile.data() + std::string rmstring="rm -f "; + rmstring+=thisthread->infile; + system(rmstring.data()); + } + return 0; +} + +using namespace boost::python; + + +//module info and interface + +size_t maxreads=1000; +std::vector allreads(maxreads,0); +size_t acounter=0; + +bool readBlocking(long arrpointer, + std::string filenamein, + const boost::python::list shape, + bool rmwhendone){ + + long length=1; + std::vector sshape=toSTLVector(shape); + for(const auto& s:sshape) + length*=s; + + readThread * t=new 
readThread(arrpointer,filenamein,length,rmwhendone); + t->readBlocking(); + bool succ=t->isDone(); + delete t; + return succ; +} + +int startReading(long arrpointer, + std::string filenamein, + const boost::python::list shape, + bool rmwhendone){ + + long length=1; + std::vector sshape=toSTLVector(shape); + for(const auto& s:sshape) + length*=s; + + readThread * t=new readThread(arrpointer,filenamein,length,rmwhendone); + t->start(); + if(allreads.at(acounter) && !allreads.at(acounter)->isDone()) + throw std::out_of_range("c_readArrThreaded::startReading: overflow. Increase number of maximum threads (setMax)"); + allreads.at(acounter)=t; + acounter++; + if(acounter>=maxreads) + acounter=0; + return t->getId(); +} + +bool isDone(int id){ + for(auto& t:allreads){ + if(!t)continue; + if(t->getId()==id){ + if(t->isDone()){ + t->join(); + delete t; + t=0; + return true; + } + else{ + return false; + } + } + } + if(debug) + std::cerr<<"isDone: ID "<< id << " not found "<0xFFFE) + throw std::runtime_error("setMax: must be smaller than 65536"); + maxreads=m; + allreads.resize(m,0); +} + + + +void writeArray(long arrpointer, + std::string file, const boost::python::list shape){ + + long length=1; + std::vector sshape=toSTLVector(shape); + for(const auto& s:sshape) + length*=s; + + length*=sizeof(float); + + FILE *ofile; + char *src=(char*)(void*)arrpointer; + char *dst; + + qlz_state_compress *state_compress = new qlz_state_compress(); + + ofile = fopen(file.data(), "wb"); + + // allocate "uncompressed size" + 400 for the destination buffer + dst = new char [length + 400]; + +if(debug) + std::cout << "array has "<< length << " bytes" < chunksizes; + + while(remaininglength){ + + size_t uselength=0; + if(remaininglength > MAXCHUNK){ + uselength=MAXCHUNK; + remaininglength-=MAXCHUNK; + nchunks++; + if(!nchunks){ + //throw etc + //TBI (only kicks in at about 1TB) + } + + } + else{ + uselength=remaininglength; + remaininglength=0; + } + size_t thissize = qlz_compress(&src[startbyte],&dst[len2], uselength, state_compress); + chunksizes.push_back(thissize); + len2+=thissize; + startbyte+=uselength; + } + if(debug){ + std::cout << "writing "<< len2 << " compressed bytes in "<< (int)nchunks <<" chunks: " < 0 && *(src + n) == *src) + n--; + return n == 0 ? 
1 : 0; +} +#endif + +static void reset_table_compress(qlz_state_compress *state) +{ + int i; + for(i = 0; i < QLZ_HASH_VALUES; i++) + { +#if QLZ_COMPRESSION_LEVEL == 1 + state->hash[i].offset = 0; +#else + state->hash_counter[i] = 0; +#endif + } +} + +static void reset_table_decompress(qlz_state_decompress *state) +{ + int i; + (void)state; + (void)i; +#if QLZ_COMPRESSION_LEVEL == 2 + for(i = 0; i < QLZ_HASH_VALUES; i++) + { + state->hash_counter[i] = 0; + } +#endif +} + +static __inline ui32 hash_func(ui32 i) +{ +#if QLZ_COMPRESSION_LEVEL == 2 + return ((i >> 9) ^ (i >> 13) ^ i) & (QLZ_HASH_VALUES - 1); +#else + return ((i >> 12) ^ i) & (QLZ_HASH_VALUES - 1); +#endif +} + +static __inline ui32 fast_read(void const *src, ui32 bytes) +{ +#ifndef X86X64 + unsigned char *p = (unsigned char*)src; + switch (bytes) + { + case 4: + return(*p | *(p + 1) << 8 | *(p + 2) << 16 | *(p + 3) << 24); + case 3: + return(*p | *(p + 1) << 8 | *(p + 2) << 16); + case 2: + return(*p | *(p + 1) << 8); + case 1: + return(*p); + } + return 0; +#else + if (bytes >= 1 && bytes <= 4) + return *((ui32*)src); + else + return 0; +#endif +} + +static __inline ui32 hashat(const unsigned char *src) +{ + ui32 fetch, hash; + fetch = fast_read(src, 3); + hash = hash_func(fetch); + return hash; +} + +static __inline void fast_write(ui32 f, void *dst, size_t bytes) +{ +#ifndef X86X64 + unsigned char *p = (unsigned char*)dst; + + switch (bytes) + { + case 4: + *p = (unsigned char)f; + *(p + 1) = (unsigned char)(f >> 8); + *(p + 2) = (unsigned char)(f >> 16); + *(p + 3) = (unsigned char)(f >> 24); + return; + case 3: + *p = (unsigned char)f; + *(p + 1) = (unsigned char)(f >> 8); + *(p + 2) = (unsigned char)(f >> 16); + return; + case 2: + *p = (unsigned char)f; + *(p + 1) = (unsigned char)(f >> 8); + return; + case 1: + *p = (unsigned char)f; + return; + } +#else + switch (bytes) + { + case 4: + *((ui32*)dst) = f; + return; + case 3: + *((ui32*)dst) = f; + return; + case 2: + *((ui16 *)dst) = (ui16)f; + return; + case 1: + *((unsigned char*)dst) = (unsigned char)f; + return; + } +#endif +} + + +size_t qlz_size_decompressed(const char *source) +{ + ui32 n, r; + n = (((*source) & 2) == 2) ? 4 : 1; + r = fast_read(source + 1 + n, n); + r = r & (0xffffffff >> ((4 - n)*8)); + return r; +} + +size_t qlz_size_compressed(const char *source) +{ + ui32 n, r; + n = (((*source) & 2) == 2) ? 4 : 1; + r = fast_read(source + 1, n); + r = r & (0xffffffff >> ((4 - n)*8)); + return r; +} + +size_t qlz_size_header(const char *source) +{ + size_t n = 2*((((*source) & 2) == 2) ? 4 : 1) + 1; + return n; +} + + +static __inline void memcpy_up(unsigned char *dst, const unsigned char *src, ui32 n) +{ + // Caution if modifying memcpy_up! Overlap of dst and src must be special handled. 
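+	// (match back-references may overlap the bytes currently being produced, so the copy has to run
+	// strictly front to back; a plain memcpy/memmove would not reproduce the repeated pattern)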
+#ifndef X86X64 + unsigned char *end = dst + n; + while(dst < end) + { + *dst = *src; + dst++; + src++; + } +#else + ui32 f = 0; + do + { + *(ui32 *)(dst + f) = *(ui32 *)(src + f); + f += MINOFFSET + 1; + } + while (f < n); +#endif +} + +static __inline void update_hash(qlz_state_decompress *state, const unsigned char *s) +{ +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 hash; + hash = hashat(s); + state->hash[hash].offset = s; + state->hash_counter[hash] = 1; +#elif QLZ_COMPRESSION_LEVEL == 2 + ui32 hash; + unsigned char c; + hash = hashat(s); + c = state->hash_counter[hash]; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = s; + c++; + state->hash_counter[hash] = c; +#endif + (void)state; + (void)s; +} + +#if QLZ_COMPRESSION_LEVEL <= 2 +static void update_hash_upto(qlz_state_decompress *state, unsigned char **lh, const unsigned char *max) +{ + while(*lh < max) + { + (*lh)++; + update_hash(state, *lh); + } +} +#endif + +static size_t qlz_compress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_compress *state) +{ + const unsigned char *last_byte = source + size - 1; + const unsigned char *src = source; + unsigned char *cword_ptr = destination; + unsigned char *dst = destination + CWORD_LEN; + ui32 cword_val = 1U << 31; + const unsigned char *last_matchstart = last_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END; + ui32 fetch = 0; + unsigned int lits = 0; + + (void) lits; + + if(src <= last_matchstart) + fetch = fast_read(src, 3); + + while(src <= last_matchstart) + { + if ((cword_val & 1) == 1) + { + // store uncompressed if compression ratio is too low + if (src > source + (size >> 1) && dst - destination > src - source - ((src - source) >> 5)) + return 0; + + fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN); + + cword_ptr = dst; + dst += CWORD_LEN; + cword_val = 1U << 31; + fetch = fast_read(src, 3); + } +#if QLZ_COMPRESSION_LEVEL == 1 + { + const unsigned char *o; + ui32 hash, cached; + + hash = hash_func(fetch); + cached = fetch ^ state->hash[hash].cache; + state->hash[hash].cache = fetch; + + o = state->hash[hash].offset + OFFSET_BASE; + state->hash[hash].offset = CAST(src - OFFSET_BASE); + +#ifdef X86X64 + if ((cached & 0xffffff) == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6)))) + { + if(cached != 0) + { +#else + if (cached == 0 && o != OFFSET_BASE && (src - o > MINOFFSET || (src == o + 1 && lits >= 3 && src > source + 3 && same(src - 3, 6)))) + { + if (*(o + 3) != *(src + 3)) + { +#endif + hash <<= 4; + cword_val = (cword_val >> 1) | (1U << 31); + fast_write((3 - 2) | hash, dst, 2); + src += 3; + dst += 2; + } + else + { + const unsigned char *old_src = src; + size_t matchlen; + hash <<= 4; + + cword_val = (cword_val >> 1) | (1U << 31); + src += 4; + + if(*(o + (src - old_src)) == *src) + { + src++; + if(*(o + (src - old_src)) == *src) + { + size_t q = last_byte - UNCOMPRESSED_END - (src - 5) + 1; + size_t remaining = q > 255 ? 
255 : q; + src++; + while(*(o + (src - old_src)) == *src && (size_t)(src - old_src) < remaining) + src++; + } + } + + matchlen = src - old_src; + if (matchlen < 18) + { + fast_write((ui32)(matchlen - 2) | hash, dst, 2); + dst += 2; + } + else + { + fast_write((ui32)(matchlen << 16) | hash, dst, 3); + dst += 3; + } + } + fetch = fast_read(src, 3); + lits = 0; + } + else + { + lits++; + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); +#ifdef X86X64 + fetch = fast_read(src, 3); +#else + fetch = (fetch >> 8 & 0xffff) | (*(src + 2) << 16); +#endif + } + } +#elif QLZ_COMPRESSION_LEVEL >= 2 + { + const unsigned char *o, *offset2; + ui32 hash, matchlen, k, m, best_k = 0; + unsigned char c; + size_t remaining = (last_byte - UNCOMPRESSED_END - src + 1) > 255 ? 255 : (last_byte - UNCOMPRESSED_END - src + 1); + (void)best_k; + + + //hash = hashat(src); + fetch = fast_read(src, 3); + hash = hash_func(fetch); + + c = state->hash_counter[hash]; + + offset2 = state->hash[hash].offset[0]; + if(offset2 < src - MINOFFSET && c > 0 && ((fast_read(offset2, 3) ^ fetch) & 0xffffff) == 0) + { + matchlen = 3; + if(*(offset2 + matchlen) == *(src + matchlen)) + { + matchlen = 4; + while(*(offset2 + matchlen) == *(src + matchlen) && matchlen < remaining) + matchlen++; + } + } + else + matchlen = 0; + for(k = 1; k < QLZ_POINTERS && c > k; k++) + { + o = state->hash[hash].offset[k]; +#if QLZ_COMPRESSION_LEVEL == 3 + if(((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET) +#elif QLZ_COMPRESSION_LEVEL == 2 + if(*(src + matchlen) == *(o + matchlen) && ((fast_read(o, 3) ^ fetch) & 0xffffff) == 0 && o < src - MINOFFSET) +#endif + { + m = 3; + while(*(o + m) == *(src + m) && m < remaining) + m++; +#if QLZ_COMPRESSION_LEVEL == 3 + if ((m > matchlen) || (m == matchlen && o > offset2)) +#elif QLZ_COMPRESSION_LEVEL == 2 + if (m > matchlen) +#endif + { + offset2 = o; + matchlen = m; + best_k = k; + } + } + } + o = offset2; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src; + c++; + state->hash_counter[hash] = c; + +#if QLZ_COMPRESSION_LEVEL == 3 + if(matchlen > 2 && src - o < 131071) + { + ui32 u; + size_t offset = src - o; + + for(u = 1; u < matchlen; u++) + { + hash = hashat(src + u); + c = state->hash_counter[hash]++; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src + u; + } + + cword_val = (cword_val >> 1) | (1U << 31); + src += matchlen; + + if(matchlen == 3 && offset <= 63) + { + *dst = (unsigned char)(offset << 2); + dst++; + } + else if (matchlen == 3 && offset <= 16383) + { + ui32 f = (ui32)((offset << 2) | 1); + fast_write(f, dst, 2); + dst += 2; + } + else if (matchlen <= 18 && offset <= 1023) + { + ui32 f = ((matchlen - 3) << 2) | ((ui32)offset << 6) | 2; + fast_write(f, dst, 2); + dst += 2; + } + + else if(matchlen <= 33) + { + ui32 f = ((matchlen - 2) << 2) | ((ui32)offset << 7) | 3; + fast_write(f, dst, 3); + dst += 3; + } + else + { + ui32 f = ((matchlen - 3) << 7) | ((ui32)offset << 15) | 3; + fast_write(f, dst, 4); + dst += 4; + } + } + else + { + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); + } +#elif QLZ_COMPRESSION_LEVEL == 2 + + if(matchlen > 2) + { + cword_val = (cword_val >> 1) | (1U << 31); + src += matchlen; + + if (matchlen < 10) + { + ui32 f = best_k | ((matchlen - 2) << 2) | (hash << 5); + fast_write(f, dst, 2); + dst += 2; + } + else + { + ui32 f = best_k | (matchlen << 16) | (hash << 5); + fast_write(f, dst, 3); + dst += 3; + } + } + else + { + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); + } +#endif + } +#endif + } + 
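The compression loop above interleaves literal bytes and match tokens under a 32-bit control word, which is the part of the format that is easiest to misread. Below is a small stand-alone sketch (illustration only, not part of the patch) that reproduces just that bookkeeping on a toy literal/match sequence; the tail flush mirrors the code that follows in qlz_compress_core.

// --- editor's sketch (C++), not part of the patch ---
// Control-word protocol of qlz_compress_core: the word is seeded with a sentinel in
// bit 31; every emitted item shifts it right (matches also set bit 31); when the
// sentinel reaches bit 0 the word is flushed and re-seeded, and the tail is flushed
// the same way after the loop.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const std::vector<bool> items = {false, false, true, false, true};  // false = literal, true = match
    uint32_t cword_val = 1u << 31;

    for (bool is_match : items)
    {
        if ((cword_val & 1) == 1)   // sentinel reached bit 0: flush and re-seed
        {
            std::printf("flush 0x%08x\n", (unsigned)((cword_val >> 1) | (1u << 31)));
            cword_val = 1u << 31;
        }
        cword_val >>= 1;
        if (is_match)
            cword_val |= 1u << 31;
    }

    while ((cword_val & 1) != 1)    // tail: rotate the sentinel down to bit 0, then flush
        cword_val >>= 1;
    std::printf("flush 0x%08x\n", (unsigned)((cword_val >> 1) | (1u << 31)));
    return 0;
}
// --- end sketch ---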
while (src <= last_byte) + { + if ((cword_val & 1) == 1) + { + fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN); + cword_ptr = dst; + dst += CWORD_LEN; + cword_val = 1U << 31; + } +#if QLZ_COMPRESSION_LEVEL < 3 + if (src <= last_byte - 3) + { +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 hash, fetch; + fetch = fast_read(src, 3); + hash = hash_func(fetch); + state->hash[hash].offset = CAST(src - OFFSET_BASE); + state->hash[hash].cache = fetch; +#elif QLZ_COMPRESSION_LEVEL == 2 + ui32 hash; + unsigned char c; + hash = hashat(src); + c = state->hash_counter[hash]; + state->hash[hash].offset[c & (QLZ_POINTERS - 1)] = src; + c++; + state->hash_counter[hash] = c; +#endif + } +#endif + *dst = *src; + src++; + dst++; + cword_val = (cword_val >> 1); + } + + while((cword_val & 1) != 1) + cword_val = (cword_val >> 1); + + fast_write((cword_val >> 1) | (1U << 31), cword_ptr, CWORD_LEN); + + // min. size must be 9 bytes so that the qlz_size functions can take 9 bytes as argument + return dst - destination < 9 ? 9 : dst - destination; +} + +static size_t qlz_decompress_core(const unsigned char *source, unsigned char *destination, size_t size, qlz_state_decompress *state, const unsigned char *history) +{ + const unsigned char *src = source + qlz_size_header((const char *)source); + unsigned char *dst = destination; + const unsigned char *last_destination_byte = destination + size - 1; + ui32 cword_val = 1; + const unsigned char *last_matchstart = last_destination_byte - UNCONDITIONAL_MATCHLEN - UNCOMPRESSED_END; + unsigned char *last_hashed = destination - 1; + const unsigned char *last_source_byte = source + qlz_size_compressed((const char *)source) - 1; + static const ui32 bitlut[16] = {4, 0, 1, 0, 2, 0, 1, 0, 3, 0, 1, 0, 2, 0, 1, 0}; + + (void) last_source_byte; + (void) last_hashed; + (void) state; + (void) history; + + for(;;) + { + ui32 fetch; + + if (cword_val == 1) + { +#ifdef QLZ_MEMORY_SAFE + if(src + CWORD_LEN - 1 > last_source_byte) + return 0; +#endif + cword_val = fast_read(src, CWORD_LEN); + src += CWORD_LEN; + } + +#ifdef QLZ_MEMORY_SAFE + if(src + 4 - 1 > last_source_byte) + return 0; +#endif + + fetch = fast_read(src, 4); + + if ((cword_val & 1) == 1) + { + ui32 matchlen; + const unsigned char *offset2; + +#if QLZ_COMPRESSION_LEVEL == 1 + ui32 hash; + cword_val = cword_val >> 1; + hash = (fetch >> 4) & 0xfff; + offset2 = (const unsigned char *)(size_t)state->hash[hash].offset; + + if((fetch & 0xf) != 0) + { + matchlen = (fetch & 0xf) + 2; + src += 2; + } + else + { + matchlen = *(src + 2); + src += 3; + } + +#elif QLZ_COMPRESSION_LEVEL == 2 + ui32 hash; + unsigned char c; + cword_val = cword_val >> 1; + hash = (fetch >> 5) & 0x7ff; + c = (unsigned char)(fetch & 0x3); + offset2 = state->hash[hash].offset[c]; + + if((fetch & (28)) != 0) + { + matchlen = ((fetch >> 2) & 0x7) + 2; + src += 2; + } + else + { + matchlen = *(src + 2); + src += 3; + } + +#elif QLZ_COMPRESSION_LEVEL == 3 + ui32 offset; + cword_val = cword_val >> 1; + if ((fetch & 3) == 0) + { + offset = (fetch & 0xff) >> 2; + matchlen = 3; + src++; + } + else if ((fetch & 2) == 0) + { + offset = (fetch & 0xffff) >> 2; + matchlen = 3; + src += 2; + } + else if ((fetch & 1) == 0) + { + offset = (fetch & 0xffff) >> 6; + matchlen = ((fetch >> 2) & 15) + 3; + src += 2; + } + else if ((fetch & 127) != 3) + { + offset = (fetch >> 7) & 0x1ffff; + matchlen = ((fetch >> 2) & 0x1f) + 2; + src += 3; + } + else + { + offset = (fetch >> 15); + matchlen = ((fetch >> 7) & 255) + 3; + src += 4; + } + + offset2 = dst - offset; +#endif + 
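The level-1 match token is written in qlz_compress_core further up and read back in the branch above, so its bit layout is easy to lose track of. The sketch below (not part of the patch) packs and unpacks one short token with arbitrary example values; matches of 18 bytes or more instead use the 3-byte form with the length stored in the extra byte.

// --- editor's sketch (C++), not part of the patch ---
// Round trip of the 2-byte QLZ_COMPRESSION_LEVEL 1 match token.
// Writer (qlz_compress_core): hash <<= 4; emit (matchlen - 2) | hash.
// Reader (qlz_decompress_core): hash = (fetch >> 4) & 0xfff; matchlen = (fetch & 0xf) + 2.
#include <cassert>
#include <cstdio>

int main()
{
    const unsigned hash = 0x5a3;    // 12-bit hash bucket, 0..4095 (arbitrary example)
    const unsigned matchlen = 7;    // 3..17 uses the short 2-byte form

    const unsigned token = (matchlen - 2) | (hash << 4);   // writer side

    const unsigned dec_hash = (token >> 4) & 0xfff;        // reader side
    const unsigned dec_len  = (token & 0xf) + 2;

    assert(dec_hash == hash && dec_len == matchlen);
    std::printf("token=0x%04x hash=0x%03x matchlen=%u\n", token, dec_hash, dec_len);
    return 0;
}
// --- end sketch ---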
+#ifdef QLZ_MEMORY_SAFE + if(offset2 < history || offset2 > dst - MINOFFSET - 1) + return 0; + + if(matchlen > (ui32)(last_destination_byte - dst - UNCOMPRESSED_END + 1)) + return 0; +#endif + + memcpy_up(dst, offset2, matchlen); + dst += matchlen; + +#if QLZ_COMPRESSION_LEVEL <= 2 + update_hash_upto(state, &last_hashed, dst - matchlen); + last_hashed = dst - 1; +#endif + } + else + { + if (dst < last_matchstart) + { + unsigned int n = bitlut[cword_val & 0xf]; +#ifdef X86X64 + *(ui32 *)dst = *(ui32 *)src; +#else + memcpy_up(dst, src, 4); +#endif + cword_val = cword_val >> n; + dst += n; + src += n; +#if QLZ_COMPRESSION_LEVEL <= 2 + update_hash_upto(state, &last_hashed, dst - 3); +#endif + } + else + { + while(dst <= last_destination_byte) + { + if (cword_val == 1) + { + src += CWORD_LEN; + cword_val = 1U << 31; + } +#ifdef QLZ_MEMORY_SAFE + if(src >= last_source_byte + 1) + return 0; +#endif + *dst = *src; + dst++; + src++; + cword_val = cword_val >> 1; + } + +#if QLZ_COMPRESSION_LEVEL <= 2 + update_hash_upto(state, &last_hashed, last_destination_byte - 3); // todo, use constant +#endif + return size; + } + + } + } +} + +size_t qlz_compress(const void *source, char *destination, size_t size, qlz_state_compress *state) +{ + size_t r; + ui32 compressed; + size_t base; + + if(size == 0 || size > 0xffffffff - 400) + return 0; + + if(size < 216) + base = 3; + else + base = 9; + +#if QLZ_STREAMING_BUFFER > 0 + if (state->stream_counter + size - 1 >= QLZ_STREAMING_BUFFER) +#endif + { + reset_table_compress(state); + r = base + qlz_compress_core((const unsigned char *)source, (unsigned char*)destination + base, size, state); +#if QLZ_STREAMING_BUFFER > 0 + reset_table_compress(state); +#endif + if(r == base) + { + memcpy(destination + base, source, size); + r = size + base; + compressed = 0; + } + else + { + compressed = 1; + } + state->stream_counter = 0; + } +#if QLZ_STREAMING_BUFFER > 0 + else + { + unsigned char *src = state->stream_buffer + state->stream_counter; + + memcpy(src, source, size); + r = base + qlz_compress_core(src, (unsigned char*)destination + base, size, state); + + if(r == base) + { + memcpy(destination + base, src, size); + r = size + base; + compressed = 0; + reset_table_compress(state); + } + else + { + compressed = 1; + } + state->stream_counter += size; + } +#endif + if(base == 3) + { + *destination = (unsigned char)(0 | compressed); + *(destination + 1) = (unsigned char)r; + *(destination + 2) = (unsigned char)size; + } + else + { + *destination = (unsigned char)(2 | compressed); + fast_write((ui32)r, destination + 1, 4); + fast_write((ui32)size, destination + 5, 4); + } + + *destination |= (QLZ_COMPRESSION_LEVEL << 2); + *destination |= (1 << 6); + *destination |= ((QLZ_STREAMING_BUFFER == 0 ? 0 : (QLZ_STREAMING_BUFFER == 100000 ? 1 : (QLZ_STREAMING_BUFFER == 1000000 ? 
2 : 3))) << 4); + +// 76543210 +// 01SSLLHC + + return r; +} + +size_t qlz_decompress(const char *source, void *destination, qlz_state_decompress *state) +{ + size_t dsiz = qlz_size_decompressed(source); + +#if QLZ_STREAMING_BUFFER > 0 + if (state->stream_counter + qlz_size_decompressed(source) - 1 >= QLZ_STREAMING_BUFFER) +#endif + { + if((*source & 1) == 1) + { + reset_table_decompress(state); + dsiz = qlz_decompress_core((const unsigned char *)source, (unsigned char *)destination, dsiz, state, (const unsigned char *)destination); + } + else + { + memcpy(destination, source + qlz_size_header(source), dsiz); + } + state->stream_counter = 0; + reset_table_decompress(state); + } +#if QLZ_STREAMING_BUFFER > 0 + else + { + unsigned char *dst = state->stream_buffer + state->stream_counter; + if((*source & 1) == 1) + { + dsiz = qlz_decompress_core((const unsigned char *)source, dst, dsiz, state, (const unsigned char *)state->stream_buffer); + } + else + { + memcpy(dst, source + qlz_size_header(source), dsiz); + reset_table_decompress(state); + } + memcpy(destination, dst, dsiz); + state->stream_counter += dsiz; + } +#endif + return dsiz; +} diff --git a/modules/src/rocCurve.cpp b/modules/src/rocCurve.cpp index 9667271..34aaf77 100644 --- a/modules/src/rocCurve.cpp +++ b/modules/src/rocCurve.cpp @@ -28,6 +28,22 @@ size_t rocCurve::nrocsCounter=0; +static std::vector<double> loglist(double first, double last, double size){ + if(first>last) std::swap(first,last); + double logfirst = log(first)/log(10); + double loglast = log(last)/log(10); + double step = (loglast-logfirst)/(size-1); + std::vector<double> out; + for(double x=logfirst; x<=loglast; x+=step) + { + double a = pow(10,x); + out.push_back(a); + } + return out; + +} + + rocCurve::rocCurve():nbins_(100),linecol_(kBlack),linewidth_(1),linestyle_(1),fullanalysis_(true){ nrocsCounter++; } @@ -160,14 +176,14 @@ void rocCurve::process(TChain *c,std::ostream& out){ roc_.Draw("L");//necessary for some weird root reason - - float misidset[]={0.001,0.01,0.1,0.2,1.}; + out << "eff @ misid @ discr value\n\n"; + std::vector<double> misidset=loglist(0.001,1,100); int count=0; double integral=0; - for(float eff=0;eff<1;eff+=0.0001){ + for(float eff=0;eff<1;eff+=0.00001){ float misid=roc_.Eval(eff); - integral+=misid*0.0001; + integral+=misid*0.00001; if(misid>misidset[count]){ out << eff <<"@"<< misid; count++; diff --git a/modules/src/rocCurveCollection.cpp b/modules/src/rocCurveCollection.cpp index 237a500..678e98e 100644 --- a/modules/src/rocCurveCollection.cpp +++ b/modules/src/rocCurveCollection.cpp @@ -44,7 +44,7 @@ void rocCurveCollection::addROC(const TString& name, const TString& probability, void rocCurveCollection::printRocs(TChain* c, const TString& outpdf, - const TString&outfile, TCanvas* cv, TFile * f, std::vector* chainvec){ + const TString&outfile, TCanvas* cv, TFile * f, std::vector* chainvec,double xmin_in){ gROOT->SetBatch(); @@ -60,7 +60,7 @@ void rocCurveCollection::printRocs(TChain* c, const TString& outpdf, for(size_t i=0;iSetGrid(); cv->SetTicks(1, 1); cv->SetBottomMargin(0.12); - cv->SetTopMargin(0.06); + cv->SetTopMargin(0.069); cv->SetLeftMargin(0.15); cv->SetRightMargin(0.03); + cv->Draw(); + cv->cd(); TH1D haxis=TH1D("AXIS","AXIS",10,0,1); //haxis.Draw("AXIS"); haxis.GetYaxis()->SetRangeUser(8e-4,1); //haxis.GetYaxis()->SetNdivisions(510); - haxis.GetYaxis()->SetTitle("misid. 
probability"); + haxis.GetYaxis()->SetTitleSize(0.05); //haxis.GetYaxis()->SetTitleOffset(0.9); haxis.GetYaxis()->SetLabelSize(0.045); @@ -109,6 +111,10 @@ void rocCurveCollection::printRocs(TChain* c, const TString& outpdf, haxis.GetXaxis()->SetTitle(xaxis_); else haxis.GetXaxis()->SetTitle("efficiency"); + if(yaxis_.Length()) + haxis.GetYaxis()->SetTitle(yaxis_); + else + haxis.GetYaxis()->SetTitle("misid. probability"); haxis.GetXaxis()->SetTitleSize(0.05); haxis.GetXaxis()->SetLabelSize(0.045); //haxis.GetXaxis()->SetTitleOffset(0.95); @@ -172,7 +178,10 @@ void rocCurveCollection::printRocs(TChain* c, const TString& outpdf, xmin*=10; xmin=(int)xmin; xmin/=10; - haxis.GetXaxis()->SetRangeUser(xmin,1); + if(xmin_in>0) + haxis.GetXaxis()->SetRangeUser(xmin_in,1); + else + haxis.GetXaxis()->SetRangeUser(xmin,1); if(cmsstyle_){ @@ -226,6 +235,8 @@ void rocCurveCollection::printRocs(TChain* c, const TString& outpdf, ////// + for(auto& t:additionaltext_) + t->Draw(); //comment lines TLatex *tex = new TLatex(0.18,0.805,comment0_); tex->SetNDC(true); diff --git a/modules/testing.py b/modules/testing.py index a3ff7b4..98ac6f0 100644 --- a/modules/testing.py +++ b/modules/testing.py @@ -219,7 +219,8 @@ def makeROCs_async(intextfile, name_list, probabilities_list, truths_list, vetos extralegend=None, logY=True, individual=False, - xaxis=""):#['solid?udsg','hatched?c']): + xaxis="", + nbins=200):#['solid?udsg','hatched?c']): import copy @@ -267,7 +268,7 @@ def worker(): vetos_list, colors_list, outpdffile,allcuts,cmsstyle, firstcomment,secondcomment,invalidlist,extralegcopy,logY, - individual,xaxis) + individual,xaxis,nbins) except Exception as e: print('error for these inputs:') @@ -291,7 +292,9 @@ def makePlots_async(intextfile, name_list, variables, cuts, colours, outpdffile, xaxis='',yaxis='', normalized=False,profiles=False, minimum=-1e100,maximum=1e100,widthprofile=False, - treename="deepntuplizer/tree"): + treename="deepntuplizer/tree", + nbins=0,xmin=0,xmax=0, + ): files_list=makeASequence(intextfile,len(name_list)) @@ -311,7 +314,8 @@ def worker(): else: c_makePlots.makePlots(files_list,name_list, variables_list,cuts_list,colours_list, - outpdffile,xaxis,yaxis,normalized,profiles,widthprofile,minimum,maximum,treename) + outpdffile,xaxis,yaxis,normalized,profiles,widthprofile,minimum,maximum, + treename,nbins,xmin,xmax) # return worker() import multiprocessing @@ -321,8 +325,8 @@ def worker(): def makeEffPlots_async(intextfile, name_list, variables, cutsnum,cutsden, colours, outpdffile, xaxis='',yaxis='', - minimum=-1e100,maximum=1e100, - rebinfactor=1, SetLogY = False, Xmin = -1., Xmax = 1000. , + minimum=1e100,maximum=-1e100, + rebinfactor=1, SetLogY = False, Xmin = 100, Xmax = -100. 
, treename="deepntuplizer/tree"): @@ -389,6 +393,44 @@ def association(fname): +def plotLoss(infilename,outfilename,range): + + import matplotlib + + matplotlib.use('Agg') + + infile=open(infilename,'r') + trainloss=[] + valloss=[] + epochs=[] + i=0 + automax=0 + automin=100 + for line in infile: + if len(line)<1: continue + tl=float(line.split(' ')[0]) + vl=float(line.split(' ')[1]) + trainloss.append(tl) + valloss.append(vl) + epochs.append(i) + i=i+1 + if i==5: + automax=max(tl,vl) + automin=min(automin,vl,tl) + + + import matplotlib.pyplot as plt + f = plt.figure() + plt.plot(epochs,trainloss,'r',label='train') + plt.plot(epochs,valloss,'b',label='val') + plt.ylabel('loss') + plt.xlabel('epoch') + plt.legend() + if len(range)==2: + plt.ylim(range) + elif automax>0: + plt.ylim([automin*0.9,automax]) + f.savefig(outfilename) ######### old part - keep for reference, might be useful some day diff --git a/modules/training_base.py b/modules/training_base.py index 6e5aaa8..50cf5ac 100644 --- a/modules/training_base.py +++ b/modules/training_base.py @@ -11,26 +11,45 @@ import os from argparse import ArgumentParser import shutil +from DataCollection import DataCollection +from pdb import set_trace # argument parsing and bookkeeping from Losses import * class training_base(object): - def __init__(self, - splittrainandtest=0.8, - useweights=False, - testrun=False): + def __init__( + self, splittrainandtest=0.85, + useweights=False, testrun=False, + resumeSilently=False, collection_class=DataCollection): + + + + parser = ArgumentParser('Run the training') + parser.add_argument('inputDataCollection') + parser.add_argument('outputDir') + parser.add_argument("--gpu", help="select specific GPU", type=int, metavar="OPT", default=-1) + + args = parser.parse_args() + import os + import matplotlib #if no X11 use below matplotlib.use('Agg') - import imp - try: - imp.find_module('setGPU') - import setGPU - except ImportError: - found = False + if args.gpu<0: + import imp + try: + imp.find_module('setGPU') + import setGPU + except ImportError: + found = False + else: + os.environ['CUDA_DEVICE_ORDER'] = 'PCI_BUS_ID' + os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpu) + print('running on GPU '+str(args.gpu)) + import keras @@ -41,31 +60,30 @@ def __init__(self, self.train_data=None self.val_data=None self.startlearningrate=None + self.optimizer=None self.trainedepoches=0 self.compiled=False self.checkpointcounter=0 - parser = ArgumentParser('Run the training') - parser.add_argument('inputDataCollection') - parser.add_argument('outputDir') - args = parser.parse_args() - self.inputData = os.path.abspath(args.inputDataCollection) + self.inputData = os.path.abspath(args.inputDataCollection) \ + if ',' not in args.inputDataCollection else \ + [os.path.abspath(i) for i in args.inputDataCollection.split(',')] self.outputDir=args.outputDir # create output dir isNewTraining=True if os.path.isdir(self.outputDir): - var = raw_input('output dir exists. To recover a training, please type "yes"\n') - if not var == 'yes': - raise Exception('output directory must not exists yet') + if not resumeSilently: + var = raw_input('output dir exists. 
To recover a training, please type "yes"\n') + if not var == 'yes': + raise Exception('output directory must not exists yet') isNewTraining=False else: os.mkdir(self.outputDir) self.outputDir = os.path.abspath(self.outputDir) self.outputDir+='/' - from DataCollection import DataCollection #copy configuration to output dir if isNewTraining: djsource= os.environ['DEEPJET'] @@ -74,12 +92,12 @@ def __init__(self, - self.train_data=DataCollection() + self.train_data = collection_class() self.train_data.readFromFile(self.inputData) self.train_data.useweights=useweights if testrun: - self.train_data.split(0.02) + self.train_data.split(0.002) self.val_data=self.train_data.split(splittrainandtest) @@ -88,6 +106,7 @@ def __init__(self, shapes=self.train_data.getInputShapes() + self.train_data.maxFilesOpen=-1 self.keras_inputs=[] self.keras_inputsshapes=[] @@ -99,9 +118,17 @@ def __init__(self, self.keras_inputsshapes.append(s) if not isNewTraining: - self.loadModel(self.outputDir+'KERAS_check_last_model.h5') + if not os.path.isfile(self.outputDir+'/KERAS_check_model_last.h5'): + print('you cannot resume a training that did not train for at least one epoch.\nplease start a new training.') + exit() + self.loadModel(self.outputDir+'/KERAS_check_model_last.h5') self.trainedepoches=sum(1 for line in open(self.outputDir+'losses.log')) + + def __del__(self): + del self.train_data + del self.val_data + def modelSet(self): return not self.keras_model==None @@ -125,6 +152,7 @@ def loadModel(self,filename): #del f['optimizer_weights'] from keras.models import load_model self.keras_model=load_model(filename, custom_objects=global_loss_list) + self.optimizer=self.keras_model.optimizer self.compiled=True def compileModel(self, @@ -136,8 +164,8 @@ def compileModel(self, # return from keras.optimizers import Adam self.startlearningrate=learningrate - adam = Adam(lr=self.startlearningrate) - self.keras_model.compile(optimizer=adam,**compileargs) + self.optimizer = Adam(lr=self.startlearningrate) + self.keras_model.compile(optimizer=self.optimizer,**compileargs) self.compiled=True def saveModel(self,outfile): @@ -148,7 +176,7 @@ def saveModel(self,outfile): saver = tf.train.Saver() tfoutpath=self.outputDir+outfile+'_tfsession/tf' import os - os.system('rm -f '+tfoutpath) + os.system('rm -rf '+tfoutpath) os.system('mkdir -p '+tfoutpath) saver.save(tfsession, tfoutpath) @@ -168,31 +196,49 @@ def trainModel(self, lr_cooldown=6, lr_minimum=0.000001, maxqsize=20, + checkperiod=10, **trainargs): #make sure tokens don't expire from tokenTools import checkTokens, renew_token_process from thread import start_new_thread + print('starting afs backgrounder') checkTokens() start_new_thread(renew_token_process,()) self.train_data.setBatchSize(batchsize) self.val_data.setBatchSize(batchsize) - self.keras_model.save(self.outputDir+'KERAS_check_last_model.h5') - + averagesamplesperfile=self.train_data.getAvEntriesPerFile() + samplespreread=maxqsize*batchsize + nfilespre=max(int(samplespreread/averagesamplesperfile),2) + nfilespre+=1 + #if nfilespre>15:nfilespre=15 + print('best pre read: '+str(nfilespre)+' a: '+str(averagesamplesperfile)) + print('total sample size: '+str(self.train_data.nsamples)) + #exit() + + if self.train_data.maxFilesOpen<0: + self.train_data.maxFilesOpen=nfilespre + self.val_data.maxFilesOpen=min(int(nfilespre/2),1) + + #self.keras_model.save(self.outputDir+'KERAS_check_last_model.h5') + print('setting up callbacks') from DeepJet_callbacks import DeepJet_callbacks - 
callbacks=DeepJet_callbacks(stop_patience=stop_patience, + + callbacks=DeepJet_callbacks(self.keras_model, + stop_patience=stop_patience, lr_factor=lr_factor, lr_patience=lr_patience, lr_epsilon=lr_epsilon, lr_cooldown=lr_cooldown, lr_minimum=lr_minimum, - outputDir=self.outputDir) + outputDir=self.outputDir, + checkperiod=checkperiod) nepochs=nepochs-self.trainedepoches - + print('starting training') self.keras_model.fit_generator(self.train_data.generator() , steps_per_epoch=self.train_data.getNBatchesPerEpoch(), epochs=nepochs, @@ -219,4 +265,4 @@ def trainModel(self, - \ No newline at end of file + diff --git a/scripts/loadTreeAssociation.C b/scripts/loadTreeAssociation.C new file mode 100644 index 0000000..7d1834c --- /dev/null +++ b/scripts/loadTreeAssociation.C @@ -0,0 +1,19 @@ + + +//root hack-'macro' to load a full tree association for interactive plotting + +//int main(){ + +#include "TROOT.h" +#include "../src/friendTreeInjector.cpp" +#include "TSystem.h" +friendTreeInjector * injector=0; +TChain* tree=0; + +void loadTreeAssociation(){ + TString __infile=getenv("ROOT_TREEASSOCIATIONINFILE"); + injector = new friendTreeInjector(); + injector->addFromFile(__infile); + injector->createChain(); + tree= injector->getChain(); +} diff --git a/scripts/plotLoss.py b/scripts/plotLoss.py index 75cd6fe..b1aaf12 100755 --- a/scripts/plotLoss.py +++ b/scripts/plotLoss.py @@ -1,8 +1,6 @@ #!/usr/bin/env python - -import matplotlib - -#matplotlib.use('Agg') + +from testing import plotLoss from argparse import ArgumentParser @@ -15,26 +13,6 @@ infilename=args.inputDir+'/'+args.file -infile=open(infilename,'r') - -trainloss=[] -valloss=[] -epochs=[] -i=0 -for line in infile: - if len(line)<1: continue - trainloss.append(line.split(' ')[0]) - valloss.append(line.split(' ')[1]) - epochs.append(i) - i=i+1 - -import matplotlib.pyplot as plt -f = plt.figure() -plt.plot(epochs,trainloss,'r',epochs,valloss,'b') -plt.ylabel('loss') -plt.xlabel('epoch') -if len(args.range)==2: - plt.ylim(args.range) -f.savefig(args.inputDir+'/'+"losses.pdf") +plotLoss(infilename,args.inputDir+'/losses.pdf',args.range) diff --git a/scripts/readPrediction.sh b/scripts/readPrediction.sh new file mode 100755 index 0000000..91982d7 --- /dev/null +++ b/scripts/readPrediction.sh @@ -0,0 +1,9 @@ + +#!/bin/bash + + + +export ROOT_TREEASSOCIATIONINFILE=$1 +export CPLUS_INCLUDE_PATH=$DEEPJET/modules/interface +export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/lib64/root +root -l $DEEPJET/scripts/loadTreeAssociation.C \ No newline at end of file
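readPrediction.sh and loadTreeAssociation.C above are meant for interactive inspection of a prediction through its tree_association.txt. A stand-alone variant is sketched below (not part of the patch): it skips the ROOT_TREEASSOCIATIONINFILE environment variable and uses only the friendTreeInjector calls that appear in the macro above; the association path and the branch/selection names are placeholders that depend on the actual training output.

// --- editor's sketch (C++/ROOT macro), not part of the patch ---
// Build the friend-tree chain directly and draw one discriminator.
#include "TChain.h"
#include "../src/friendTreeInjector.cpp"   // same include style as loadTreeAssociation.C

void inspectPrediction(const char* association = "tree_association.txt")   // placeholder path
{
    friendTreeInjector injector;
    injector.addFromFile(association);
    injector.createChain();
    TChain* tree = injector.getChain();

    // placeholder branch and selection names; adapt to the branches of the actual prediction trees
    tree->Draw("prob_isB", "isB");
}
// --- end sketch ---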
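For reference back to rocCurve.cpp earlier in this patch: the new loglist() helper replaces the hard-coded misid working points {0.001, 0.01, 0.1, 0.2, 1.} with 100 logarithmically spaced ones. The sketch below (not part of the patch) mirrors that helper under a different name so its behaviour, including the floating-point edge case in the loop bound, can be checked in isolation.

// --- editor's sketch (C++), not part of the patch ---
// Log-spaced misid working points, mirroring loglist() in rocCurve.cpp.
#include <cmath>
#include <cstdio>
#include <utility>
#include <vector>

static std::vector<double> loglist_sketch(double first, double last, double size)
{
    if (first > last) std::swap(first, last);
    const double logfirst = std::log10(first);
    const double loglast  = std::log10(last);
    const double step = (loglast - logfirst) / (size - 1.);
    std::vector<double> out;
    // note: accumulating x in floating point can drop the very last point (x may end up slightly above loglast)
    for (double x = logfirst; x <= loglast; x += step)
        out.push_back(std::pow(10., x));
    return out;
}

int main()
{
    const std::vector<double> pts = loglist_sketch(0.001, 1., 100);   // same call as in rocCurve::process()
    std::printf("%zu points, first=%.4g, last=%.4g\n", pts.size(), pts.front(), pts.back());
    return 0;
}
// --- end sketch ---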