From bc990d405a382553179b14a2f43272b6fca42777 Mon Sep 17 00:00:00 2001
From: MartinXPN
Date: Tue, 23 Feb 2021 01:57:26 +0400
Subject: [PATCH] train with weights and biases, added gradient clipping

---
 abcde/models.py |  2 +-
 setup.py        |  1 +
 train.py        | 24 ++++++++++++++++--------
 3 files changed, 18 insertions(+), 9 deletions(-)

diff --git a/abcde/models.py b/abcde/models.py
index d19fba8..1c8b9f9 100644
--- a/abcde/models.py
+++ b/abcde/models.py
@@ -77,7 +77,7 @@ def validation_step(self, batch, batch_idx: int) -> List[Dict[str, float]]:
         return history
 
     def configure_optimizers(self):
-        optimizer = torch.optim.Adam(self.parameters(), lr=1e-3)
+        optimizer = torch.optim.Adam(self.parameters(), lr=1e-2)
         scheduler = ReduceLROnPlateau(optimizer, mode='max', patience=self.lr_reduce_patience, factor=0.7, min_lr=1e-5)
         return {
             'optimizer': optimizer,
diff --git a/setup.py b/setup.py
index 5ddc0c9..d7809ff 100644
--- a/setup.py
+++ b/setup.py
@@ -26,6 +26,7 @@
         'tqdm>=4.54.1',
         'fire>=0.3.1',
         'aim>=2.1.4',
+        'wandb>=0.10.17',
         'knockknock>=0.1.8.1',
     ],
     extras_require={
diff --git a/train.py b/train.py
index 079a235..ec4c15d 100644
--- a/train.py
+++ b/train.py
@@ -5,7 +5,7 @@
 from knockknock import telegram_sender
 from pytorch_lightning import Trainer
 from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint, LearningRateMonitor
-from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger
+from pytorch_lightning.loggers import TensorBoardLogger, CSVLogger, WandbLogger
 
 from abcde.data import GraphDataModule
 from abcde.models import ABCDE
@@ -14,35 +14,43 @@
 # Fix the seed for reproducibility
 fix_random_seed(42)
 
-experiment = ExperimentSetup(name='drop', create_latest=True, long_description="""
+experiment = ExperimentSetup(name='grad-clip', create_latest=True, long_description="""
 Try dropping edges while training
 Graphs are only of 'powerlaw' type.
 Use unique convolutions.
 Use blocks of convolutions followed with max pooling and skip connections
+Use gradient clipping
 """)
 
 torch.multiprocessing.set_sharing_strategy('file_system')
 
+
+def fit(t: Trainer):
+    t.fit(model, datamodule=data)
+    return t.callback_metrics
+
+
 if __name__ == '__main__':
     loggers = [
         CSVLogger(experiment.log_dir, name='history'),
         TensorBoardLogger(experiment.log_dir, name=experiment.name, default_hp_metric=False),
+        WandbLogger(name=experiment.name, save_dir=experiment.log_dir, project='abcde', save_code=True, notes=experiment.long_description),
         AimLogger(experiment=experiment.name),
     ]
     # Previous best: nb_gcn_cycles=(4, 4, 6, 6, 8), conv_sizes=(64, 64, 32, 32, 16), drops=(0, 0, 0, 0, 0)
     model = ABCDE(nb_gcn_cycles=(4, 4, 6, 6, 8, 8),
-                  conv_sizes=(64, 48, 32, 32, 24, 24),
-                  drops=(0.4, 0.3, 0.2, 0.2, 0.1, 0.1),
+                  conv_sizes=(48, 48, 32, 32, 24, 24),
+                  drops=(0.3, 0.3, 0.2, 0.2, 0.1, 0.1),
                   lr_reduce_patience=2, dropout=0.1)
     data = GraphDataModule(min_nodes=4000, max_nodes=5000, nb_train_graphs=160, nb_valid_graphs=240, batch_size=16,
                            graph_type='powerlaw', repeats=8, regenerate_epoch_interval=10, cache_dir=Path('datasets') / 'cache')
-    trainer = Trainer(logger=loggers,
+    trainer = Trainer(logger=loggers, gradient_clip_val=0.1,
                       gpus=-1 if torch.cuda.is_available() else None, auto_select_gpus=True,
-                      max_epochs=100, terminate_on_nan=True, enable_pl_optimizer=True,
+                      max_epochs=50, terminate_on_nan=True, enable_pl_optimizer=True,
                       reload_dataloaders_every_epoch=True,
                       callbacks=[
-                          EarlyStopping(monitor='val_kendal', patience=5, verbose=True, mode='max'),
+                          EarlyStopping(monitor='val_kendal', patience=6, verbose=True, mode='max'),
                           ModelCheckpoint(dirpath=experiment.model_save_path, filename='drop-{epoch:02d}-{val_kendal:.2f}', monitor='val_kendal', save_top_k=5, verbose=True, mode='max'),
                           LearningRateMonitor(logging_interval='epoch'),
                       ])
-    trainer.fit(model, datamodule=data)
+    fit(trainer)
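
Note on gradient_clip_val=0.1: with this setting PyTorch Lightning clips the global gradient norm before every optimizer step, which presumably is what keeps training stable at the raised 1e-2 learning rate. The snippet below is a minimal sketch of the equivalent manual loop, not the Lightning internals or project code; train_one_epoch, batches, and compute_loss are placeholder names introduced only for illustration.

import torch
from torch import nn


# Sketch of a training loop with norm-based gradient clipping, roughly what
# Trainer(gradient_clip_val=0.1) asks Lightning to do on each optimizer step.
# `batches` yields training batches; `compute_loss` maps (model, batch) to a loss tensor.
def train_one_epoch(model: nn.Module, batches, compute_loss, lr: float = 1e-2, clip_val: float = 0.1) -> None:
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    for batch in batches:
        optimizer.zero_grad()
        loss = compute_loss(model, batch)
        loss.backward()
        # Rescale gradients so their combined L2 norm does not exceed clip_val
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip_val)
        optimizer.step()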