Commit 34028c63 authored by Haoran Li's avatar Haoran Li Committed by Facebook Github Bot
Browse files

move distributed_init after get_batch_iterator

Summary: There are constant wait-timeout issues when using multiple nodes; even setting copylocallytempdir:/ doesn't help, e.g. f105637629. The timeouts go away after moving distributed_init to after get_batch_iterator, e.g. f106520580.

Reviewed By: myleott

Differential Revision: D14817769

fbshipit-source-id: edbb101a28d8082241c7bdd8c5500c9dad27647c
parent 40ac340b
Loading
Loading
Loading
Loading
+6 −6
Original line number Diff line number Diff line
@@ -41,12 +41,6 @@ def main(args, init_distributed=False):
    # Load dataset splits
    load_dataset_splits(args, task)

    # Initialize distributed training (after data loading)
    if init_distributed:
        import socket
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))

    # Build model and criterion
    model = task.build_model(args)
    criterion = task.build_criterion(args)
@@ -89,6 +83,12 @@ def main(args, init_distributed=False):
        num_workers=args.num_workers,
    )

    # Initialize distributed training (after data loading)
    if init_distributed:
        import socket
        args.distributed_rank = distributed_utils.distributed_init(args)
        print('| initialized host {} as rank {}'.format(socket.gethostname(), args.distributed_rank))

    # Load the latest checkpoint if one is available
    if not load_checkpoint(args, trainer, epoch_itr):
        trainer.dummy_train_step([dummy_batch])