Commit a3e9b18: setup logging
bastiscode committed Aug 7, 2024
1 parent: 721b63c
Showing 1 changed file with 3 additions and 0 deletions.
python/text_utils/api/trainer.py: 3 additions & 0 deletions
@@ -851,6 +851,7 @@ def _train_local_distributed(
         directories: dict[str, str],
         profile: str | None = None
     ):
+        logging.setup_logging()
         os.environ["MASTER_ADDR"] = "localhost"
         os.environ["MASTER_PORT"] = str(port)

@@ -890,6 +891,7 @@ def train_slurm(cls, work_dir: str, experiment_dir: str, config_path: str):
         assert torch.cuda.device_count() > 0, "need at least one GPU for training, but found none"
         assert dist.is_available(), "distributed package must be available for training"
         assert dist.is_nccl_available(), "nccl backend for distributed training must be available"
+        logging.setup_logging()
         logger = logging.get_logger("SLURM_INITIALIZATION")
         num_gpus = torch.cuda.device_count()
         logger.info(f"Found {num_gpus} GPU{'s' * (num_gpus > 1)} "
@@ -967,6 +969,7 @@ def train_slurm(cls, work_dir: str, experiment_dir: str, config_path: str):

     @classmethod
     def train_local(cls, work_dir: str, experiment_dir: str, config_path: str, profile: str | None = None):
+        logging.setup_logging()
         logger = logging.get_logger("LOCAL_INITIALIZATION")
         num_gpus = torch.cuda.device_count()
         assert num_gpus > 0, "need at least one GPU for local training"
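
Note: the `logging` name used here appears to be the repository's own logging helper rather than the standard-library `logging` package, since the standard library has neither `setup_logging` nor `get_logger`; its implementation is not part of this diff. The pattern the commit establishes is the same in all three entry points (`_train_local_distributed`, `train_slurm`, `train_local`): configure logging once at the start of the process, then request named loggers. Below is a minimal sketch of what such a helper pair could look like, assuming a thin wrapper around the standard library. Only the function names mirror the calls in the diff; the format string, level, and handler choice are illustrative, not the repository's actual code.

    # sketch.py: hypothetical stand-in for the repository's logging helpers.
    # Only the function names mirror the diff; everything else is assumed.
    import logging
    import sys


    def setup_logging(level: int = logging.INFO) -> None:
        # Configure the root logger once so that every logger created
        # afterwards inherits the same handler, level, and format.
        logging.basicConfig(
            level=level,
            format="[%(asctime)s] {%(name)s} %(levelname)s: %(message)s",
            stream=sys.stderr,
            force=True,  # replace handlers other libraries may have installed
        )


    def get_logger(name: str) -> logging.Logger:
        # Return a named logger that uses the root configuration above.
        return logging.getLogger(name)


    if __name__ == "__main__":
        # Mirrors the order used in the commit: set up logging first,
        # then create a logger for the initialization phase.
        setup_logging()
        logger = get_logger("LOCAL_INITIALIZATION")
        logger.info("logging configured")

Calling setup_logging() inside each entry point, rather than once at import time, is presumably the motivation here: worker processes started via spawn for local distributed training, or launched separately under SLURM, do not inherit the parent's logging configuration, so each process has to configure it before creating its logger.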
