more runs and added wikitext eval
brando90 committed Jan 30, 2024
1 parent beffb88 commit 0b72bfa
Showing 3 changed files with 65 additions and 56 deletions.
2 changes: 1 addition & 1 deletion main_krbtmux.sh
@@ -40,7 +40,7 @@ reauth

source $AFS/.bashrc
conda activate beyond_scale
export CUDA_VISIBLE_DEVICES=0
export CUDA_VISIBLE_DEVICES=2
echo CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES
# export CUDA_VISIBLE_DEVICES=$(nvidia-smi --query-gpu=index,memory.free --format=csv,noheader,nounits | sort -k2 -nr | head -n 1 | awk -F ', ' '{print $1}')
echo CUDA_VISIBLE_DEVICES = $CUDA_VISIBLE_DEVICES
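The commented nvidia-smi one-liner above is the alternative to hard-coding the GPU index: it picks whichever GPU currently has the most free memory. A minimal Python sketch of the same idea (an assumption-laden illustration, assuming nvidia-smi is on PATH and that this runs before torch initializes CUDA):

import os
import subprocess

# Same query as the commented one-liner above: free memory per GPU, CSV without units.
out = subprocess.check_output(
    ["nvidia-smi", "--query-gpu=index,memory.free", "--format=csv,noheader,nounits"],
    text=True,
)
# Each line looks like "2, 40321"; keep the index with the most free MiB.
rows = [line.split(", ") for line in out.strip().splitlines()]
best_idx = max(rows, key=lambda r: int(r[1]))[0]
os.environ["CUDA_VISIBLE_DEVICES"] = best_idx  # must be set before CUDA is initialized
print(f"CUDA_VISIBLE_DEVICES = {best_idx}")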
43 changes: 26 additions & 17 deletions src/training/train.py
@@ -86,13 +86,14 @@ def train():
mode = 'dryrun'; seed = 0; report_to = 'none'

# - Online (real experiment)
# mode = 'online'; seed = 0; report_to = 'wandb'
mode = 'online'; seed = 0; report_to = 'wandb'

# - train data sets
# -- Train data sets
# path, name, data_files, split = ['c4'], ['en'], [None], ['train']
# - UDACA's
path, name, data_files, split = ['UDACA/PileSubsets'], ['uspto'], [None], ['train']
# path, name, data_files, split = ['UDACA/PileSubsets'], ['pubmed'], [None], ['train']
# path, name, data_files, split = ['UDACA/PileSubsets', 'UDACA/PileSubsets'], ['uspto', 'pubmed'], [None, None], ['train', 'train']
path, name, data_files, split = ['UDACA/PileSubsets'], ['pubmed'], [None], ['train']
path, name, data_files, split = ['UDACA/PileSubsets', 'UDACA/PileSubsets'], ['uspto', 'pubmed'], [None, None], ['train', 'train']
# - models
# pretrained_model_name_or_path = 'gpt2' # this is the smallest model gpt2, 124M params https://huggingface.co/gpt2
# pretrained_model_name_or_path = 'meta-llama/Llama-2-7b-hf'
@@ -106,7 +107,7 @@ def train():
max_steps = 2
# max_steps = 300
# max_steps = 866 # <- CHANGE THIS 12hs with baby llama2 v1 36m 1, 32
# max_steps = 1_553 # 13.5hs llama2 full reinit 4*8=32=B 1024=L for 6.3M tokens
max_steps = 1_553 # 22-24hs llama2 full reinit 4*8=32=B 1024=L for 6.3M tokens
# max_steps = 5_000
# max_steps = 61_036 # 3.8 days for B=32 L=512 rate=5.43secs/it for 1B=1e9tokens
# max_steps = 78_853 # 4.6 days L=512 B=32 r=5.43 ~1.21B 29,999MiB
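The hour and token estimates in these comments follow from tokens ≈ max_steps × effective_batch_size × block_size. A quick sanity check of the 1B-token figure (a sketch using the B=32, L=512 values stated in the comments above):

# Assumed relation behind the comments above: tokens_seen ≈ max_steps * batch_size * block_size
max_steps, batch_size, block_size = 61_036, 32, 512
tokens_seen = max_steps * batch_size * block_size
print(f"{tokens_seen:,}")  # 1,000,013,824 ≈ 1e9, matching the '1B=1e9tokens' comment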
@@ -239,6 +240,7 @@ def train():
print(f"CUDA version: {torch.version.cuda=}")
eval_hf_with_subsample('UDACA/pile_openwebtext2', None, 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=2, print_str='> Eval OpenWebtext rand mdl')
eval_hf_with_subsample('c4', 'en', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=2, print_str='> Eval C4 rand mdl')
eval_hf_with_subsample('wikitext', 'wikitext-103-v1', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=2, print_str='> Eval wikitext rand mdl')

# --- Load datasets
# -- Get train data set
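The wikitext line added above goes through eval_hf_with_subsample (see utils.py below). A minimal sketch of what that call presumably reduces to, assuming it follows the same load/tokenize/subsample pattern as the (now commented-out) whole_eval helper in utils.py:

# Hypothetical expansion of the wikitext eval call above (assumes the whole_eval-style pipeline).
from datasets import load_dataset

eval_dataset = load_dataset('wikitext', 'wikitext-103-v1', streaming=True,
                            split='validation').with_format("torch")
# raw_dataset_2_lm_data (repo helper in utils.py) tokenizes and packs text into block_size chunks;
# tokenizer and block_size come from the surrounding train() setup.
eval_dataset = raw_dataset_2_lm_data(eval_dataset, tokenizer, block_size)
eval_dataset = eval_dataset.take(2)  # max_eval_samples=2 for the quick random-model check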
@@ -312,25 +314,32 @@ def train():
trainer.train()
trainer.save_model(output_dir=output_dir) # TODO is this really needed? https://discuss.huggingface.co/t/do-we-need-to-explicity-save-the-model-if-the-save-steps-is-not-a-multiple-of-the-num-steps-with-hf/56745

# -- Evaluation, NOTE: we are evaluating at the end not during training
# --- Evaluation, NOTE: we are evaluating at the end not during training
print()
# - Evaluate model on OpenWebtext
# -- Eval subsample
print('---- Evaluate model on OpenWebtext')
metrics = eval_hf_with_subsample('UDACA/pile_openwebtext2', None, 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=4)
print(f'OpenWebtext: {metrics=}')
# - Evaluate on C4
metrics = eval_hf_with_subsample('UDACA/pile_openwebtext2', None, 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=8)
print(f'OpenWebtext (8 val samples): {metrics=}')
print('---- Evaluate model on C4')
metrics = eval_hf_with_subsample('c4', 'en', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=4)
print(f'C4: {metrics=}')
# - Evaluate on whole datasets
metrics = eval_hf_with_subsample('c4', 'en', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=8)
print(f'C4 (8 val samples): {metrics=}')
print('---- Evaluate model on wikitext-103-v1')
metrics = eval_hf_with_subsample('wikitext', 'wikitext-103-v1', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=8)
print(f'Wikitext (8 val samples): {metrics=}')

# -- Eval whole datasets
print('---- Evaluate model on Whole OpenWebtext')
metrics = eval_hf_with_subsample('UDACA/pile_openwebtext2', None, 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=None)
# eval_hf(trainer=Trainer(model=model, args=eval_args, train_dataset=None, eval_dataset=eval_dataset1))
print(f'OpenWebtext: {metrics=}')
print(f'OpenWebtext whole: {metrics=}')
print('---- Evaluate model on Whole C4')
metrics = eval_hf_with_subsample('c4', 'en', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=None)
# eval_hf(trainer=Trainer(model=model, args=eval_args, train_dataset=None, eval_dataset=eval_dataset2))
print(f'C4: {metrics=}')
print(f'C4 whole: {metrics=}')
print('---- Evaluate model on Whole wikitext-103-v1')
metrics = eval_hf_with_subsample('wikitext', 'wikitext-103-v1', 'validation', model, tokenizer, block_size, output_dir, max_eval_samples=None)
print(f'Wikitext whole: {metrics=}')

# -- Print config to show in log what this run was especially data set
print(f'{wandb.config=}')
print('Done!\a')

def main():
76 changes: 38 additions & 38 deletions src/training/utils.py
@@ -9,7 +9,7 @@
from itertools import chain
import math
import random
from typing import Optional
from typing import Optional, Any

import torch

@@ -352,48 +352,48 @@ def compute_metrics(eval_preds):
preds = preds[:, :-1].reshape(-1)
return metric.compute(predictions=preds, references=labels)

def whole_eval(model,
path,
name,
split,
tokenizer,
block_size,
output_dir,
max_eval_samples: int = 1028,
streaming: bool = True,
):
"""
path, name, split = 'suolyer/pile_openwebtext2', None, 'validation' # the one sudharsan used
"""
eval_dataset = load_dataset(path, name, streaming=streaming, split=split).with_format("torch")
eval_dataset = raw_dataset_2_lm_data(eval_dataset, tokenizer, block_size)
eval_dataset = eval_dataset.take(max_eval_samples)

print(f'Saving eval results at: {output_dir=}') # The output directory where the model predictions and checkpoints will be written.
eval_args = TrainingArguments(output_dir=output_dir, fp16=False, bf16=torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8)

trainer = Trainer(model=model, args=eval_args, train_dataset=None, eval_dataset=eval_dataset)
metrics = trainer.evaluate()
try:
perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
perplexity = float("inf")
metrics["perplexity"] = perplexity
print(f'Eval metrics: {metrics=}')
trainer.log_metrics("eval", metrics) # display metrics
trainer.save_metrics("eval", metrics)
return metrics

def eval_hf(trainer: Trainer, path, name, split,):
# def whole_eval(model,
# path,
# name,
# split,
# tokenizer,
# block_size,
# output_dir,
# max_eval_samples: int = 1028,
# streaming: bool = True,
# ):
# """
# path, name, split = 'suolyer/pile_openwebtext2', None, 'validation' # the one sudharsan used
# """
# eval_dataset = load_dataset(path, name, streaming=streaming, split=split).with_format("torch")
# eval_dataset = raw_dataset_2_lm_data(eval_dataset, tokenizer, block_size)
# eval_dataset = eval_dataset.take(max_eval_samples)

# print(f'Saving eval results at: {output_dir=}') # The output directory where the model predictions and checkpoints will be written.
# eval_args = TrainingArguments(output_dir=output_dir, fp16=False, bf16=torch.cuda.get_device_capability(torch.cuda.current_device())[0] >= 8)

# trainer = Trainer(model=model, args=eval_args, train_dataset=None, eval_dataset=eval_dataset)
# metrics = trainer.evaluate()
# try:
# perplexity = math.exp(metrics["eval_loss"])
# except OverflowError:
# perplexity = float("inf")
# metrics["perplexity"] = perplexity
# print(f'Eval metrics: {metrics=}')
# trainer.log_metrics("eval", metrics) # display metrics
# trainer.save_metrics("eval", metrics)
# return metrics

def eval_hf(trainer: Trainer, path, name, split, max_eval_samples: Any = 'Unknown Eval Max Samples',):
metrics = trainer.evaluate()
try:
perplexity = math.exp(metrics["eval_loss"])
except OverflowError:
perplexity = float("inf")
metrics["perplexity"] = perplexity
print(f'Eval metrics: {metrics=}')
trainer.log_metrics(f"eval_{path}_{name}_{split}", metrics) # display metrics
trainer.save_metrics(f"eval_{path}_{name}_{split}", metrics)
print(f'Eval metrics {path} {name} {split} {max_eval_samples}: {metrics=}')
trainer.log_metrics(f"eval_{path}_{name}_{split}_{max_eval_samples}", metrics) # display metrics
trainer.save_metrics(f"eval", metrics)
return metrics

def eval_hf_with_subsample(path, name, split, model, tokenizer, block_size, output_dir,
@@ -413,7 +413,7 @@ def eval_hf_with_subsample(path, name, split, model, tokenizer, block_size, outp
trainer = Trainer(model=model, args=eval_args, train_dataset=None, eval_dataset=eval_batch2)
metrics = eval_hf(trainer, path, name, split,)
if verbose:
print(f'----> {path=}, {name=}, {split=}, {metrics=}')
print(f'----> {path=}, {name=}, {split=}, {metrics=}, {max_eval_samples=}')
if print_str is not None:
print(print_str)
return metrics
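The try/except around math.exp in eval_hf above exists because exp overflows a 64-bit float once eval_loss exceeds roughly 709.8. A minimal sketch of that guard in isolation:

import math

def loss_to_ppl(eval_loss: float) -> float:
    # Same guard as eval_hf: math.exp raises OverflowError past ~709.78 for float64.
    try:
        return math.exp(eval_loss)
    except OverflowError:
        return float("inf")

print(loss_to_ppl(2.3))    # ~9.97, a typical LM eval loss -> perplexity
print(loss_to_ppl(800.0))  # inf instead of crashing the eval loop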
