From 30bee605264c37931cabfc092551fbd704710b18 Mon Sep 17 00:00:00 2001 From: mpielies Date: Wed, 7 Aug 2024 17:30:04 +0200 Subject: [PATCH 1/3] :zap: :fire: Correct duplicated scaling (scale only once) Correct kld_warmup in training_loop.py: - kld_w as a fraction of beta, no dependence on num_latent --- src/move/tasks/encode_data.py | 6 ------ src/move/training/training_loop.py | 6 ++---- 2 files changed, 2 insertions(+), 10 deletions(-) diff --git a/src/move/tasks/encode_data.py b/src/move/tasks/encode_data.py index f0a83633..2d5076d0 100644 --- a/src/move/tasks/encode_data.py +++ b/src/move/tasks/encode_data.py @@ -59,12 +59,6 @@ def encode_data(config: DataConfig): ) fig.savefig(fig_path) - values, mask_1d = preprocessing.scale(values) - names = names[mask_1d] - logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}") - io.dump_names(interim_data_path / f"{dataset_name}.txt", names) - np.save(interim_data_path / f"{dataset_name}.npy", values) - # Plotting the value distribution for all continuous datasets: fig = plot_value_distributions(values) fig_path = str(output_path / f"Value_distribution_{dataset_name}.png") diff --git a/src/move/training/training_loop.py b/src/move/training/training_loop.py index 2405d676..64d6af11 100644 --- a/src/move/training/training_loop.py +++ b/src/move/training/training_loop.py @@ -72,13 +72,11 @@ def training_loop( counter = 0 kld_weight = 0.0 - kld_rate = 20 / len(kld_warmup_steps) - kld_multiplier = 1 + kld_rate + for epoch in range(1, num_epochs + 1): if epoch in kld_warmup_steps: - kld_weight = 0.05 * kld_multiplier - kld_multiplier += kld_rate + kld_weight += 1 / len(kld_warmup_steps) if epoch in batch_dilation_steps: train_dataloader = dilate_batch(train_dataloader) From feefc767818bff5d0e1e58c23a77a897667b7663 Mon Sep 17 00:00:00 2001 From: mpielies Date: Thu, 8 Aug 2024 13:00:47 +0200 Subject: [PATCH 2/3] :art: :wrench: Log2, scaling and linting - Optional log2 transform of continuous data - Black formatting of training_loop.py --- src/move/conf/schema.py | 1 + src/move/data/preprocessing.py | 6 ++++-- src/move/tasks/encode_data.py | 19 ++++++++++--------- src/move/training/training_loop.py | 1 - tutorial/config/data/random_continuous.yaml | 4 ++++ tutorial/config/data/random_small.yaml | 4 ++++ 6 files changed, 23 insertions(+), 12 deletions(-) diff --git a/src/move/conf/schema.py b/src/move/conf/schema.py index 8a91fe58..1f2fdeb6 100644 --- a/src/move/conf/schema.py +++ b/src/move/conf/schema.py @@ -32,6 +32,7 @@ class InputConfig: @dataclass class ContinuousInputConfig(InputConfig): scale: bool = True + log2: bool = False @dataclass diff --git a/src/move/data/preprocessing.py b/src/move/data/preprocessing.py index e4b5dcce..269774f7 100644 --- a/src/move/data/preprocessing.py +++ b/src/move/data/preprocessing.py @@ -64,7 +64,7 @@ def one_hot_encode_single(mapping: dict[str, int], value: Optional[str]) -> IntA return encoded_value -def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]: +def scale(x: np.ndarray, log2: bool = False) -> tuple[FloatArray, BoolArray]: """Center to mean and scale to unit variance. Convert NaN values to 0. Args: @@ -74,7 +74,9 @@ def scale(x: np.ndarray) -> tuple[FloatArray, BoolArray]: Tuple containing (1) scaled output and (2) a 1D mask marking columns (i.e., features) without zero variance """ - logx = np.log2(x + 1) + logx = x + if log2: + logx = np.log2(x + 1) mask_1d = ~np.isclose(np.nanstd(logx, axis=0), 0.0) scaled_x = standardize(logx[:, mask_1d], axis=0) scaled_x[np.isnan(scaled_x)] = 0 diff --git a/src/move/tasks/encode_data.py b/src/move/tasks/encode_data.py index 2d5076d0..6951147e 100644 --- a/src/move/tasks/encode_data.py +++ b/src/move/tasks/encode_data.py @@ -55,18 +55,19 @@ def encode_data(config: DataConfig): # before preprocessing: fig = plot_value_distributions(values) fig_path = str( - output_path / "Value_distribution_{}_unprocessed.png".format(dataset_name) + output_path / f"Value_distribution_{dataset_name}_unprocessed.png" ) fig.savefig(fig_path) - # Plotting the value distribution for all continuous datasets: - fig = plot_value_distributions(values) - fig_path = str(output_path / f"Value_distribution_{dataset_name}.png") - fig.savefig(fig_path) - if scale: - values, mask_1d = preprocessing.scale(values) + logger.debug(f"Scaling dataset: {dataset_name}, log2 transform: {input_config.log2}") + values, mask_1d = preprocessing.scale(values, input_config.log2) names = names[mask_1d] logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}") - io.dump_names(interim_data_path / f"{input_config.name}.txt", names) - np.save(interim_data_path / f"{input_config.name}.npy", values) + # Plotting the value distribution for all continuous datasets: + fig = plot_value_distributions(values) + fig_path = str(output_path / f"Value_distribution_{dataset_name}.png") + fig.savefig(fig_path) + + io.dump_names(interim_data_path / f"{dataset_name}.txt", names) + np.save(interim_data_path / f"{dataset_name}.npy", values) diff --git a/src/move/training/training_loop.py b/src/move/training/training_loop.py index 64d6af11..e81667b0 100644 --- a/src/move/training/training_loop.py +++ b/src/move/training/training_loop.py @@ -73,7 +73,6 @@ def training_loop( kld_weight = 0.0 - for epoch in range(1, num_epochs + 1): if epoch in kld_warmup_steps: kld_weight += 1 / len(kld_warmup_steps) diff --git a/tutorial/config/data/random_continuous.yaml b/tutorial/config/data/random_continuous.yaml index 53150799..b33402fa 100755 --- a/tutorial/config/data/random_continuous.yaml +++ b/tutorial/config/data/random_continuous.yaml @@ -17,4 +17,8 @@ categorical_inputs: [] # no categorical inputs continuous_inputs: # a list of continuous datasets - name: random.continuous.proteomics + log2: true + scale: true - name: random.continuous.metagenomics + log2: true + scale: true diff --git a/tutorial/config/data/random_small.yaml b/tutorial/config/data/random_small.yaml index 5f1e8d18..d9e32564 100644 --- a/tutorial/config/data/random_small.yaml +++ b/tutorial/config/data/random_small.yaml @@ -18,4 +18,8 @@ categorical_inputs: # a list of categorical datasets continuous_inputs: # a list of continuous datasets - name: random.small.proteomics + log2: true #apply log2 before scaling + scale: true #scale data (z-score normalize) - name: random.small.metagenomics + log2: true + scale: true From 8d0587a6986e4142e95e4e2ba65963615e784e72 Mon Sep 17 00:00:00 2001 From: mpielies Date: Thu, 8 Aug 2024 13:03:50 +0200 Subject: [PATCH 3/3] Whole repo formatted using black --- src/move/tasks/encode_data.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/move/tasks/encode_data.py b/src/move/tasks/encode_data.py index 6951147e..d90f76f1 100644 --- a/src/move/tasks/encode_data.py +++ b/src/move/tasks/encode_data.py @@ -60,7 +60,9 @@ def encode_data(config: DataConfig): fig.savefig(fig_path) if scale: - logger.debug(f"Scaling dataset: {dataset_name}, log2 transform: {input_config.log2}") + logger.debug( + f"Scaling dataset: {dataset_name}, log2 transform: {input_config.log2}" + ) values, mask_1d = preprocessing.scale(values, input_config.log2) names = names[mask_1d] logger.debug(f"Columns with zero variance: {np.sum(~mask_1d)}")