diff --git a/tsimcne/tsimcne.py b/tsimcne/tsimcne.py
index abbb328..de5885c 100644
--- a/tsimcne/tsimcne.py
+++ b/tsimcne/tsimcne.py
@@ -44,7 +44,6 @@ def __init__(
         weight_decay=5e-4,
         momentum=0.9,
         warmup_epochs=10,
-        dim_anneal_strategy="pca",
         batches_per_epoch=None,
         random_state=None,
         save_intermediate_feat=False,
@@ -79,7 +78,6 @@ def __init__(
         self.weight_decay = weight_decay
         self.momentum = momentum
         self.warmup_epochs = warmup_epochs
-        self.dim_anneal_strategy = dim_anneal_strategy
         self.batches_per_epoch = batches_per_epoch
         self.random_state = random_state
         self.save_intermediate_feat = save_intermediate_feat
@@ -226,21 +224,6 @@ def _handle_parameters(self):
                 f"got {self.optimizer_name}."
             )
 
-        # if self.lr_scheduler_name not in ["cos_annealing", "constant"]:
-        #     raise ValueError(
-        #         "Only 'cos_annealing' and 'constant' is supported as "
-        #         f"learning rate scheduler, got {self.lr_scheduler_name}."
-        #     )
-
-        if (
-            self.dim_anneal_strategy is not None
-            and self.dim_anneal_strategy != "pca"
-        ):
-            raise ValueError(
-                "Expected None or 'pca' for dim_anneal strategy, got "
-                f"{self.dim_anneal_strategy}."
-            )
-
         self.alphas = torch.sin(
             torch.linspace(0, 1, self.n_epochs) * torch.pi / 2
         )
@@ -329,100 +312,9 @@ def on_train_epoch_start(self):
         else:
             self.log("dof", self.cur_dof, prog_bar=False)
 
-        prev_dim_mask = self.dim_mask
         self.dim_mask = self.dim_mask_schedule[self.current_epoch]
         self.cur_dof = self.dofs[self.current_epoch]
-        _embeddings = self._embeddings = self.train_embeddings
-
-        # next_output_dim = self.out_dim + self.dim_mask.stop
-        # vv and next_output_dim < 10
-        do_pca = (
-            self.dim_anneal_strategy == "pca"
-            and prev_dim_mask != self.dim_mask
-            and not isinstance(self.model.projection_head, torch.nn.Identity)
-        )
-        if do_pca:
-            layer = self.model.projection_head.layers[2]
-            self.weights = weights = layer.weight[prev_dim_mask]
-            # bias = layer.bias[prev_dim_mask]
-
-            # weights.T.cpu().detach()
-            embs = torch.vstack(_embeddings).cpu().detach().float()
-            unused_w = (
-                layer.weight[self.dim_mask.stop :]
-                .cpu()
-                .detach()
-                .float()
-                .T.numpy()
-            )
-            # unused_b = (
-            #     layer.bias[self.dim_mask.stop :].cpu().detach().float().numpy()
-            # )
-
-            from sklearn.decomposition import PCA
-
-            self.pca = pca = PCA(
-                min(self.out_dim, self.out_dim + self.dim_mask.stop)
-            ).fit(embs)
-            # make_pipeline(
-            #     StandardScaler(with_std=False),
-            #     TruncatedSVD(
-            #         min(self.out_dim, self.out_dim + self.dim_mask.stop)
-            #     ),
-            # )
-
-            # _pca_w = pca[1].transform(weights.T.cpu().detach().float().numpy())
-            _pca_w = (
-                weights.T.cpu().detach().float().numpy() @ pca.components_.T
-            )
-            # _e = embs.numpy()
-            # _w = weights.detach().cpu().numpy()
-            rotated_w = _pca_w  # * _w.std()  # + _w.mean()
-            transformed_weight = np.hstack((rotated_w, unused_w))
-
-            odict = self.optimizers().optimizer.state
-            mdict = odict[layer.weight]
-            # mdict.clear()
-            # print(len(odict.keys()), odict[weights])
-            ## attempt at pca-transforming the momentum
-            mbuf = mdict["momentum_buffer"]
-            momentum = mdict["momentum_buffer"][prev_dim_mask]
-            unused_m = (
-                mdict["momentum_buffer"][self.dim_mask.stop :]
-                .cpu()
-                .detach()
-                .float()
-                .T.numpy()
-            )
-            _pca_m = (
-                momentum.T.cpu().detach().float().numpy() @ pca.components_.T
-            )
-            # _m = mbuf.detach().cpu().numpy()
-            rotated_m = _pca_m
-            transformed_mbuf = np.hstack((rotated_m, unused_m))
-            mdict["momentum_buffer"][:] = torch.from_numpy(
-                transformed_mbuf.T
-            ).to(dtype=mbuf.dtype)
-
-            # rotated_b = pca.transform(
-            #     np.array([bias.cpu().detach().float().numpy()])
-            # ).squeeze()
-            # transformed_bias = np.hstack((rotated_b, unused_b))
-
-            self.pca = pca
-            self.rotated_w = rotated_w
-            self.transformed_weight = transformed_weight
-
-            # raise RuntimeError("now go inspect")
-            sd = dict(
-                weight=torch.from_numpy(transformed_weight.T).to(
-                    dtype=weights.dtype
-                ),
-                # bias=torch.from_numpy(transformed_bias).bfloat16(),
-            )
-            layer.load_state_dict(sd, strict=False)
-
         # reset the train embeddings for the next epoch
         self.train_embeddings = []
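
For reference, a minimal sketch of the technique the removed "pca" dim-anneal branch implemented, reduced to a standalone helper: when the dimension mask is about to shrink, the active output rows of the projection head (and, if present, the SGD momentum buffer for that weight) are rotated into the PCA basis of the recent embeddings, so the coordinates that stay active retain most of the variance. This sketch assumes a plain torch.nn.Linear head; pca_rotate_head and opt_state are illustrative names, not part of the tsimcne API.

    # Illustrative sketch, not part of the patch.
    import numpy as np
    import torch
    from sklearn.decomposition import PCA


    def pca_rotate_head(layer, embeddings, keep, opt_state=None):
        """Rotate the active output rows of `layer` into the PCA basis of `embeddings`.

        embeddings : (n_samples, n_active) array of the layer's recent outputs.
        keep       : number of output dimensions that remain active after annealing.
        After the rotation, masking the output to its first `keep` coordinates
        keeps the directions of largest variance in the embeddings.
        """
        n_active = embeddings.shape[1]
        pca = PCA(n_components=keep).fit(embeddings)       # components_: (keep, n_active)

        w = layer.weight.detach().cpu().float().numpy()    # (out_dim, in_dim)
        rotated = pca.components_ @ w[:n_active]           # (keep, in_dim)
        new_w = np.vstack([rotated, w[keep:]])             # rows >= keep stay untouched

        with torch.no_grad():
            layer.weight.copy_(torch.from_numpy(new_w).to(layer.weight.dtype))

        # Rotate the momentum buffer the same way, mirroring the removed code,
        # so the optimizer state stays consistent with the rotated weights.
        if opt_state is not None and "momentum_buffer" in opt_state:
            buf = opt_state["momentum_buffer"]
            m = buf.detach().cpu().float().numpy()
            new_m = np.vstack([pca.components_ @ m[:n_active], m[keep:]])
            buf.copy_(torch.from_numpy(new_m).to(buf.dtype))

Rotating the momentum buffer together with the weights is what kept SGD updates aligned with the new basis; without it the stale momentum would pull the rotated rows back toward their old orientation.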