-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy path00_6_0_permute_data.py
105 lines (79 loc) · 2.93 KB
/
00_6_0_permute_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# %% [markdown]
# # Permute features in data
# %%
from pathlib import Path
from typing import Union, List
import numpy as np
import pimmslearn
import pimmslearn.analyzers.analyzers
from pimmslearn.utils import create_random_df
# Project logger; the start-up message states what this script does
# (the previous text described splitting data and diagnostic plots, a different task).
logger = pimmslearn.logging.setup_nb_logger()
logger.info("Permute features in data")
# %%
# Toy example: every column is overwritten with 0..len-1 so a shuffle is easy to spot.
t = create_random_df(N=10, M=3).apply(lambda col: np.arange(len(col)))
t
# %%
# Unseeded generator: each column is permuted separately; sorting by one column
# afterwards shows that the other columns no longer line up with it.
rng = np.random.default_rng()
(
    t.apply(rng.permutation)
    .sort_values(by='feat_0')
)
# %%
# Catch passed parameters: record which global names exist *before* the
# parameters cell below is executed.
# `args = None` binds the name first, so 'args' itself is already part of the
# snapshot taken on the following line.
# NOTE(review): presumably pimmslearn.nb.get_params treats every global that is
# missing from this snapshot as a parameter - confirm against its implementation.
args = None
args = dict(globals()).keys()  # keys view over a *copy* of globals(); later definitions do not show up in it
# %% tags=["parameters"]
FN_INTENSITIES: str = 'data/dev_datasets/df_intensities_proteinGroups_long/Q_Exactive_HF_X_Orbitrap_6070.pkl' # Sample (rows) intensities for features (columns)
index_col: Union[str, int] = 0 # Can be either a string or position (typically 0 for first column), or a list of these.
column_names: List[str] = ["Gene Names"] # Manually set column names (of Index object in columns)
out_folder: str = '' # Output folder for permuted data, optional. Default is to save with suffix '_permuted' in same folder as input data
random_seed: int = 42 # Random seed for reproducibility
# NOTE(review): column_names and file_format are not referenced in the rest of this script as shown;
# the loading/saving format is derived from the extension of FN_INTENSITIES - confirm whether they are still needed.
file_format: str = 'pkl'
# %%
# Hand the snapshot of pre-existing global names (`args`) and the current globals()
# to the project helper that gathers the parameter values.
# NOTE(review): assumed to return a dict of the globals defined since the snapshot -
# confirm in pimmslearn.nb.get_params.
args = pimmslearn.nb.get_params(args, globals=globals())
args
# %%
# Wrap the collected parameters in the project's Config object; the cells below
# read them as attributes (args.FN_INTENSITIES, args.index_col, args.random_seed).
args = pimmslearn.nb.Config().from_dict(args)
args
# %%
# A single column label (str) or position (int) is wrapped in a one-element
# list so that index_col is always an iterable.
if isinstance(args.index_col, (str, int)):
    args.overwrite_entry('index_col', [args.index_col])
args.index_col
# %% [markdown]
# ## Raw data
# %% [markdown]
# process arguments
# %%
logger.info(f"{args.FN_INTENSITIES = }")
# Extension of the input file without the leading dot; it selects the loader below.
FILE_EXT = Path(args.FN_INTENSITIES).suffix[1:]
# File extension -> name of the AnalyzePeptides attribute that is looked up to load the data.
FILE_FORMAT_TO_CONSTRUCTOR_IN = dict(
    csv='from_csv',
    pkl='from_pickle',
    pickle='from_pickle',
)
logger.info(f"File format (extension): {FILE_EXT} (!specifies data loading function!)")
# %%
# Look up the loader matching the file extension (e.g. AnalyzePeptides.from_csv)
# and read the input file with the configured index column(s).
constructor = getattr(pimmslearn.analyzers.analyzers.AnalyzePeptides,
                      FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT])
analysis = constructor(fname=args.FN_INTENSITIES, index_col=args.index_col)
# %%
# Preview: first 10 rows, first 5 columns
analysis.df.head(10).iloc[:, :5]
# %%
# Seeded generator, so the shuffle is reproducible for a given random_seed.
rng = np.random.default_rng(args.random_seed)
# apply() calls rng.permutation once per column: the values of every feature are
# shuffled across rows, separately for each column.
# NOTE(review): assumes analysis.df is a wide DataFrame (samples in rows, features in columns).
df = analysis.df.apply(rng.permutation)
# Preview: first 10 rows, first 5 columns
df.iloc[:10].iloc[:, :5]
# %%
# File extension -> name of the pandas DataFrame writer method, so the permuted
# data is stored in the same format the input was read from.
FILE_FORMAT_TO_CONSTRUCTOR = {'csv': 'to_csv',
                              'pkl': 'to_pickle',
                              'pickle': 'to_pickle',
                              }
# Index directly (as done for the loader table) instead of .get(): an unknown
# extension then raises a KeyError naming it, not a TypeError from getattr(df, None).
method = getattr(df, FILE_FORMAT_TO_CONSTRUCTOR[FILE_EXT])
# Default target: the input path with 'permuted' appended to the file name.
fname = Path(pimmslearn.utils.append_to_filepath(args.FN_INTENSITIES, 'permuted'))
if args.out_folder:
    # Honour the optional out_folder parameter: same file name, different folder.
    fname = Path(args.out_folder) / fname.name
    fname.parent.mkdir(parents=True, exist_ok=True)
logger.info(f"Save permuted data to: {fname}")
method(fname)
# %%
# Round-trip check: re-load the permuted file that was just written (fname),
# not the original input, to confirm it can be read back with the same loader.
constructor = getattr(
    pimmslearn.analyzers.analyzers.AnalyzePeptides,
    FILE_FORMAT_TO_CONSTRUCTOR_IN[FILE_EXT])  # e.g. AnalyzePeptides.from_csv
analysis = constructor(fname=fname,
                       index_col=args.index_col,
                       )