-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdata_generation.py
54 lines (41 loc) · 1.29 KB
/
data_generation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
# %%
import numpy as np
import pandas as pd
from data_generator import get_y_generator
# %%
def generate_data(N, D, EXP_NO, NOISE_SIGMA):
    """Simulate covariates, two outcome columns, a binary draw and an observed y.

    The returned frame holds, in order: ``x1`` .. ``xD``, ``base``,
    ``effect``, ``y_noise``, ``y0``, ``y1``, ``p(z=1)``, ``z`` and ``y``.

    Args:
        N: Number of samples drawn for every random column.
        D: Number of covariate columns. Odd-numbered columns are drawn from
            a normal(0, 1), even-numbered ones from a binomial(1, 0.5).
        EXP_NO: Passed to ``get_y_generator`` to select the generator class.
        NOISE_SIGMA: Forwarded to the generator as ``noise_sigma``.

    Returns:
        A ``pandas.DataFrame`` containing the columns listed above.
    """
    # Generate Xs.
    # Build the frame in one step: inserting columns one at a time fragments
    # the frame when D is large. The comprehension runs in column order, so
    # the sequence of random draws is the same as a plain loop over 1..D.
    covariates = {
        "x{}".format(i): (
            np.random.normal(0, 1, size=N)
            if i % 2 != 0
            else np.random.binomial(1, 0.5, size=N)
        )
        for i in range(1, D + 1)
    }
    df = pd.DataFrame(covariates)

    # Generate base, effect and noise from the experiment-specific generator.
    gen = get_y_generator(EXP_NO)(Xs=df, noise_sigma=NOISE_SIGMA)
    df["base"] = gen.get_base()
    df["effect"] = gen.get_effect()
    df["y_noise"] = gen.get_y_noise()

    # Generate y0, y1: base shifted by minus/plus half of the effect.
    df["y0"] = df["base"] - 0.5 * df["effect"]
    df["y1"] = df["base"] + 0.5 * df["effect"]

    # Generate p(z=1) = sigmoid(y0), then z.
    # exp() is only ever applied to a non-positive number here, so it cannot
    # overflow to inf and turn the ratio into NaN when y0 is large; a NaN
    # probability would silently force z to 0.
    y0 = df["y0"]
    decay = np.exp(-np.abs(y0))
    df["p(z=1)"] = np.where(
        y0 >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay)
    )
    p_rand = np.random.rand(N)
    df["z"] = (p_rand <= df["p(z=1)"]).astype(int)

    # Observable y (with noise): y1 where z == 1, y0 otherwise.
    df["y"] = np.where(df["z"] == 1, df["y1"], df["y0"]) + df["y_noise"]

    return df
# %%
if __name__ == "__main__":
    # %% Const
    N, D = 1000, 20
    EXP_NO, NOISE_SIGMA = 1, 0.5
    # Generate one dataset and write it to CSV without the index column.
    data = generate_data(N=N, D=D, EXP_NO=EXP_NO, NOISE_SIGMA=NOISE_SIGMA)
    data.to_csv("data.csv", index=False)