Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH expect pd.DataFrame as input of FaDIn.fit #15

Open
wants to merge 4 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions fadin/solver.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,8 +252,12 @@ def fit(self, events, end_time):

Parameters
----------
events : list of array of size number of timestamps,
list size is self.n_dim.
events : pd.DataFrame
The events to infer the Hawkes Process's parameters.
The event should be formatted as a pd.DataFrame, with columns:
- `'time'` or index: to represent the time of the event.
- `'type'`: to annotate which event type the event belongs to.
- `'mark'` (optional): to represent the mark of the event.

end_time : int
The end time of the Hawkes process.
Expand Down
46 changes: 38 additions & 8 deletions fadin/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -68,16 +68,46 @@ def check_params(list_params, number_params):

def projected_grid(events, grid_step, n_grid):
"""Project the events on the defined grid.

Parameters
----------
events : pd.DataFrame
The events to infer the Hawkes Process's parameters.
The event should be formatted as a pd.DataFrame, with columns:
- `'time'` or index: to represent the time of the event.
- `'type'`: to annotate which event type the event belongs to.
- `'mark'` (optional): to represent the mark of the event.

grid_step : float
The step of the grid.

n_grid : int
The number of grid points.

Returns
-------
events_grid : torch.Tensor
The events projected on the grid.
"""
n_dim = len(events)
# size_discret = int(1 / grid_step)
# compute time of the event on the grid
events['time_g'] = (events['time'] / grid_step).round().astype(int)

# Compute sum of the marks, or number of events at each time in the grid
if 'mark' in events.columns:
events = events.groupby(['type', 'time_g'])['mark'].sum()
else:
events = events.groupby(['type', 'time_g']).size()

# Make sure the resulting DataFrame has the right columns
events.name = "mark_sum"
events = events.reset_index()

# Initialize the grid and fill it with events
n_dim = events['type'].nunique()
events_grid = torch.zeros(n_dim, n_grid)
for i in range(n_dim):
ei_torch = torch.tensor(events[i])
temp = torch.round(ei_torch / grid_step).long() # * grid_step
# temp2 = torch.round(temp * size_discret)
idx, data = np.unique(temp, return_counts=True)
events_grid[i, idx] += torch.tensor(data)

i, idx = events['type'].values, events['time_g'].values
events_grid[i, idx] += torch.tensor(events["mark_sum"])

return events_grid

Expand Down
140 changes: 39 additions & 101 deletions fadin/utils/utils_simu.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,36 @@
import numpy as np
from scipy.optimize import minimize_scalar
from scipy.interpolate import interp1d
from scipy.integrate import simps
import scipy.stats as stats
import pandas as pd

import scipy
from scipy import stats


def find_max(intensity_function, duration):
"""Find the maximum intensity of a function."""
res = minimize_scalar(lambda x: -intensity_function(x), bounds=(0, duration))
res = scipy.optimize.minimize_scalar(
lambda x: -intensity_function(x), bounds=(0, duration)
)
return -res.fun


def to_pandas(events):

return pd.DataFrame({
'time': np.concatenate(events),
'type': np.concatenate([
[i] * len(evi) for i, evi in enumerate(events)
])
})


def from_pandas(events):
assert 'mark' not in events.columns
return [
events.query("type == @i")['time'].values
for i in events['type'].unique()
]


def check_random_state(seed):
"""Turn seed into a np.random.RandomState instance.
If seed is None, return the RandomState singleton used by np.random.
Expand Down Expand Up @@ -84,7 +104,7 @@ def __init__(self, custom_density, params=dict(), kernel_length=10):
# function to normalise the pdf over chosen domain

def normalisation(self, x):
return simps(self.pdf(x), x)
return scipy.integrate.simps(self.pdf(x), x)

def create_cdf_ppf(self):
# define normalization support with the given kernel length
Expand All @@ -98,8 +118,10 @@ def create_cdf_ppf(self):
# make sure cdf bounded on [0,1]
my_cdf = my_cdf / my_cdf[-1]
# create cdf and ppf
func_cdf = interp1d(discrete_time, my_cdf)
func_ppf = interp1d(my_cdf, discrete_time, fill_value='extrapolate')
func_cdf = scipy.interpolate.interp1d(discrete_time, my_cdf)
func_ppf = scipy.interpolate.interp1d(
my_cdf, discrete_time, fill_value='extrapolate'
)
return func_cdf, func_ppf

# pdf function for averaged normals
Expand Down Expand Up @@ -151,7 +173,7 @@ def simu_poisson(end_time, intensity, upper_bound=None, random_state=None):
assert isinstance(intensity, (int, float))
n_events = rng.poisson(lam=intensity*end_time, size=1)
events = np.sort(rng.uniform(0, end_time, size=n_events))
return events
return to_pandas(events)

if upper_bound is None:
upper_bound = find_max(intensity, end_time)
Expand All @@ -164,7 +186,7 @@ def simu_poisson(end_time, intensity, upper_bound=None, random_state=None):
# ogata's thinning algorithm
accepted = intensity(ev_x) > ev_y
events = np.sort(ev_x[accepted])
return events
return to_pandas(events)


def simu_multi_poisson(end_time, intensity, upper_bound=None, random_state=None):
Expand Down Expand Up @@ -206,7 +228,7 @@ def simu_multi_poisson(end_time, intensity, upper_bound=None, random_state=None)
for i in range(n_dim):
evs = np.sort(rng.uniform(0, end_time, size=n_events[i]))
events.append(evs)
return events
return to_pandas(events)

if upper_bound is None:
upper_bound = np.zeros(n_dim)
Expand All @@ -221,7 +243,7 @@ def simu_multi_poisson(end_time, intensity, upper_bound=None, random_state=None)

events.append(np.sort(ev_xi[accepted_i]))

return events
return to_pandas(events)


def simu_hawkes_cluster(end_time, baseline, alpha, kernel,
Expand Down Expand Up @@ -281,6 +303,7 @@ def simu_hawkes_cluster(end_time, baseline, alpha, kernel,
immigrants = simu_multi_poisson(end_time, baseline,
upper_bound=upper_bound,
random_state=random_state)
immigrants = from_pandas(immigrants)
gen = dict(gen0=immigrants)
events = immigrants.copy()

Expand Down Expand Up @@ -317,7 +340,7 @@ def simu_hawkes_cluster(end_time, baseline, alpha, kernel,

it += 1

return events
return to_pandas(events)


def simu_hawkes_thinning(end_time, baseline, alpha, kernel,
Expand Down Expand Up @@ -346,8 +369,8 @@ def simu_hawkes_thinning(end_time, baseline, alpha, kernel,
kernel: str
The choice of the kernel for the simulation.
Kernel available are probability distribution from scipy.stats module.
Note that this function will automatically convert the scipy kernel to a
finite support kernel of size 'kernel_length'.
Note that this function will automatically convert the scipy kernel to
a finite support kernel of size 'kernel_length'.

kernel_length: int
Length of kernels in the Hawkes process.
Expand Down Expand Up @@ -402,89 +425,4 @@ def simu_hawkes_thinning(end_time, baseline, alpha, kernel,
del events[i][-1]
events[i] = np.array(events[i])

return events


# def simu_hawkes(end_time, baseline, alpha, kernel, upper_bound=None,
# random_state=None):
# """ Simulate a Hawkes process following an immigration-birth procedure.
# Edge effects may be reduced according to the second references below.

# References:

# Møller, J., & Rasmussen, J. G. (2006). Approximate simulation of Hawkes processes.
# Methodology and Computing in Applied Probability, 8, 53-64.

# Møller, J., & Rasmussen, J. G. (2005). Perfect simulation of Hawkes processes.
# Advances in applied probability, 37(3), 629-646.

# Parameters
# ----------
# end_time : int | float
# Duration of the Poisson process.

# baseline : float
# Baseline parameter of the Hawkes process.

# alpha : float
# Weight parameter associated to the kernel function.

# kernel: str
# The choice of the kernel for the simulation.
# Kernel available are probability distribution from scipy.stats module.

# upper_bound : int, float or None, default=None
# Upper bound of the baseline function. If None,
# the maximum of the baseline is taken onto a finite discrete grid.

# random_state : int, RandomState instance or None, default=None
# Set the numpy seed to 'random_state'.

# Returns
# -------
# events : array
# The timestamps of the point process' events.
# """
# if random_state is None:
# np.random.seed(0)
# else:
# np.random.seed(random_state)

# # Simulate events from baseline
# immigrants = simu_poisson(end_time, intensity=baseline, upper_bound=upper_bound)

# # initialize
# gen = dict(gen0=immigrants)
# ev = [immigrants]
# sample_from_kernel = getattr(scipy.stats, kernel) # dic = [4, 3]

# # generate offsprings
# it = 0
# while len(gen[f'gen{it}']):
# Ci = gen[f'gen{it}']

# # number of descendents of an event follow a Poisson law of parameter alpha
# # the expected number of event induced by an event is then alpha

# Di = np.random.poisson(lam=alpha, size=len(Ci))

# # sample interval range according to the given pdf (kernel of the parameter)
# Ci = np.repeat(Ci, repeats=Di)
# n = Di.sum()
# Ei = sample_from_kernel.rvs(size=n) # * dic
# Fi = Ci + Ei


# if len(Fi) > 0:
# gen[f'gen{it+1}'] = np.hstack(Fi)
# ev.append(np.hstack(Fi))
# else:
# break
# it += 1

# # Add immigrants and sort
# events = np.hstack(ev)
# valid_events = events < end_time
# events = np.sort(events[valid_events])

# return events
return to_pandas(events)
3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,8 @@ dependencies = [
"torch>=1.11",
"scipy>=1.8.0",
"numba>=0.55.2",
"matplotlib>=3.7.2"
"matplotlib>=3.7.2",
"pandas>=2.0"
]


Expand Down
Loading