Merge pull request #22 from michellab/feature-improved-calc-set
Feature Improved CalcSet
Roy-Haolin-Du authored Jan 2, 2025
2 parents 95f61d2 + 8ae9ac7 commit 2b1479d
Showing 9 changed files with 257 additions and 57 deletions.
2 changes: 1 addition & 1 deletion a3fe/_version.py
@@ -1 +1 @@
__version__ = "0.2.1"
__version__ = "0.3.0"
2 changes: 1 addition & 1 deletion a3fe/data/example_calc_set/input/exp_dgs.csv
@@ -1,3 +1,3 @@
calc_base_dir,name,exp_dg,exp_er,calc_cor
t4l,t4l,-9.06,0.5,0
mdm2_pip2_short,mdm2_pip2_short,-2.93,0.5,0
t4l,t4l,-9.06,0.5,0
52 changes: 38 additions & 14 deletions a3fe/read/_read_exp_dgs.py
@@ -3,18 +3,25 @@

import os as _os

from typing import Optional as _Optional

import pandas as _pd
import numpy as _np


def read_exp_dgs(dgs_file: str, base_dir: str) -> _pd.DataFrame:
"""Read the experimental free energy changes into a pandas dataframe
def read_exp_dgs(
dgs_file: _Optional[str] = None, base_dir: _Optional[str] = None
) -> _pd.DataFrame:
"""
Read the experimental free energy changes into a pandas dataframe. If
the dgs file is not supplied, return an empty dataframe.
Parameters
----------
dgs_file : str
dgs_file : str, optional, default=None
Path to the experimental free energy changes file.
base_dir: str
base_dir: str, optional, default=None
The base directory to interpret the relative paths in the dgs file
with respect to.
@@ -24,17 +31,34 @@ def read_exp_dgs(dgs_file: str, base_dir: str) -> _pd.DataFrame:
Dataframe containing the experimental free energy changes
"""
required_columns = ["calc_base_dir", "exp_dg", "exp_er", "calc_cor"]
exp_df = _pd.read_csv(dgs_file, index_col=1) # Use the names in the index col

# Check that we have the required columns
if list(exp_df.columns) != required_columns:
raise ValueError(
f"The experimental values file must have the columns {required_columns} but has the columns {exp_df.columns}"
# Make sure that base_dir is supplied if dgs_file is supplied
if dgs_file is not None and base_dir is None:
raise ValueError("If dgs_file is supplied, base_dir must also be supplied")

# If the dgs file is not supplied, create an empty dataframe
if dgs_file is None:
results_df = _pd.DataFrame(columns=required_columns)

else:
# Read the dgs file
results_df = _pd.read_csv(
dgs_file, index_col=1
) # Use the names in the index col

# Check that we have the required columns
if list(results_df.columns) != required_columns:
raise ValueError(
f"The experimental values file must have the columns {required_columns} but has the columns {results_df.columns}"
)

# Convert the paths to absolute paths relative to the dgs file
results_df["calc_base_dir"] = results_df["calc_base_dir"].apply(
lambda x: _os.path.abspath(_os.path.join(base_dir, x))
)

# Convert the paths to absolute paths relative to the dgs file
exp_df["calc_base_dir"] = exp_df["calc_base_dir"].apply(
lambda x: _os.path.abspath(_os.path.join(base_dir, x))
)
# Add columns for the calculated free energies
results_df["calc_dg"] = _np.nan
results_df["calc_er"] = _np.nan

return exp_df
return results_df
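
A minimal sketch of how the reworked read_exp_dgs is expected to behave after this change. The import path (a3fe.read._read_exp_dgs) and the file paths are assumptions used only for illustration:

from a3fe.read._read_exp_dgs import read_exp_dgs

# With no arguments, an empty results dataframe is returned; the calc_dg and
# calc_er columns are appended alongside the required columns.
empty = read_exp_dgs()
print(list(empty.columns))
# ['calc_base_dir', 'exp_dg', 'exp_er', 'calc_cor', 'calc_dg', 'calc_er']

# Supplying a dgs file without a base directory now raises a ValueError.
try:
    read_exp_dgs(dgs_file="input/exp_dgs.csv")
except ValueError as err:
    print(err)

# The original usage is unchanged: relative paths in the csv are resolved
# against base_dir ("." here is only illustrative).
results = read_exp_dgs(dgs_file="input/exp_dgs.csv", base_dir=".")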
4 changes: 2 additions & 2 deletions a3fe/run/_simulation_runner.py
@@ -321,7 +321,7 @@ def setup(self) -> None:

def analyse(
self,
slurm: bool = False,
slurm: bool = True,
run_nos: _Optional[_List[int]] = None,
subsampling=False,
fraction: float = 1,
@@ -334,7 +334,7 @@ def analyse(
Parameters
----------
slurm : bool, optional, default=False
slurm : bool, optional, default=True
Whether to use slurm for the analysis.
run_nos : List[int], Optional, default=None
A list of the run numbers to analyse. If None, all runs are analysed.
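
Because the default for slurm has flipped to True, scripts that previously relied on local analysis must now opt out explicitly. A short sketch, assuming a3fe exposes Calculation at the top level, that Calculation inherits this analyse signature, and that base_dir points at an existing calculation (all assumptions, not taken from this diff):

import a3fe as a3

calc = a3.Calculation(base_dir="example_calculation")  # hypothetical directory

# New default: the analysis is submitted through slurm.
calc.analyse()

# Pass slurm=False explicitly to keep the previous local-analysis behaviour.
calc.analyse(slurm=False)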
199 changes: 168 additions & 31 deletions a3fe/run/calc_set.py
@@ -163,6 +163,73 @@ def setup(
)
raise e

def get_optimal_lam_vals(
self,
simtime: float = 0.1,
er_type: str = "root_var",
delta_er: float = 1,
set_relative_sim_cost: bool = True,
reference_sim_cost: float = 0.21,
run_nos: _List[int] = [1],
) -> None:
"""
Determine the optimal lambda windows for each stage of each leg of each
calculation by running short simulations at each lambda value and analysing them,
using only a single run. Optionally, determine the simulation cost
and recursively set the relative simulation cost according to reference_sim_cost.
Parameters
----------
simtime : float, Optional, default: 0.1
The length of the short simulations to run, in ns. If None is provided,
it is assumed that the simulations have already been run and the
optimal lambda values are extracted from the output files.
er_type: str, Optional, default="root_var"
Whether to integrate the standard error of the mean ("sem") or root
variance of the gradients ("root_var") to calculate the optimal
lambda values.
delta_er : float, default=1
If er_type == "root_var", the desired integrated root variance of the gradients
between each lambda value, in kcal mol^(-1). If er_type == "sem", the
desired integrated standard error of the mean of the gradients between each lambda
value, in kcal mol^(-1) ns^(1/2). A sensible default for root_var is 1 kcal mol^(-1),
and 0.1 kcal mol^(-1) ns^(1/2) for sem.
set_relative_sim_cost: bool, optional, default=True
Whether to recursively set the relative simulation cost for the leg and all
sub simulation runners according to the mean simulation cost of the leg.
reference_sim_cost: float, optional, default=0.21
The reference simulation cost to use if set_relative_sim_cost is True, in hr / ns.
The default of 0.21 is the average bound leg simulation cost from a test set of ligands
spanning a range of system sizes on RTX 2080s. This is used to set the relative simulation
cost according to average_sim_cost / reference_sim_cost.
run_nos : List[int], optional, default=[1]
The run numbers to use for the calculation. Only 1 is run by default, so by default
we only analyse 1. If using er_type = "sem", more than one run must be specified.
Returns
-------
None
"""
self._logger.info(
"Determining optimal lambda windows for each stage of every calculation..."
)

for calc in self.calcs:
self._logger.info(
f"Determining optimal lambda windows for {calc.base_dir}..."
)
calc.get_optimal_lam_vals(
simtime=simtime,
er_type=er_type,
delta_er=delta_er,
set_relative_sim_cost=set_relative_sim_cost,
reference_sim_cost=reference_sim_cost,
run_nos=run_nos,
)

# Save state
self._dump()

def run(
self,
run_nos: _Optional[_List[int]] = None,
@@ -235,27 +302,78 @@ def run(
self._logger.error(f"Error running calculation in {calc.base_dir}: {e}")
raise e

def analyse(self, exp_dgs_path: str, offset: bool) -> None:
def analyse(
self,
exp_dgs_path: _Optional[str] = None,
offset: bool = False,
compare_to_exp: bool = True,
reanalyse: bool = False,
slurm: bool = False,
run_nos: _Optional[_List[int]] = None,
subsampling=False,
fraction: float = 1,
plot_rmsds: bool = False,
) -> None:
"""
Analyse all calculations in the set and plot the
free energy changes with respect to experiment.
Analyse all calculations in the set and, if the experimental free
energies are provided, plot the free energy changes with respect
to experiment.
Parameters
----------
exp_dgs_path : str
exp_dgs_path : str, Optional, default = None
The path to the file containing the experimental free energy
changes. This must be a csv file with the columns:
calc_base_dir, name, exp_dg, exp_er, calc_cor
offset: bool
offset: bool, default = False
If True, the calculated dGs will be offset to match the average
experimental free energies.
compare_to_exp : bool, optional, default=True
Whether to compare the calculated free energies to experimental
free energies. If False, only the calculated free energies will
be analysed. If True, correlation statistics and plots will be
generated.
reanalyse : bool, optional, default=False
Whether to reanalyse the data. If False, any existing data will be used
(e.g. if you have already analysed some calculations). If True, the
data will be reanalysed using the options provided. Reanalysis is useful
when you have changed the analysis options and want to apply them to
existing data.
slurm : bool, optional, default=False
Whether to use slurm for the analysis.
run_nos : List[int], Optional, default=None
A list of the run numbers to analyse. If None, all runs are analysed.
subsampling: bool, optional, default=False
If True, the free energy will be calculated by subsampling using
the methods contained within pymbar.
fraction: float, optional, default=1
The fraction of the data to use for analysis. For example, if
fraction=0.5, only the first half of the data will be used for
analysis. If fraction=1, all data will be used. Note that unequilibrated
data is discarded from the beginning of simulations in all cases.
plot_rmsds: bool, optional, default=False
Whether to plot RMSDs. This is slow, so it defaults to False.
"""
# Read the experimental dGs into a pandas dataframe and add the extra
# columns needed for the calculated values
if compare_to_exp:
# Make sure that the experimental dGs are provided
if not exp_dgs_path:
raise ValueError(
"Experimental free energy changes must be provided to compare to experimental values"
)

all_dgs = _read_exp_dgs(exp_dgs_path, self.base_dir)
all_dgs["calc_dg"] = _np.nan
all_dgs["calc_er"] = _np.nan

# If experimental dgs are not provided, populate the base dir column
if exp_dgs_path is None:
all_dgs["calc_base_dir"] = self.calc_paths
# In the absence of supplied names (in the experimental dGs file),
# use the base dir name (the stem of the path)
all_dgs["name"] = [
calc_path.split("/")[-1] for calc_path in self.calc_paths
]
all_dgs.set_index("name", inplace=True)
all_dgs["calc_cor"] = 0

# Get the calculated dGs
for calc in self.calcs:
@@ -269,9 +387,16 @@ def analyse(self, exp_dgs_path: str, offset: bool) -> None:
f"Found {len(name)} rows matching {calc.base_dir} in experimental dGs file"
)

# Carry out MBAR analysis if it has not been done already
if calc._delta_g is None:
calc.analyse()
# Carry out MBAR analysis if it has not been done already,
# or if reanalysis is requested
if calc._delta_g is None or reanalyse:
calc.analyse(
slurm=slurm,
run_nos=run_nos,
subsampling=subsampling,
fraction=fraction,
plot_rmsds=plot_rmsds,
)

# Get the confidence interval
mean_free_energy = _np.mean(calc._delta_g)
@@ -294,24 +419,36 @@
# Save results including NaNs
all_dgs.to_csv(_os.path.join(self.output_dir, "results_summary.txt"), sep=",")

# Exclude rows with NaN
all_dgs.dropna(inplace=True)

# Offset the results if required
if offset:
shift = all_dgs["exp_dg"].mean() - all_dgs["calc_dg"].mean()
all_dgs["calc_dg"] += shift

# Calculate statistics and save results
stats = _compute_stats(all_dgs)
name = "overall_stats_offset.txt" if offset else "overall_stats.txt"
with open(_os.path.join(self.output_dir, name), "wt") as ofile:
for stat in stats:
ofile.write(
f"{stat}: {stats[stat][0]:.2f} ({stats[stat][1]:.2f}, {stats[stat][2]:.2f})\n"
# Calculate statistics and plot if experimental dGs are provided
if compare_to_exp:
# Check that we have experimental dGs for all calculations
if all_dgs["exp_dg"].isnull().any():
raise ValueError(
"Experimental free energy changes must be provided for all "
"calculations if comparing to experimental values"
)

# Plot
_plt_against_exp(
all_results=all_dgs, output_dir=self.output_dir, offset=offset, stats=stats
)
# Exclude rows with NaN
all_dgs.dropna(inplace=True)

# Offset the results if required
if offset:
shift = all_dgs["exp_dg"].mean() - all_dgs["calc_dg"].mean()
all_dgs["calc_dg"] += shift

# Calculate statistics and save results
stats = _compute_stats(all_dgs)
name = "overall_stats_offset.txt" if offset else "overall_stats.txt"
with open(_os.path.join(self.output_dir, name), "wt") as ofile:
for stat in stats:
ofile.write(
f"{stat}: {stats[stat][0]:.2f} ({stats[stat][1]:.2f}, {stats[stat][2]:.2f})\n"
)

# Plot
_plt_against_exp(
all_results=all_dgs,
output_dir=self.output_dir,
offset=offset,
stats=stats,
)
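
Taken together, the new CalcSet methods support an end-to-end workflow like the one sketched below. The constructor arguments and the wait() call are assumptions based on the rest of the a3fe API rather than on this diff, and the directory and file names are placeholders:

import a3fe as a3

calc_set = a3.CalcSet(base_dir="example_calc_set")
calc_set.setup()

# New in this PR: choose optimal lambda windows for every calculation from
# short test simulations, and set each relative simulation cost according to
# average_sim_cost / reference_sim_cost (0.21 hr/ns is the documented default).
calc_set.get_optimal_lam_vals(simtime=0.1, er_type="root_var", delta_er=1)

calc_set.run()
calc_set.wait()  # assumed helper for blocking until the runs finish

# Also new: analyse without experimental data ...
calc_set.analyse(compare_to_exp=False)

# ... or reanalyse against experiment, forwarding the per-calculation options.
calc_set.analyse(
    exp_dgs_path="input/exp_dgs.csv",
    offset=False,
    reanalyse=True,
    slurm=False,
)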
40 changes: 34 additions & 6 deletions a3fe/tests/test_calc_set.py
@@ -4,9 +4,26 @@

import pytest

import pandas as pd


def test_calc_set_analysis(calc_set):
"""Test the analysis method of the CalcSet class."""
# Make sure an error is raised if the exp dgs file is not supplied
with pytest.raises(ValueError):
calc_set.analyse(exp_dgs_path=None)

# Run the analysis without the experimental data
calc_set.analyse(compare_to_exp=False)

# First, check that we have expected outputs in the output directory
output_dir = os.path.join(calc_set.base_dir, "output")
results_no_exp_path = os.path.join(output_dir, "results_summary.txt")
assert os.path.exists(results_no_exp_path)
results_no_exp = pd.read_csv(results_no_exp_path, index_col=0)

# Repeat the analysis with the experimental data and check we get
# the same results, and that the results are as expected
exp_dgs_path = os.path.join(calc_set.base_dir, "input/exp_dgs.csv")
calc_set.analyse(exp_dgs_path=exp_dgs_path, offset=False)

@@ -18,9 +35,20 @@ def test_calc_set_analysis(calc_set):
assert os.path.exists(os.path.join(output_dir, "results_summary.txt"))

# Then, check that the results in results_summary.txt are as expected
with open(os.path.join(output_dir, "results_summary.txt")) as f:
lines = f.readlines()
assert float(lines[1].split(",")[-2]) == pytest.approx(5.0378, abs=1e-2)
assert float(lines[1].split(",")[-1]) == pytest.approx(0.1501, abs=1e-2)
assert float(lines[2].split(",")[-2]) == pytest.approx(8.4956, abs=1e-2)
assert float(lines[2].split(",")[-1]) == pytest.approx(0.0935, abs=1e-2)
results_exp = pd.read_csv(
os.path.join(output_dir, "results_summary.txt"), index_col=0
)
# Check that the results are the same once the experimental data is dropped
assert results_no_exp.drop(columns=["exp_dg", "exp_er"]).equals(
results_exp.drop(columns=["exp_dg", "exp_er"])
)

# Regression test for the results
assert results_exp.loc["t4l", "calc_dg"] == pytest.approx(5.0378, abs=1e-2)
assert results_exp.loc["t4l", "calc_er"] == pytest.approx(0.1501, abs=1e-2)
assert results_exp.loc["mdm2_pip2_short", "calc_dg"] == pytest.approx(
8.4956, abs=1e-2
)
assert results_exp.loc["mdm2_pip2_short", "calc_er"] == pytest.approx(
0.0935, abs=1e-2
)
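
A minimal sketch for exercising the updated test locally, assuming the repository's test fixtures (including the calc_set fixture) resolve on the machine running it:

import pytest

# Run only the CalcSet analysis test, verbosely.
pytest.main(["a3fe/tests/test_calc_set.py", "-k", "test_calc_set_analysis", "-v"])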