Merge pull request #22 from michellab/feature-improved-calc-set
Feature Improved CalcSet
Roy-Haolin-Du authored Jan 2, 2025
2 parents 95f61d2 + 8ae9ac7 commit 2b1479d
Showing 9 changed files with 257 additions and 57 deletions.
2 changes: 1 addition & 1 deletion a3fe/_version.py
@@ -1 +1 @@
__version__ = "0.2.1"
__version__ = "0.3.0"
2 changes: 1 addition & 1 deletion a3fe/data/example_calc_set/input/exp_dgs.csv
@@ -1,3 +1,3 @@
calc_base_dir,name,exp_dg,exp_er,calc_cor
t4l,t4l,-9.06,0.5,0
mdm2_pip2_short,mdm2_pip2_short,-2.93,0.5,0
t4l,t4l,-9.06,0.5,0
52 changes: 38 additions & 14 deletions a3fe/read/_read_exp_dgs.py
@@ -3,18 +3,25 @@

import os as _os

from typing import Optional as _Optional

import pandas as _pd
import numpy as _np


def read_exp_dgs(dgs_file: str, base_dir: str) -> _pd.DataFrame:
"""Read the experimental free energy changes into a pandas dataframe
def read_exp_dgs(
dgs_file: _Optional[str] = None, base_dir: _Optional[str] = None
) -> _pd.DataFrame:
"""
Read the experimental free energy changes into a pandas dataframe. If
the dgs file is not supplied, return an empty dataframe.
Parameters
----------
dgs_file : str
dgs_file : str, optional, default=None
Path to the experimental free energy changes file.
base_dir: str
base_dir: str, optional, default=None
The base directory to interpret the relative paths in the dgs file
with respect to.
@@ -24,17 +31,34 @@ def read_exp_dgs(dgs_file: str, base_dir: str) -> _pd.DataFrame:
Dataframe containing the experimental free energy changes
"""
required_columns = ["calc_base_dir", "exp_dg", "exp_er", "calc_cor"]
exp_df = _pd.read_csv(dgs_file, index_col=1) # Use the names in the index col

# Check that we have the required columns
if list(exp_df.columns) != required_columns:
raise ValueError(
f"The experimental values file must have the columns {required_columns} but has the columns {exp_df.columns}"
# Make sure that base_dir is supplied if dgs_file is supplied
if dgs_file is not None and base_dir is None:
raise ValueError("If dgs_file is supplied, base_dir must also be supplied")

# If the dgs file is not supplied, create an empty dataframe
if dgs_file is None:
results_df = _pd.DataFrame(columns=required_columns)

else:
# Read the dgs file
results_df = _pd.read_csv(
dgs_file, index_col=1
) # Use the names in the index col

# Check that we have the required columns
if list(results_df.columns) != required_columns:
raise ValueError(
f"The experimental values file must have the columns {required_columns} but has the columns {results_df.columns}"
)

# Convert the paths to absolute paths relative to the dgs file
results_df["calc_base_dir"] = results_df["calc_base_dir"].apply(
lambda x: _os.path.abspath(_os.path.join(base_dir, x))
)

# Convert the paths to absolute paths relative to the dgs file
exp_df["calc_base_dir"] = exp_df["calc_base_dir"].apply(
lambda x: _os.path.abspath(_os.path.join(base_dir, x))
)
# Add columns for the calculated free energies
results_df["calc_dg"] = _np.nan
results_df["calc_er"] = _np.nan

return exp_df
return results_df
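
A minimal sketch of how the reworked read_exp_dgs is expected to behave after this change. The import path (a3fe.read._read_exp_dgs) and the file paths are assumptions used only for illustration:

from a3fe.read._read_exp_dgs import read_exp_dgs

# With no arguments, an empty results dataframe is returned; the calc_dg and
# calc_er columns are appended alongside the required columns.
empty = read_exp_dgs()
print(list(empty.columns))
# ['calc_base_dir', 'exp_dg', 'exp_er', 'calc_cor', 'calc_dg', 'calc_er']

# Supplying a dgs file without a base directory now raises a ValueError.
try:
    read_exp_dgs(dgs_file="input/exp_dgs.csv")
except ValueError as err:
    print(err)

# The original usage is unchanged: relative paths in the csv are resolved
# against base_dir ("." here is only illustrative).
results = read_exp_dgs(dgs_file="input/exp_dgs.csv", base_dir=".")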
4 changes: 2 additions & 2 deletions a3fe/run/_simulation_runner.py
@@ -321,7 +321,7 @@ def setup(self) -> None:

def analyse(
self,
slurm: bool = False,
slurm: bool = True,
run_nos: _Optional[_List[int]] = None,
subsampling=False,
fraction: float = 1,
@@ -334,7 +334,7 @@ def analyse(
Parameters
----------
slurm : bool, optional, default=False
slurm : bool, optional, default=True
Whether to use slurm for the analysis.
run_nos : List[int], Optional, default=None
A list of the run numbers to analyse. If None, all runs are analysed.
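
Because the default for slurm has flipped to True, scripts that previously relied on local analysis must now opt out explicitly. A short sketch, assuming a3fe exposes Calculation at the top level, that Calculation inherits this analyse signature, and that base_dir points at an existing calculation (all assumptions, not taken from this diff):

import a3fe as a3

calc = a3.Calculation(base_dir="example_calculation")  # hypothetical directory

# New default: the analysis is submitted through slurm.
calc.analyse()

# Pass slurm=False explicitly to keep the previous local-analysis behaviour.
calc.analyse(slurm=False)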
199 changes: 168 additions & 31 deletions a3fe/run/calc_set.py
@@ -163,6 +163,73 @@ def setup(
)
raise e

def get_optimal_lam_vals(
self,
simtime: float = 0.1,
er_type: str = "root_var",
delta_er: float = 1,
set_relative_sim_cost: bool = True,
reference_sim_cost: float = 0.21,
run_nos: _List[int] = [1],
) -> None:
"""
Determine the optimal lambda windows for each stage of each leg of each
calculation by running short simulations at each lambda value and analysing them,
using only a single run. Optionally, determine the simulation cost
and recursively set the relative simulation cost according to reference_sim_cost.
Parameters
----------
simtime : float, Optional, default: 0.1
The length of the short simulations to run, in ns. If None is provided,
it is assumed that the simulations have already been run and the
optimal lambda values are extracted from the output files.
er_type: str, Optional, default="root_var"
Whether to integrate the standard error of the mean ("sem") or root
variance of the gradients ("root_var") to calculate the optimal
lambda values.
delta_er : float, default=1
If er_type == "root_var", the desired integrated root variance of the gradients
between each lambda value, in kcal mol^(-1). If er_type == "sem", the
desired integrated standard error of the mean of the gradients between each lambda
value, in kcal mol^(-1) ns^(1/2). A sensible default for root_var is 1 kcal mol^(-1),
and 0.1 kcal mol^(-1) ns^(1/2) for sem.
set_relative_sim_cost: bool, optional, default=True
Whether to recursively set the relative simulation cost for the leg and all
sub simulation runners according to the mean simulation cost of the leg.
reference_sim_cost: float, optional, default=0.21
The reference simulation cost to use if set_relative_sim_cost is True, in hr / ns.
The default of 0.21 is the average bound leg simulation cost from a test set of ligands
spanning a range of system sizes on RTX 2080s. This is used to set the relative simulation
cost according to average_sim_cost / reference_sim_cost.
run_nos : List[int], optional, default=[1]
The run numbers to use for the calculation. Only 1 is run by default, so by default
we only analyse 1. If using er_type = "sem", more than one run must be specified.
Returns
-------
None
"""
self._logger.info(
"Determining optimal lambda windows for each stage of every calculation..."
)

for calc in self.calcs:
self._logger.info(
f"Determining optimal lambda windows for {calc.base_dir}..."
)
calc.get_optimal_lam_vals(
simtime=simtime,
er_type=er_type,
delta_er=delta_er,
set_relative_sim_cost=set_relative_sim_cost,
reference_sim_cost=reference_sim_cost,
run_nos=run_nos,
)

# Save state
self._dump()

def run(
self,
run_nos: _Optional[_List[int]] = None,
@@ -235,27 +302,78 @@ def run(
self._logger.error(f"Error running calculation in {calc.base_dir}: {e}")
raise e

def analyse(self, exp_dgs_path: str, offset: bool) -> None:
def analyse(
self,
exp_dgs_path: _Optional[str] = None,
offset: bool = False,
compare_to_exp: bool = True,
reanalyse: bool = False,
slurm: bool = False,
run_nos: _Optional[_List[int]] = None,
subsampling=False,
fraction: float = 1,
plot_rmsds: bool = False,
) -> None:
"""
Analyse all calculations in the set and plot the
free energy changes with respect to experiment.
Analyse all calculations in the set and, if the experimental free
energies are provided, plot the free energy changes with respect
to experiment.
Parameters
----------
exp_dgs_path : str
exp_dgs_path : str, Optional, default = None
The path to the file containing the experimental free energy
changes. This must be a csv file with the columns:
calc_base_dir, name, exp_dg, exp_er, calc_cor
offset: bool
offset: bool, default = False
If True, the calculated dGs will be offset to match the average
experimental free energies.
compare_to_exp : bool, optional, default=True
Whether to compare the calculated free energies to experimental
free energies. If False, only the calculated free energies will
be analysed. If True, correlation statistics and plots will be
generated.
reanalyse : bool, optional, default=False
Whether to reanalyse the data. If False, any existing data will be used
(e.g. if you have already analysed some calculations). If True, the
data will be reanalysed using the options provided. Reanalysis is useful
when you have changed the analysis options and want to apply them to
existing data.
slurm : bool, optional, default=False
Whether to use slurm for the analysis.
run_nos : List[int], Optional, default=None
A list of the run numbers to analyse. If None, all runs are analysed.
subsampling: bool, optional, default=False
If True, the free energy will be calculated by subsampling using
the methods contained within pymbar.
fraction: float, optional, default=1
The fraction of the data to use for analysis. For example, if
fraction=0.5, only the first half of the data will be used for
analysis. If fraction=1, all data will be used. Note that unequilibrated
data is discarded from the beginning of simulations in all cases.
plot_rmsds: bool, optional, default=False
Whether to plot RMSDs. This is slow, so it defaults to False.
"""
# Read the experimental dGs into a pandas dataframe and add the extra
# columns needed for the calculated values
if compare_to_exp:
# Make sure that the experimental dGs are provided
if not exp_dgs_path:
raise ValueError(
"Experimental free energy changes must be provided to compare to experimental values"
)

all_dgs = _read_exp_dgs(exp_dgs_path, self.base_dir)
all_dgs["calc_dg"] = _np.nan
all_dgs["calc_er"] = _np.nan

# If experimental dgs are not provided, populate the base dir column
if exp_dgs_path is None:
all_dgs["calc_base_dir"] = self.calc_paths
# In the absence of supplied names (in the experimental dGs file),
# use the base dir name (the stem of the path)
all_dgs["name"] = [
calc_path.split("/")[-1] for calc_path in self.calc_paths
]
all_dgs.set_index("name", inplace=True)
all_dgs["calc_cor"] = 0

# Get the calculated dGs
for calc in self.calcs:
@@ -269,9 +387,16 @@ def analyse(self, exp_dgs_path: str, offset: bool) -> None:
f"Found {len(name)} rows matching {calc.base_dir} in experimental dGs file"
)

# Carry out MBAR analysis if it has not been done already
if calc._delta_g is None:
calc.analyse()
# Carry out MBAR analysis if it has not been done already,
# or if reanalysis is requested
if calc._delta_g is None or reanalyse:
calc.analyse(
slurm=slurm,
run_nos=run_nos,
subsampling=subsampling,
fraction=fraction,
plot_rmsds=plot_rmsds,
)

# Get the confidence interval
mean_free_energy = _np.mean(calc._delta_g)
@@ -294,24 +419,36 @@
# Save results including NaNs
all_dgs.to_csv(_os.path.join(self.output_dir, "results_summary.txt"), sep=",")

# Exclude rows with NaN
all_dgs.dropna(inplace=True)

# Offset the results if required
if offset:
shift = all_dgs["exp_dg"].mean() - all_dgs["calc_dg"].mean()
all_dgs["calc_dg"] += shift

# Calculate statistics and save results
stats = _compute_stats(all_dgs)
name = "overall_stats_offset.txt" if offset else "overall_stats.txt"
with open(_os.path.join(self.output_dir, name), "wt") as ofile:
for stat in stats:
ofile.write(
f"{stat}: {stats[stat][0]:.2f} ({stats[stat][1]:.2f}, {stats[stat][2]:.2f})\n"
# Calculate statistics and plot if experimental dGs are provided
if compare_to_exp:
# Check that we have experimental dGs for all calculations
if all_dgs["exp_dg"].isnull().any():
raise ValueError(
"Experimental free energy changes must be provided for all "
"calculations if comparing to experimental values"
)

# Plot
_plt_against_exp(
all_results=all_dgs, output_dir=self.output_dir, offset=offset, stats=stats
)
# Exclude rows with NaN
all_dgs.dropna(inplace=True)

# Offset the results if required
if offset:
shift = all_dgs["exp_dg"].mean() - all_dgs["calc_dg"].mean()
all_dgs["calc_dg"] += shift

# Calculate statistics and save results
stats = _compute_stats(all_dgs)
name = "overall_stats_offset.txt" if offset else "overall_stats.txt"
with open(_os.path.join(self.output_dir, name), "wt") as ofile:
for stat in stats:
ofile.write(
f"{stat}: {stats[stat][0]:.2f} ({stats[stat][1]:.2f}, {stats[stat][2]:.2f})\n"
)

# Plot
_plt_against_exp(
all_results=all_dgs,
output_dir=self.output_dir,
offset=offset,
stats=stats,
)
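
Taken together, the new CalcSet methods support an end-to-end workflow like the one sketched below. The constructor arguments and the wait() call are assumptions based on the rest of the a3fe API rather than on this diff, and the directory and file names are placeholders:

import a3fe as a3

calc_set = a3.CalcSet(base_dir="example_calc_set")
calc_set.setup()

# New in this PR: choose optimal lambda windows for every calculation from
# short test simulations, and set each relative simulation cost according to
# average_sim_cost / reference_sim_cost (0.21 hr/ns is the documented default).
calc_set.get_optimal_lam_vals(simtime=0.1, er_type="root_var", delta_er=1)

calc_set.run()
calc_set.wait()  # assumed helper for blocking until the runs finish

# Also new: analyse without experimental data ...
calc_set.analyse(compare_to_exp=False)

# ... or reanalyse against experiment, forwarding the per-calculation options.
calc_set.analyse(
    exp_dgs_path="input/exp_dgs.csv",
    offset=False,
    reanalyse=True,
    slurm=False,
)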
40 changes: 34 additions & 6 deletions a3fe/tests/test_calc_set.py
@@ -4,9 +4,26 @@

import pytest

import pandas as pd


def test_calc_set_analysis(calc_set):
"""Test the analysis method of the CalcSet class."""
# Make sure an error is raised if the exp dgs file is not supplied
with pytest.raises(ValueError):
calc_set.analyse(exp_dgs_path=None)

# Run the analysis without the experimental data
calc_set.analyse(compare_to_exp=False)

# First, check that we have expected outputs in the output directory
output_dir = os.path.join(calc_set.base_dir, "output")
results_no_exp_path = os.path.join(output_dir, "results_summary.txt")
assert os.path.exists(results_no_exp_path)
results_no_exp = pd.read_csv(results_no_exp_path, index_col=0)

# Repeat the analysis with the experimental data and check we get
# the same results, and that the results are as expected
exp_dgs_path = os.path.join(calc_set.base_dir, "input/exp_dgs.csv")
calc_set.analyse(exp_dgs_path=exp_dgs_path, offset=False)

@@ -18,9 +35,20 @@ def test_calc_set_analysis(calc_set):
assert os.path.exists(os.path.join(output_dir, "results_summary.txt"))

# Then, check that the results in results_summary.txt are as expected
with open(os.path.join(output_dir, "results_summary.txt")) as f:
lines = f.readlines()
assert float(lines[1].split(",")[-2]) == pytest.approx(5.0378, abs=1e-2)
assert float(lines[1].split(",")[-1]) == pytest.approx(0.1501, abs=1e-2)
assert float(lines[2].split(",")[-2]) == pytest.approx(8.4956, abs=1e-2)
assert float(lines[2].split(",")[-1]) == pytest.approx(0.0935, abs=1e-2)
results_exp = pd.read_csv(
os.path.join(output_dir, "results_summary.txt"), index_col=0
)
# Check that the results are the same once the experimental data is dropped
assert results_no_exp.drop(columns=["exp_dg", "exp_er"]).equals(
results_exp.drop(columns=["exp_dg", "exp_er"])
)

# Regression test for the results
assert results_exp.loc["t4l", "calc_dg"] == pytest.approx(5.0378, abs=1e-2)
assert results_exp.loc["t4l", "calc_er"] == pytest.approx(0.1501, abs=1e-2)
assert results_exp.loc["mdm2_pip2_short", "calc_dg"] == pytest.approx(
8.4956, abs=1e-2
)
assert results_exp.loc["mdm2_pip2_short", "calc_er"] == pytest.approx(
0.0935, abs=1e-2
)
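
A minimal sketch for exercising the updated test locally, assuming the repository's test fixtures (including the calc_set fixture) resolve on the machine running it:

import pytest

# Run only the CalcSet analysis test, verbosely.
pytest.main(["a3fe/tests/test_calc_set.py", "-k", "test_calc_set_analysis", "-v"])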