Skip to content

Commit

Permalink
added optimizer argument
Browse files Browse the repository at this point in the history
  • Loading branch information
dportik committed Dec 27, 2019
1 parent cd86a2a commit 7db7e7f
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 21 deletions.
45 changes: 37 additions & 8 deletions Goodness_of_Fit/Optimize_Functions_GOF.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def write_log(outfile, model_name, rep_results, roundrep):

def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded,
reps=None, maxiters=None, folds=None, in_params=None,
in_upper=None, in_lower=None, param_labels=None):
in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin"):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) fs: spectrum object name
Expand All @@ -188,6 +188,7 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe
#(13) in_upper: a list of upper bound values
#(14) in_lower: a list of lower bound values
#(15) param_labels: list of labels for parameters that will be written to the output file to keep track of their order
#(16) optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method).
#--------------------------------------------------------------------------------------

#call function that determines if our params and bounds have been set or need to be generated for us
Expand All @@ -199,6 +200,9 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe
#start keeping track of time it takes to complete optimizations for this model
tb_round = datetime.now()

#optimizer dict
optdict = {"log":"BFGS method", "log_lbfgsb":"L-BFGS-B method", "log_fmin":"Nelder-Mead method", "log_powell":"Powell's method"}

# We need an output file that will store all summary info for each replicate, across rounds
outname = "{0}.{1}.optimized.txt".format(outfile,model_name)
with open(outname, 'a') as fh_out:
Expand Down Expand Up @@ -242,11 +246,32 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe
print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed])))

#optimize from perturbed parameters
params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
lower_bound=lower_bound, upper_bound=upper_bound,
verbose=1, maxiter=maxiters_list[r],
output_file = "{}.log.txt".format(model_name))
print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
#optimize from perturbed parameters
#Dispatch to the dadi optimizer selected by the `optimizer` argument. All
#branches share the same bounds, verbosity, maxiter, and log-file settings;
#only the underlying optimization algorithm differs.
#BUG FIX: previously every branch called dadi.Inference.optimize_log_fmin,
#so choosing "log", "log_lbfgsb", or "log_powell" silently ran Nelder-Mead.
if optimizer == "log_fmin":
    #Nelder-Mead simplex (the default)
    params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
                                                  lower_bound=lower_bound, upper_bound=upper_bound,
                                                  verbose=1, maxiter=maxiters_list[r],
                                                  output_file = "{}.log.txt".format(model_name))
elif optimizer == "log":
    #BFGS method
    params_opt = dadi.Inference.optimize_log(params_perturbed, fs, func_exec, pts,
                                             lower_bound=lower_bound, upper_bound=upper_bound,
                                             verbose=1, maxiter=maxiters_list[r],
                                             output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_lbfgsb":
    #L-BFGS-B method
    params_opt = dadi.Inference.optimize_log_lbfgsb(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_powell":
    #Powell's method
    params_opt = dadi.Inference.optimize_log_powell(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))

else:
    raise ValueError("\n\nERROR: Unrecognized optimizer option: {}\nPlease select from: log, log_lbfgsb, log_fmin, or log_powell.\n\n".format(optimizer))

print("\t\t\tOptimized parameters =[{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
print("\t\t\tOptimized using: {0} ({1})\n".format(optimizer, optdict[optimizer]))

#simulate the model with the optimized parameters
sim_model = func_exec(params_opt, fs.sample_sizes, pts)
Expand Down Expand Up @@ -351,7 +376,8 @@ def Optimize_Empirical(fs, pts, outfile, model_name, func, in_params, fs_folded=


def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded=True,
reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=None):
reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None,
param_labels=None, optimizer="log_fmin"):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) sim_number: the number of simulations to perform
Expand All @@ -367,6 +393,8 @@ def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_numb
# reps: a list of integers controlling the number of replicates in each of the optimization rounds
# maxiters: a list of integers controlling the maxiter argument in each of the optimization rounds
# folds: a list of integers controlling the fold argument when perturbing input parameter values
# param_labels: list of labels for parameters that will be written to the output file to keep track of their order
# optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method).
#--------------------------------------------------------------------------------------

#Define number of simulations to perform
Expand All @@ -389,7 +417,8 @@ def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_numb
"\n{}\n============================================================================\n\n".format(outfile))

#optimize the simulated SFS
best_rep = Optimize_Routine_GOF(sim_fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps, maxiters, param_labels=param_labels)
best_rep = Optimize_Routine_GOF(sim_fs, pts, outfile, model_name, func, rounds, param_number, fs_folded,
reps, maxiters, param_labels=param_labels, optimizer=optimizer)
#best_rep list is [roundnum_repnum, log-likelihood, AIC, chi^2 test stat, theta, parameter values, sfs_sum]

#add result for this simulation to output file
Expand Down
5 changes: 4 additions & 1 deletion Goodness_of_Fit/Simulate_and_Optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def sym_mig(params, ns, pts):
'''
We will use a function from the Optimize_Functions_GOF.py script:
Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded,
reps=None, maxiters=None, folds=None)
reps=None, maxiters=None, folds=None, param_labels=None, optimizer="log_fmin")
Mandatory Arguments =
sim_number: the number of simulations to perform
Expand All @@ -201,6 +201,9 @@ def sym_mig(params, ns, pts):
reps: a list of integers controlling the number of replicates in each of the optimization rounds
maxiters: a list of integers controlling the maxiter argument in each of the optimization rounds
folds: a list of integers controlling the fold argument when perturbing input parameter values
param_labels: list of labels for parameters that will be written to the output file to keep track of their order
optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method),
"log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method).
'''

#**************
Expand Down
37 changes: 30 additions & 7 deletions Optimize_Functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def write_log(outfile, model_name, rep_results, roundrep):

def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True,
reps=None, maxiters=None, folds=None, in_params=None,
in_upper=None, in_lower=None, param_labels=None):
in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin"):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) fs: spectrum object name
Expand All @@ -198,6 +198,7 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f
#(13) in_upper: a list of upper bound values
#(14) in_lower: a list of lower bound values
#(15) param_labels: a string, labels for parameters that will be written to the output file to keep track of their order
#(16) optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method).
#--------------------------------------------------------------------------------------

#call function that determines if our params and bounds have been set or need to be generated for us
Expand All @@ -211,6 +212,9 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f

#start keeping track of time it takes to complete optimizations for this model
tbr = datetime.now()

#optimizer dict
optdict = {"log":"BFGS method", "log_lbfgsb":"L-BFGS-B method", "log_fmin":"Nelder-Mead method", "log_powell":"Powell's method"}

# We need an output file that will store all summary info for each replicate, across rounds
outname = "{0}.{1}.optimized.txt".format(outfile, model_name)
Expand Down Expand Up @@ -256,12 +260,31 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f
print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed])))

#optimize from perturbed parameters
params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
lower_bound=lower_bound, upper_bound=upper_bound,
verbose=1, maxiter=maxiters_list[r],
output_file = "{}.log.txt".format(model_name))

print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
#Dispatch to the dadi optimizer selected by the `optimizer` argument. All
#branches share the same bounds, verbosity, maxiter, and log-file settings;
#only the underlying optimization algorithm differs.
#BUG FIX: previously every branch called dadi.Inference.optimize_log_fmin,
#so choosing "log", "log_lbfgsb", or "log_powell" silently ran Nelder-Mead.
if optimizer == "log_fmin":
    #Nelder-Mead simplex (the default)
    params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
                                                  lower_bound=lower_bound, upper_bound=upper_bound,
                                                  verbose=1, maxiter=maxiters_list[r],
                                                  output_file = "{}.log.txt".format(model_name))
elif optimizer == "log":
    #BFGS method
    params_opt = dadi.Inference.optimize_log(params_perturbed, fs, func_exec, pts,
                                             lower_bound=lower_bound, upper_bound=upper_bound,
                                             verbose=1, maxiter=maxiters_list[r],
                                             output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_lbfgsb":
    #L-BFGS-B method
    params_opt = dadi.Inference.optimize_log_lbfgsb(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_powell":
    #Powell's method
    params_opt = dadi.Inference.optimize_log_powell(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))
else:
    raise ValueError("\n\nERROR: Unrecognized optimizer option: {}\nPlease select from: log, log_lbfgsb, log_fmin, or log_powell.\n\n".format(optimizer))

print("\t\t\tOptimized parameters =[{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
print("\t\t\tOptimized using: {0} ({1})\n".format(optimizer, optdict[optimizer]))

#simulate the model with the optimized parameters
sim_model = func_exec(params_opt, fs.sample_sizes, pts)
Expand Down
35 changes: 33 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,19 @@ The most current version of this pipeline is designed to run in Python 3 and req

What's new in version 3.1+?

Version 3.1.0:

+ All scripts have been upgraded to Python 3 (tested with 3.7), allowing compatibility with dadi v2.
+ Prints parameter labels and perturbed starting parameters to screen for each replicate. This allows quick visual comparisons of the initial and final optimized parameters among replicates.

Version 3.1.1:
+ The 2D island model set has been revised and updated, fixing issues with parameter assignment.
+ Issue causing crash when referencing optimized params list is fixed.

Version 3.1.2:
+ Issue causing crash when referencing optimized params list is fixed. Compatible with dadi v2.0.5.

Version 3.1.3:
+ Added option to select which optimizer to use. Includes the following functions from the dadi `Inference.py` module: optimize_log (BFGS method), optimize_log_lbfgsb (L-BFGS-B method), optimize_log_fmin (Nelder-Mead method), and optimize_log_powell (Powell's method). For usage details please see section: [Default Optimization Routine Settings](#DOR).

## **Dadi Pipeline Overview** <a name="PO"></a>

Expand Down Expand Up @@ -72,7 +81,7 @@ I will show several ways to use the main function for model fitting to highlight

We will always use the following function from the `Optimize_Functions.py` script, which requires some explanation:

***Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=" ")***
***Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin")***

***Mandatory Arguments:***

Expand All @@ -94,6 +103,7 @@ We will use always use the following function from the `Optimize_Functions.py` s
+ **in_upper**: a list of upper bound values
+ **in_lower**: a list of lower bound values
+ **param_labels**: list of labels for parameters that will be written to the output file to keep track of their order
+ **optimizer**: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method).

The mandatory arguments must always be included when using the ***Optimize_Routine*** function, and the arguments must be provided in the exact order listed above (also known as positional arguments). The optional arguments can be included in any order after the required arguments, and are referred to by their name, followed by an equal sign, followed by a value (example: `reps = 4`). The usage is explained in the following examples.

Expand Down Expand Up @@ -267,6 +277,27 @@ based on the number of rounds selected:
| maxiter | 5 | 5 | 5 | 5 | 5 |
| fold | 3 | 3 | 3 | 2 | 1 |


The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument
with any of the following choices:

+ log - Optimize log(params) to fit model to data using the BFGS method.
+ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method.
+ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.**
+ log_powell - Optimize log(params) to fit model to data using Powell's method.

In Example 1 above, we could use the L-BFGS-B method instead by using the following command:

```
Optimize_Functions.Optimize_Routine(fs, pts, prefix, "sym_mig", sym_mig, 3, 4, fs_folded=True, optimizer="log_lbfgsb")
```

Or we could use Powell's method with the following command:
```
Optimize_Functions.Optimize_Routine(fs, pts, prefix, "sym_mig", sym_mig, 3, 4, fs_folded=True, optimizer="log_powell")
```


## **Why Perform Multiple Rounds of Optimizations?** <a name="WMR"></a>

When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In the 2D, 3D, and custom workflows of *dadi_pipeline*, the default starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round.
Expand Down
9 changes: 9 additions & 0 deletions Three_Population_Pipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ of the arguments across rounds are summarized in the table below.

If you change the number of rounds, you have to change the list length of the reps, maxiters, and folds arguments to match.

The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument
with any of the following choices:

+ log - Optimize log(params) to fit model to data using the BFGS method.
+ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method.
+ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.**
+ log_powell - Optimize log(params) to fit model to data using Powell's method.


## Why Perform Multiple Rounds of Optimizations?

When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In this workflow, the starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round.
Expand Down
Loading

0 comments on commit 7db7e7f

Please sign in to comment.