From 7db7e7f21434b2e4ce3a5b0630f3d4e5c1a759c3 Mon Sep 17 00:00:00 2001 From: dportik Date: Fri, 27 Dec 2019 13:32:22 -0800 Subject: [PATCH] added optimizer argument --- Goodness_of_Fit/Optimize_Functions_GOF.py | 45 ++++++++++++++++---- Goodness_of_Fit/Simulate_and_Optimize.py | 5 ++- Optimize_Functions.py | 37 +++++++++++++--- README.md | 35 ++++++++++++++- Three_Population_Pipeline/README.md | 9 ++++ Three_Population_Pipeline/dadi_Run_3D_Set.py | 4 +- Two_Population_Pipeline/README.md | 9 ++++ Two_Population_Pipeline/dadi_Run_2D_Set.py | 4 +- dadi_Run_Optimizations.py | 4 +- 9 files changed, 131 insertions(+), 21 deletions(-) diff --git a/Goodness_of_Fit/Optimize_Functions_GOF.py b/Goodness_of_Fit/Optimize_Functions_GOF.py index ed78cc6..b0f668b 100644 --- a/Goodness_of_Fit/Optimize_Functions_GOF.py +++ b/Goodness_of_Fit/Optimize_Functions_GOF.py @@ -168,7 +168,7 @@ def write_log(outfile, model_name, rep_results, roundrep): def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, - in_upper=None, in_lower=None, param_labels=None): + in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin"): #-------------------------------------------------------------------------------------- # Mandatory Arguments = #(1) fs: spectrum object name @@ -188,6 +188,7 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe #(13) in_upper: a list of upper bound values #(14) in_lower: a list of lower bound values #(15) param_labels: list of labels for parameters that will be written to the output file to keep track of their order + #(16) optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method). #-------------------------------------------------------------------------------------- #call function that determines if our params and bounds have been set or need to be generated for us @@ -199,6 +200,9 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe #start keeping track of time it takes to complete optimizations for this model tb_round = datetime.now() + #optimizer dict + optdict = {"log":"BFGS method", "log_lbfgsb":"L-BFGS-B method", "log_fmin":"Nelder-Mead method", "log_powell":"Powell's method"} + # We need an output file that will store all summary info for each replicate, across rounds outname = "{0}.{1}.optimized.txt".format(outfile,model_name) with open(outname, 'a') as fh_out: @@ -242,11 +246,32 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed]))) #optimize from perturbed parameters - params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, - lower_bound=lower_bound, upper_bound=upper_bound, - verbose=1, maxiter=maxiters_list[r], - output_file = "{}.log.txt".format(model_name)) - print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt]))) + if optimizer == "log_fmin": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + elif optimizer == "log": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + elif optimizer == "log_lbfgsb": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + elif optimizer == "log_powell": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + + else: + raise ValueError("\n\nERROR: Unrecognized optimizer option: {}\nPlease select from: log, log_lbfgsb, log_fmin, or log_powell.\n\n".format(optimizer)) + + print("\t\t\tOptimized parameters =[{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_opt]))) + print("\t\t\tOptimized using: {0} ({1})\n".format(optimizer, optdict[optimizer])) #simulate the model with the optimized parameters sim_model = func_exec(params_opt, fs.sample_sizes, pts) @@ -351,7 +376,8 @@ def Optimize_Empirical(fs, pts, outfile, model_name, func, in_params, fs_folded= def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded=True, - reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=None): + reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, + param_labels=None, optimizer="log_fmin"): #-------------------------------------------------------------------------------------- # Mandatory Arguments = #(1) sim_number: the number of simulations to perform @@ -367,6 +393,8 @@ def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_numb # reps: a list of integers controlling the number of replicates in each of the optimization rounds # maxiters: a list of integers controlling the maxiter argument in each of the optimization rounds # folds: a list of integers controlling the fold argument when perturbing input parameter values + # param_labels: list of labels for parameters that will be written to the output file to keep track of their order + # optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method). #-------------------------------------------------------------------------------------- #Define number of simulations to perform @@ -389,7 +417,8 @@ def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_numb "\n{}\n============================================================================\n\n".format(outfile)) #optimize the simulated SFS - best_rep = Optimize_Routine_GOF(sim_fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps, maxiters, param_labels=param_labels) + best_rep = Optimize_Routine_GOF(sim_fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, + reps, maxiters, param_labels=param_labels, optimizer=optimizer) #best_rep list is [roundnum_repnum, log-likelihood, AIC, chi^2 test stat, theta, parameter values, sfs_sum] #add result for this simulation to output file diff --git a/Goodness_of_Fit/Simulate_and_Optimize.py b/Goodness_of_Fit/Simulate_and_Optimize.py index 23433f8..c7e7f8d 100644 --- a/Goodness_of_Fit/Simulate_and_Optimize.py +++ b/Goodness_of_Fit/Simulate_and_Optimize.py @@ -185,7 +185,7 @@ def sym_mig(params, ns, pts): ''' We will use a function from the Optimize_Functions_GOF.py script: Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded, - reps=None, maxiters=None, folds=None) + reps=None, maxiters=None, folds=None, param_labels=None, optimizer="log_fmin") Mandatory Arguments = sim_number: the number of simulations to perform @@ -201,6 +201,9 @@ def sym_mig(params, ns, pts): reps: a list of integers controlling the number of replicates in each of the optimization rounds maxiters: a list of integers controlling the maxiter argument in each of the optimization rounds folds: a list of integers controlling the fold argument when perturbing input parameter values + param_labels: list of labels for parameters that will be written to the output file to keep track of their order + optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), + "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method). ''' #************** diff --git a/Optimize_Functions.py b/Optimize_Functions.py index 608b878..bd9d98a 100644 --- a/Optimize_Functions.py +++ b/Optimize_Functions.py @@ -178,7 +178,7 @@ def write_log(outfile, model_name, rep_results, roundrep): def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True, reps=None, maxiters=None, folds=None, in_params=None, - in_upper=None, in_lower=None, param_labels=None): + in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin"): #-------------------------------------------------------------------------------------- # Mandatory Arguments = #(1) fs: spectrum object name @@ -198,6 +198,7 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f #(13) in_upper: a list of upper bound values #(14) in_lower: a list of lower bound values #(15) param_labels: a string, labels for parameters that will be written to the output file to keep track of their order + #(16) optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method). #-------------------------------------------------------------------------------------- #call function that determines if our params and bounds have been set or need to be generated for us @@ -211,6 +212,9 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f #start keeping track of time it takes to complete optimizations for this model tbr = datetime.now() + + #optimizer dict + optdict = {"log":"BFGS method", "log_lbfgsb":"L-BFGS-B method", "log_fmin":"Nelder-Mead method", "log_powell":"Powell's method"} # We need an output file that will store all summary info for each replicate, across rounds outname = "{0}.{1}.optimized.txt".format(outfile, model_name) @@ -256,12 +260,31 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed]))) #optimize from perturbed parameters - params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, - lower_bound=lower_bound, upper_bound=upper_bound, - verbose=1, maxiter=maxiters_list[r], - output_file = "{}.log.txt".format(model_name)) - - print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt]))) + if optimizer == "log_fmin": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + elif optimizer == "log": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + elif optimizer == "log_lbfgsb": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + elif optimizer == "log_powell": + params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts, + lower_bound=lower_bound, upper_bound=upper_bound, + verbose=1, maxiter=maxiters_list[r], + output_file = "{}.log.txt".format(model_name)) + else: + raise ValueError("\n\nERROR: Unrecognized optimizer option: {}\nPlease select from: log, log_lbfgsb, log_fmin, or log_powell.\n\n".format(optimizer)) + + print("\t\t\tOptimized parameters =[{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_opt]))) + print("\t\t\tOptimized using: {0} ({1})\n".format(optimizer, optdict[optimizer])) #simulate the model with the optimized parameters sim_model = func_exec(params_opt, fs.sample_sizes, pts) diff --git a/README.md b/README.md index 7c9a64b..0e3d9d4 100644 --- a/README.md +++ b/README.md @@ -36,10 +36,19 @@ The most current version of this pipeline is designed to run in Python 3 and req What's new in version 3.1+? +Version 3.1.0: + + All scripts have been upgraded to Python 3 (tested with 3.7), allowing compatibility with dadi v2. + Prints parameter labels and perturbed starting parameters to screen for each replicate. This allows quick visual comparisons of the initial and final optimized parameters among replicates. + +Version 3.1.1: + The 2D island model set has been revised and updated, fixing issues with parameter assignment. -+ Issue causing crash when referencing optimized params list is fixed. + +Version 3.1.2: ++ Issue causing crash when referencing optimized params list is fixed. Compatible with dadi v2.0.5. + +Version 3.1.3: ++ Added option to select which optimizer to use. Includes the following functions from the dadi `Inferenence.py` module: optimize_log (BFGS method), optimize_log_lbfgsb (L-BFGS-B method), optimize_log_fmin (Nelder-Mead method), and optimize_log_powell (Powell's method). For usage details please see section: [Default Optimization Routine Settings](#DOR). ## **Dadi Pipeline Overview** @@ -72,7 +81,7 @@ I will show several ways to use the main function for model fitting to highlight We will use always use the following function from the `Optimize_Functions.py` script, which requires some explanation: -***Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=" ")*** +***Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin")*** ***Mandatory Arguments:*** @@ -94,6 +103,7 @@ We will use always use the following function from the `Optimize_Functions.py` s + **in_upper**: a list of upper bound values + **in_lower**: a list of lower bound values + **param_labels**: list of labels for parameters that will be written to the output file to keep track of their order ++ **optimizer**: optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method). The mandatory arguments must always be included when using the ***Optimize_Routine*** function, and the arguments must be provided in the exact order listed above (also known as positional arguments). The optional arguments can be included in any order after the required arguments, and are referred to by their name, followed by an equal sign, followed by a value (example: `reps = 4`). The usage is explained in the following examples. @@ -267,6 +277,27 @@ based on the number of rounds selected: | maxiter | 5 | 5 | 5 | 5 | 5 | | fold | 3 | 3 | 3 | 2 | 1 | + +The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument +with any of the following choices: + ++ log - Optimize log(params) to fit model to data using the BFGS method. ++ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method. ++ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.** ++ log_powell - Optimize log(params) to fit model to data using Powell's method. + +In Example 1 above, we could use the L-BFGS-B method instead by using the following command: + +``` +Optimize_Functions.Optimize_Routine(fs, pts, prefix, "sym_mig", sym_mig, 3, 4, fs_folded=True, optimizer="log_lbfgsb") +``` + +Or we could use Powell's method with the following command: +``` +Optimize_Functions.Optimize_Routine(fs, pts, prefix, "sym_mig", sym_mig, 3, 4, fs_folded=True, optimizer="log_powell") +``` + + ## **Why Perform Multiple Rounds of Optimizations?** When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In the 2D, 3D, and custom workflows of *dadi_pipeline*, the default starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round. diff --git a/Three_Population_Pipeline/README.md b/Three_Population_Pipeline/README.md index 9b5a75a..4324c8d 100644 --- a/Three_Population_Pipeline/README.md +++ b/Three_Population_Pipeline/README.md @@ -69,6 +69,15 @@ of the arguments across rounds are summarized in the table below. If you change the number of rounds, you have to change the list length of the reps, maxiters, and folds arguments to match. +The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument +with any of the following choices: + ++ log - Optimize log(params) to fit model to data using the BFGS method. ++ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method. ++ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.** ++ log_powell - Optimize log(params) to fit model to data using Powell's method. + + ## Why Perform Multiple Rounds of Optimizations? When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In this workflow, the starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round. diff --git a/Three_Population_Pipeline/dadi_Run_3D_Set.py b/Three_Population_Pipeline/dadi_Run_3D_Set.py index 820f01f..afb9f06 100644 --- a/Three_Population_Pipeline/dadi_Run_3D_Set.py +++ b/Three_Population_Pipeline/dadi_Run_3D_Set.py @@ -118,7 +118,7 @@ Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True, reps=None, maxiters=None, folds=None, in_params=None, - in_upper=None, in_lower=None, param_labels=" ") + in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin") Mandatory Arguments = fs: spectrum object name @@ -139,6 +139,8 @@ in_upper: a list of upper bound values in_lower: a list of lower bound values param_labels: list of labels for parameters that will be written to the output file to keep track of their order + optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), + "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method). Below, I give all the necessary information to call each model available in the Models_3D.py script. I have set the optimization routine to be the same for each diff --git a/Two_Population_Pipeline/README.md b/Two_Population_Pipeline/README.md index 696eb61..19a9878 100644 --- a/Two_Population_Pipeline/README.md +++ b/Two_Population_Pipeline/README.md @@ -94,6 +94,15 @@ of the arguments across rounds are summarized in the table below. If you change the number of rounds, you have to change the list length of the reps, maxiters, and folds arguments to match. +The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument +with any of the following choices: + ++ log - Optimize log(params) to fit model to data using the BFGS method. ++ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method. ++ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.** ++ log_powell - Optimize log(params) to fit model to data using Powell's method. + + ## Why Perform Multiple Rounds of Optimizations? When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In this workflow, the starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round. diff --git a/Two_Population_Pipeline/dadi_Run_2D_Set.py b/Two_Population_Pipeline/dadi_Run_2D_Set.py index c0cc036..95f1d16 100644 --- a/Two_Population_Pipeline/dadi_Run_2D_Set.py +++ b/Two_Population_Pipeline/dadi_Run_2D_Set.py @@ -126,7 +126,7 @@ Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True, reps=None, maxiters=None, folds=None, in_params=None, - in_upper=None, in_lower=None, param_labels=" ") + in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin") Mandatory Arguments = fs: spectrum object name @@ -147,6 +147,8 @@ in_upper: a list of upper bound values in_lower: a list of lower bound values param_labels: list of labels for parameters that will be written to the output file to keep track of their order + optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), + "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method). Below, I give all the necessary information to call each model available in the Models_2D.py script. I have set the optimization routine to be the same for each diff --git a/dadi_Run_Optimizations.py b/dadi_Run_Optimizations.py index f07384e..cef39b1 100644 --- a/dadi_Run_Optimizations.py +++ b/dadi_Run_Optimizations.py @@ -110,7 +110,7 @@ Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True, reps=None, maxiters=None, folds=None, in_params=None, - in_upper=None, in_lower=None, param_labels=" ") + in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin") Mandatory Arguments = fs: spectrum object name @@ -131,6 +131,8 @@ in_upper: a list of upper bound values in_lower: a list of lower bound values param_labels: list of labels for parameters that will be written to the output file to keep track of their order + optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), + "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method). '''