Skip to content

Commit

Permalink
added optimizer argument
Browse files Browse the repository at this point in the history
  • Loading branch information
dportik committed Dec 27, 2019
1 parent cd86a2a commit 7db7e7f
Show file tree
Hide file tree
Showing 9 changed files with 131 additions and 21 deletions.
45 changes: 37 additions & 8 deletions Goodness_of_Fit/Optimize_Functions_GOF.py
Original file line number Diff line number Diff line change
Expand Up @@ -168,7 +168,7 @@ def write_log(outfile, model_name, rep_results, roundrep):

def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded,
reps=None, maxiters=None, folds=None, in_params=None,
in_upper=None, in_lower=None, param_labels=None):
in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin"):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) fs: spectrum object name
Expand All @@ -188,6 +188,7 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe
#(13) in_upper: a list of upper bound values
#(14) in_lower: a list of lower bound values
#(15) param_labels: list of labels for parameters that will be written to the output file to keep track of their order
#(16) optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method).
#--------------------------------------------------------------------------------------

#call function that determines if our params and bounds have been set or need to be generated for us
Expand All @@ -199,6 +200,9 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe
#start keeping track of time it takes to complete optimizations for this model
tb_round = datetime.now()

#optimizer dict
optdict = {"log":"BFGS method", "log_lbfgsb":"L-BFGS-B method", "log_fmin":"Nelder-Mead method", "log_powell":"Powell's method"}

# We need an output file that will store all summary info for each replicate, across rounds
outname = "{0}.{1}.optimized.txt".format(outfile,model_name)
with open(outname, 'a') as fh_out:
Expand Down Expand Up @@ -242,11 +246,32 @@ def Optimize_Routine_GOF(fs, pts, outfile, model_name, func, rounds, param_numbe
print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed])))

#optimize from perturbed parameters
params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
lower_bound=lower_bound, upper_bound=upper_bound,
verbose=1, maxiter=maxiters_list[r],
output_file = "{}.log.txt".format(model_name))
print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
#optimize from perturbed parameters
#Dispatch to the dadi optimizer selected by the `optimizer` argument. All
#branches share the same bounds, verbosity, maxiter, and log-file settings;
#only the underlying optimization algorithm differs.
#BUG FIX: previously every branch called dadi.Inference.optimize_log_fmin,
#so choosing "log", "log_lbfgsb", or "log_powell" silently ran Nelder-Mead.
if optimizer == "log_fmin":
    #Nelder-Mead simplex (the default)
    params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
                                                  lower_bound=lower_bound, upper_bound=upper_bound,
                                                  verbose=1, maxiter=maxiters_list[r],
                                                  output_file = "{}.log.txt".format(model_name))
elif optimizer == "log":
    #BFGS method
    params_opt = dadi.Inference.optimize_log(params_perturbed, fs, func_exec, pts,
                                             lower_bound=lower_bound, upper_bound=upper_bound,
                                             verbose=1, maxiter=maxiters_list[r],
                                             output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_lbfgsb":
    #L-BFGS-B method
    params_opt = dadi.Inference.optimize_log_lbfgsb(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_powell":
    #Powell's method
    params_opt = dadi.Inference.optimize_log_powell(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))

else:
    raise ValueError("\n\nERROR: Unrecognized optimizer option: {}\nPlease select from: log, log_lbfgsb, log_fmin, or log_powell.\n\n".format(optimizer))

print("\t\t\tOptimized parameters =[{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
print("\t\t\tOptimized using: {0} ({1})\n".format(optimizer, optdict[optimizer]))

#simulate the model with the optimized parameters
sim_model = func_exec(params_opt, fs.sample_sizes, pts)
Expand Down Expand Up @@ -351,7 +376,8 @@ def Optimize_Empirical(fs, pts, outfile, model_name, func, in_params, fs_folded=


def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded=True,
reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=None):
reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None,
param_labels=None, optimizer="log_fmin"):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) sim_number: the number of simulations to perform
Expand All @@ -367,6 +393,8 @@ def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_numb
# reps: a list of integers controlling the number of replicates in each of the optimization rounds
# maxiters: a list of integers controlling the maxiter argument in each of the optimization rounds
# folds: a list of integers controlling the fold argument when perturbing input parameter values
# param_labels: list of labels for parameters that will be written to the output file to keep track of their order
# optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method).
#--------------------------------------------------------------------------------------

#Define number of simulations to perform
Expand All @@ -389,7 +417,8 @@ def Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_numb
"\n{}\n============================================================================\n\n".format(outfile))

#optimize the simulated SFS
best_rep = Optimize_Routine_GOF(sim_fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps, maxiters, param_labels=param_labels)
best_rep = Optimize_Routine_GOF(sim_fs, pts, outfile, model_name, func, rounds, param_number, fs_folded,
reps, maxiters, param_labels=param_labels, optimizer=optimizer)
#best_rep list is [roundnum_repnum, log-likelihood, AIC, chi^2 test stat, theta, parameter values, sfs_sum]

#add result for this simulation to output file
Expand Down
5 changes: 4 additions & 1 deletion Goodness_of_Fit/Simulate_and_Optimize.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,7 @@ def sym_mig(params, ns, pts):
'''
We will use a function from the Optimize_Functions_GOF.py script:
Perform_Sims(sim_number, model_fs, pts, model_name, func, rounds, param_number, fs_folded,
reps=None, maxiters=None, folds=None)
reps=None, maxiters=None, folds=None, param_labels=None, optimizer="log_fmin")
Mandatory Arguments =
sim_number: the number of simulations to perform
Expand All @@ -201,6 +201,9 @@ def sym_mig(params, ns, pts):
reps: a list of integers controlling the number of replicates in each of the optimization rounds
maxiters: a list of integers controlling the maxiter argument in each of the optimization rounds
folds: a list of integers controlling the fold argument when perturbing input parameter values
param_labels: list of labels for parameters that will be written to the output file to keep track of their order
optimizer: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method),
"log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method).
'''

#**************
Expand Down
37 changes: 30 additions & 7 deletions Optimize_Functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -178,7 +178,7 @@ def write_log(outfile, model_name, rep_results, roundrep):

def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded=True,
reps=None, maxiters=None, folds=None, in_params=None,
in_upper=None, in_lower=None, param_labels=None):
in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin"):
#--------------------------------------------------------------------------------------
# Mandatory Arguments =
#(1) fs: spectrum object name
Expand All @@ -198,6 +198,7 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f
#(13) in_upper: a list of upper bound values
#(14) in_lower: a list of lower bound values
#(15) param_labels: a string, labels for parameters that will be written to the output file to keep track of their order
#(16) optimizer: a string, to select the optimizer. Choices include: log (BFGS method), log_lbfgsb (L-BFGS-B method), log_fmin (Nelder-Mead method), and log_powell (Powell's method).
#--------------------------------------------------------------------------------------

#call function that determines if our params and bounds have been set or need to be generated for us
Expand All @@ -211,6 +212,9 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f

#start keeping track of time it takes to complete optimizations for this model
tbr = datetime.now()

#optimizer dict
optdict = {"log":"BFGS method", "log_lbfgsb":"L-BFGS-B method", "log_fmin":"Nelder-Mead method", "log_powell":"Powell's method"}

# We need an output file that will store all summary info for each replicate, across rounds
outname = "{0}.{1}.optimized.txt".format(outfile, model_name)
Expand Down Expand Up @@ -256,12 +260,31 @@ def Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, f
print("\n\t\t\tStarting parameters = [{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_perturbed])))

#optimize from perturbed parameters
params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
lower_bound=lower_bound, upper_bound=upper_bound,
verbose=1, maxiter=maxiters_list[r],
output_file = "{}.log.txt".format(model_name))

print("\t\t\tOptimized parameters =[{}]\n".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
#Dispatch to the dadi optimizer selected by the `optimizer` argument. All
#branches share the same bounds, verbosity, maxiter, and log-file settings;
#only the underlying optimization algorithm differs.
#BUG FIX: previously every branch called dadi.Inference.optimize_log_fmin,
#so choosing "log", "log_lbfgsb", or "log_powell" silently ran Nelder-Mead.
if optimizer == "log_fmin":
    #Nelder-Mead simplex (the default)
    params_opt = dadi.Inference.optimize_log_fmin(params_perturbed, fs, func_exec, pts,
                                                  lower_bound=lower_bound, upper_bound=upper_bound,
                                                  verbose=1, maxiter=maxiters_list[r],
                                                  output_file = "{}.log.txt".format(model_name))
elif optimizer == "log":
    #BFGS method
    params_opt = dadi.Inference.optimize_log(params_perturbed, fs, func_exec, pts,
                                             lower_bound=lower_bound, upper_bound=upper_bound,
                                             verbose=1, maxiter=maxiters_list[r],
                                             output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_lbfgsb":
    #L-BFGS-B method
    params_opt = dadi.Inference.optimize_log_lbfgsb(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))
elif optimizer == "log_powell":
    #Powell's method
    params_opt = dadi.Inference.optimize_log_powell(params_perturbed, fs, func_exec, pts,
                                                    lower_bound=lower_bound, upper_bound=upper_bound,
                                                    verbose=1, maxiter=maxiters_list[r],
                                                    output_file = "{}.log.txt".format(model_name))
else:
    raise ValueError("\n\nERROR: Unrecognized optimizer option: {}\nPlease select from: log, log_lbfgsb, log_fmin, or log_powell.\n\n".format(optimizer))

print("\t\t\tOptimized parameters =[{}]".format(", ".join([str(numpy.around(x, 6)) for x in params_opt])))
print("\t\t\tOptimized using: {0} ({1})\n".format(optimizer, optdict[optimizer]))

#simulate the model with the optimized parameters
sim_model = func_exec(params_opt, fs.sample_sizes, pts)
Expand Down
35 changes: 33 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -36,10 +36,19 @@ The most current version of this pipeline is designed to run in Python 3 and req

What's new in version 3.1+?

Version 3.1.0:

+ All scripts have been upgraded to Python 3 (tested with 3.7), allowing compatibility with dadi v2.
+ Prints parameter labels and perturbed starting parameters to screen for each replicate. This allows quick visual comparisons of the initial and final optimized parameters among replicates.

Version 3.1.1:
+ The 2D island model set has been revised and updated, fixing issues with parameter assignment.
+ Issue causing crash when referencing optimized params list is fixed.

Version 3.1.2:
+ Issue causing crash when referencing optimized params list is fixed. Compatible with dadi v2.0.5.

Version 3.1.3:
+ Added option to select which optimizer to use. Includes the following functions from the dadi `Inference.py` module: optimize_log (BFGS method), optimize_log_lbfgsb (L-BFGS-B method), optimize_log_fmin (Nelder-Mead method), and optimize_log_powell (Powell's method). For usage details please see section: [Default Optimization Routine Settings](#DOR).

## **Dadi Pipeline Overview** <a name="PO"></a>

Expand Down Expand Up @@ -72,7 +81,7 @@ I will show several ways to use the main function for model fitting to highlight

We will always use the following function from the `Optimize_Functions.py` script, which requires some explanation:

***Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=" ")***
***Optimize_Routine(fs, pts, outfile, model_name, func, rounds, param_number, fs_folded, reps=None, maxiters=None, folds=None, in_params=None, in_upper=None, in_lower=None, param_labels=None, optimizer="log_fmin")***

***Mandatory Arguments:***

Expand All @@ -94,6 +103,7 @@ We will use always use the following function from the `Optimize_Functions.py` s
+ **in_upper**: a list of upper bound values
+ **in_lower**: a list of lower bound values
+ **param_labels**: list of labels for parameters that will be written to the output file to keep track of their order
+ **optimizer**: a string, to select the optimizer. Choices include: "log" (BFGS method), "log_lbfgsb" (L-BFGS-B method), "log_fmin" (Nelder-Mead method), and "log_powell" (Powell's method).

The mandatory arguments must always be included when using the ***Optimize_Routine*** function, and the arguments must be provided in the exact order listed above (also known as positional arguments). The optional arguments can be included in any order after the required arguments, and are referred to by their name, followed by an equal sign, followed by a value (example: `reps = 4`). The usage is explained in the following examples.

Expand Down Expand Up @@ -267,6 +277,27 @@ based on the number of rounds selected:
| maxiter | 5 | 5 | 5 | 5 | 5 |
| fold | 3 | 3 | 3 | 2 | 1 |


The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument
with any of the following choices:

+ log - Optimize log(params) to fit model to data using the BFGS method.
+ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method.
+ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.**
+ log_powell - Optimize log(params) to fit model to data using Powell's method.

In Example 1 above, we could use the L-BFGS-B method instead by using the following command:

```
Optimize_Functions.Optimize_Routine(fs, pts, prefix, "sym_mig", sym_mig, 3, 4, fs_folded=True, optimizer="log_lbfgsb")
```

Or we could use Powell's method with the following command:
```
Optimize_Functions.Optimize_Routine(fs, pts, prefix, "sym_mig", sym_mig, 3, 4, fs_folded=True, optimizer="log_powell")
```


## **Why Perform Multiple Rounds of Optimizations?** <a name="WMR"></a>

When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In the 2D, 3D, and custom workflows of *dadi_pipeline*, the default starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round.
Expand Down
9 changes: 9 additions & 0 deletions Three_Population_Pipeline/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,15 @@ of the arguments across rounds are summarized in the table below.

If you change the number of rounds, you have to change the list length of the reps, maxiters, and folds arguments to match.

The default optimizer used is the Nelder-Mead method (`Inference.py` function `optimize_log_fmin`). This can be changed by supplying the optional optimizer argument
with any of the following choices:

+ log - Optimize log(params) to fit model to data using the BFGS method.
+ log_lbfgsb - Optimize log(params) to fit model to data using the L-BFGS-B method.
+ log_fmin - Optimize log(params) to fit model to data using Nelder-Mead. **This is the default option.**
+ log_powell - Optimize log(params) to fit model to data using Powell's method.


## Why Perform Multiple Rounds of Optimizations?

When fitting demographic models, it is important to perform multiple runs and ensure that final optimizations are converging on a similar log-likelihood score. In this workflow, the starting parameters used for all replicates in first round are random. After each round is completed, the parameters of the best scoring replicate from the previous round are then used to generate perturbed starting parameters for the replicates of the subsequent round. This optimization strategy of focusing the parameter search space improves the log-likelihood scores and generally results in convergence in the final round.
Expand Down
Loading

0 comments on commit 7db7e7f

Please sign in to comment.