Source code for parrot.bayesian_optimization

"""
This file contains code for conducting Bayesian optimization.

.............................................................................
idptools-parrot was developed by the Holehouse lab
     Original release ---- 2020

Question/comments/concerns? Raise an issue on github:
https://github.com/idptools/parrot

Licensed under the MIT license. 
"""

import math

import numpy as np

try:
    import GPy
    import GPyOpt
    from GPyOpt.methods import BayesianOptimization
except ImportError:
    print('Error importing GPy.')
    print(' If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`')

from parrot import train_network
from parrot import brnn_architecture


[docs]class BayesianOptimizer(object): """A class for conducting Bayesian Optimization on a PyTorch RNN Sets up and runs GPy Bayesian Optimization in order to choose the best- performing hyperparameters for a RNN for a given machine learning task. Iteratively change learning rate, hidden vector size, and the number of layers in the network, then train and validating using 5-fold cross validation. Attributes ---------- cv_dataloaders : list of tuples of PyTorch DataLoader objects For each of the cross-val folds, a tuple containing a training set DataLoader and a validation set DataLoader. input_size : int Length of the amino acid encoding vectors n_epochs : int Number of epochs to train for each iteration of the algorithm n_classes : int Number of classes n_folds : int Number of cross-validation folds problem_type : str 'classification' or 'regression' dtype : str 'sequence' or 'residues' weights_file : str Path to which the network weights will be saved during training device : str 'cpu' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool If true, do not print updates to console bds : list of dicts GPy-compatible bounds for each of the hyperparameters to be optimized """ def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes, dtype, weights_file, max_iterations, device, silent): """ Parameters ---------- cv_dataloaders : list of tuples of PyTorch DataLoader objects For each of the cross-val folds, a tuple containing a training set DataLoader and a validation set DataLoader. input_size : int Length of the amino acid encoding vectors n_epochs : int Number of epochs to train for each iteration of the algorithm n_classes : int Number of classes dtype : str 'sequence' or 'residues' weights_file : str Path to which the network weights will be saved during training device : str 'cpu' or 'cuda' depending on system hardware max_iterations : int Maximum number of iterations to perform the optimization procedure silent : bool If true, do not print updates to console """ self.cv_loaders = cv_dataloaders self.input_size = input_size self.n_epochs = n_epochs self.n_folds = len(cv_dataloaders) self.n_classes = n_classes if n_classes > 1: self.problem_type = 'classification' else: self.problem_type = 'regression' self.dtype = dtype self.weights_file = weights_file self.max_iterations = max_iterations self.device = device self.silent = silent self.bds = [{'name': 'log_learning_rate', 'type': 'continuous', 'domain': (-5, -2)}, # 0.00001-0.01 {'name': 'n_layers', 'type': 'discrete', 'domain': tuple(range(1, 6))}, # 1-5 {'name': 'hidden_size', 'type': 'discrete', 'domain': tuple(range(5, 51))}] # 5-50
[docs] def compute_cv_loss(self, hyperparameters): """Compute the average cross-val loss for a given set of hyperparameters Given N sets of hyperparameters, determine the average cross-validation loss for BRNNs trained with these parameters. Parameters ---------- hyperparameters : numpy float array Each row corresponds to a set of hyperparameters, in the order: [log_learining_rate, n_layers, hidden_size] Returns ------- numpy float array a Nx1 numpy array of the average cross-val loss per set of input hyperparameters """ cv_outputs = np.zeros((len(hyperparameters), self.n_folds)) for i in range(len(hyperparameters)): log_lr, nl, hs = hyperparameters[i] lr = 10**float(log_lr) nl = int(nl) hs = int(hs) # Train and validate network with these hyperparams using k-fold CV cv_outputs[i] = self.eval_cv_brnns(lr, nl, hs) avg = np.average(cv_outputs[i]) if self.silent is False: print(' %.6f | %2d | %2d | %.3f' % (lr, nl, hs, avg)) outputs = np.average(cv_outputs, axis=1) return outputs
[docs] def eval_cv_brnns(self, lr, nl, hs): """Train and test a network with given parameters across all cross-val folds Parameters ---------- lr : float Learning rate of the network nl : int Number of hidden layers (for each direction) in the network hs : int Size of hidden vectors in the network Returns ------- numpy float array the best validation loss from each fold of cross validation """ cv_losses = np.zeros(self.n_folds) - 1 # -1 so that it's obvious if something goes wrong for k in range(self.n_folds): if self.dtype == 'sequence': # Use a many-to-one architecture brnn_network = brnn_architecture.BRNN_MtO(self.input_size, hs, nl, self.n_classes, self.device).to(self.device) else: # Use a many-to-many architecture brnn_network = brnn_architecture.BRNN_MtM(self.input_size, hs, nl, self.n_classes, self.device).to(self.device) # Train network with this set of hyperparameters train_losses, val_losses = train_network.train(brnn_network, self.cv_loaders[k][0], self.cv_loaders[k][1], self.dtype, self.problem_type, self.weights_file, stop_condition='iter', device=self.device, learn_rate=lr, n_epochs=self.n_epochs, silent=True) # Take best val loss best_val_loss = np.min(val_losses) cv_losses[k] = best_val_loss return cv_losses
[docs] def optimize(self): """Set up and run Bayesian Optimization on the BRNN using GPy Returns ------- list the best hyperparameters are chosen by Bayesian Optimization. Returned in the order: [lr, nl, hs] """ # Initial hyperparameter search -- used to get noise estimate x_init = np.array([[-3.0, 1, 20], [-3.0, 2, 20], [-3.0, 3, 20], [-3.0, 4, 20], [-3.0, 5, 20], [-2.0, 2, 20], [-3.3, 2, 20], [-4.0, 2, 20], [-5.0, 2, 20], [-3.0, 2, 5], [-3.0, 2, 15], [-3.0, 2, 35], [-3.0, 2, 50]]) y_init, noise = self.initial_search(x_init) if self.silent is False: print("\nInitial search results:") print("lr\tnl\ths\toutput") for i in range(len(x_init)): print("%.5f\t%2d\t%2d\t%.4f" % (10**x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0])) print("Noise estimate:", noise) print('\n') print('Primary optimization:') print('--------------------\n') print('Learning rate | n_layers | hidden vector size | avg CV loss ') print('======================================================================') optimizer = BayesianOptimization(f=self.compute_cv_loss, domain=self.bds, model_type='GP', acquisition_type='EI', acquisition_jitter=0.05, X=x_init, Y=y_init, noise_var=noise, maximize=False) optimizer.run_optimization(max_iter=self.max_iterations) ins = optimizer.get_evaluations()[0] outs = optimizer.get_evaluations()[1].flatten() if self.silent is False: print("\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" % (10**optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2])) print() return optimizer.x_opt