Source code for parrot.bayesian_optimization

"""
This file contains code for conducting Bayesian optimization.

.............................................................................
idptools-parrot was developed by the Holehouse lab
     Original release ---- 2020

Question/comments/concerns? Raise an issue on github:
https://github.com/idptools/parrot

Licensed under the MIT license. 
"""

import math

import numpy as np

try:
    import GPy
    import GPyOpt
    from GPyOpt.methods import BayesianOptimization
except ImportError:
    print('Error importing GPy.')
    print(' If trying to run parrot-optimize, make sure to use `pip install idptools-parrot[optimize]`')

from parrot import train_network
from parrot import brnn_architecture


[docs]class BayesianOptimizer(object):
    """A class for conducting Bayesian Optimization on a PyTorch RNN

    Sets up and runs GPy Bayesian Optimization in order to choose the best-
    performing hyperparameters for a RNN for a given machine learning task. 
    Iteratively change learning rate, hidden vector size, and the number of layers
    in the network, then train and validating using 5-fold cross validation.

    Attributes
    ----------
    cv_dataloaders : list of tuples of PyTorch DataLoader objects
            For each of the cross-val folds, a tuple containing a training set
            DataLoader and a validation set DataLoader.
    input_size : int
            Length of the amino acid encoding vectors
    n_epochs : int
            Number of epochs to train for each iteration of the algorithm
    n_classes : int
            Number of classes
    n_folds : int
            Number of cross-validation folds
    problem_type : str
            'classification' or 'regression'
    dtype : str
            'sequence' or 'residues'
    weights_file : str
            Path to which the network weights will be saved during training
    device : str
            'cpu' or 'cuda' depending on system hardware
    max_iterations : int
            Maximum number of iterations to perform the optimization procedure
    silent : bool
            If true, do not print updates to console
    bds : list of dicts
            GPy-compatible bounds for each of the hyperparameters to be optimized
    """

    def __init__(self, cv_dataloaders, input_size, n_epochs, n_classes,
                 dtype, weights_file, max_iterations, device, silent):
        """
        Parameters
        ----------
        cv_dataloaders : list of tuples of PyTorch DataLoader objects
                For each of the cross-val folds, a tuple containing a training set
                DataLoader and a validation set DataLoader.
        input_size : int
                Length of the amino acid encoding vectors
        n_epochs : int
                Number of epochs to train for each iteration of the algorithm
        n_classes : int
                Number of classes
        dtype : str
                'sequence' or 'residues'
        weights_file : str
                Path to which the network weights will be saved during training
        device : str
                'cpu' or 'cuda' depending on system hardware
        max_iterations : int
                Maximum number of iterations to perform the optimization procedure
        silent : bool
                If true, do not print updates to console
        """

        self.cv_loaders = cv_dataloaders
        self.input_size = input_size
        self.n_epochs = n_epochs
        self.n_folds = len(cv_dataloaders)
        self.n_classes = n_classes
        if n_classes > 1:
            self.problem_type = 'classification'
        else:
            self.problem_type = 'regression'

        self.dtype = dtype
        self.weights_file = weights_file
        self.max_iterations = max_iterations
        self.device = device
        self.silent = silent

        self.bds = [{'name': 'log_learning_rate', 'type': 'continuous', 'domain': (-5, -2)},  # 0.00001-0.01
                    {'name': 'n_layers', 'type': 'discrete', 'domain': tuple(range(1, 6))},  # 1-5
                    {'name': 'hidden_size', 'type': 'discrete', 'domain': tuple(range(5, 51))}]  # 5-50

[docs]    def compute_cv_loss(self, hyperparameters):
        """Compute the average cross-val loss for a given set of hyperparameters

        Given N sets of hyperparameters, determine the average cross-validation loss
        for BRNNs trained with these parameters.

        Parameters
        ----------
        hyperparameters : numpy float array
                Each row corresponds to a set of hyperparameters, in the order:
                [log_learining_rate, n_layers, hidden_size]

        Returns
        -------
        numpy float array
                a Nx1 numpy array of the average cross-val loss 
                per set of input hyperparameters
        """

        cv_outputs = np.zeros((len(hyperparameters), self.n_folds))

        for i in range(len(hyperparameters)):

            log_lr, nl, hs = hyperparameters[i]
            lr = 10**float(log_lr)
            nl = int(nl)
            hs = int(hs)

            # Train and validate network with these hyperparams using k-fold CV
            cv_outputs[i] = self.eval_cv_brnns(lr, nl, hs)
            avg = np.average(cv_outputs[i])

            if self.silent is False:
                print('  %.6f	|     %2d       |         %2d           |    %.3f' % (lr, nl, hs, avg))

        outputs = np.average(cv_outputs, axis=1)
        return outputs

[docs]    def eval_cv_brnns(self, lr, nl, hs):
        """Train and test a network with given parameters across all cross-val folds

        Parameters
        ----------
        lr : float
                Learning rate of the network
        nl : int
                Number of hidden layers (for each direction) in the network
        hs : int
                Size of hidden vectors in the network

        Returns
        -------
        numpy float array
                the best validation loss from each fold of cross validation
        """

        cv_losses = np.zeros(self.n_folds) - 1  # -1 so that it's obvious if something goes wrong

        for k in range(self.n_folds):
            if self.dtype == 'sequence':
                # Use a many-to-one architecture
                brnn_network = brnn_architecture.BRNN_MtO(self.input_size, hs, nl,
                                                          self.n_classes, self.device).to(self.device)
            else:
                # Use a many-to-many architecture
                brnn_network = brnn_architecture.BRNN_MtM(self.input_size, hs, nl,
                                                          self.n_classes, self.device).to(self.device)

            # Train network with this set of hyperparameters
            train_losses, val_losses = train_network.train(brnn_network, self.cv_loaders[k][0],
                                                           self.cv_loaders[k][1], self.dtype, self.problem_type,
                                                           self.weights_file, stop_condition='iter', device=self.device,
                                                           learn_rate=lr, n_epochs=self.n_epochs, silent=True)
            # Take best val loss
            best_val_loss = np.min(val_losses)
            cv_losses[k] = best_val_loss

        return cv_losses

[docs]    def initial_search(self, x):
        """Calculate loss and estimate noise for an initial set of hyperparameters

        Parameters
        ----------
        x : numpy array
                Array containing initial hyperparameters to test

        Returns
        -------
        numpy array
                Array containing the average losses of the input hyperparameters
        float
                The standard deviation of loss across cross-val folds for the
                input hyperparameters; an estimation of the training noise
        """
        cv_outputs = np.zeros((len(x), self.n_folds))
        y = np.zeros((len(x), 1))

        for i in range(len(x)):

            log_lr, nl, hs = x[i]
            lr = 10**float(log_lr)
            nl = int(nl)
            hs = int(hs)

            # Train and validate network with these hyperparams using k-fold CV
            cv_outputs[i] = self.eval_cv_brnns(lr, nl, hs)

        # Calculate the avg and standard deviation of the losses
        for i in range(len(cv_outputs)):
            y[i] = np.average(cv_outputs[i])
        stddevs = np.std(cv_outputs, axis=1)
        avg_stddev = np.average(stddevs)

        return y, avg_stddev

[docs]    def optimize(self):
        """Set up and run Bayesian Optimization on the BRNN using GPy

        Returns
        -------
        list
                the best hyperparameters are chosen by Bayesian Optimization. Returned
                in the order: [lr, nl, hs]
        """

        # Initial hyperparameter search -- used to get noise estimate
        x_init = np.array([[-3.0, 1, 20], [-3.0, 2, 20], [-3.0, 3, 20], [-3.0, 4, 20], [-3.0, 5, 20],
                           [-2.0, 2, 20], [-3.3, 2, 20], [-4.0, 2, 20], [-5.0, 2, 20],
                           [-3.0, 2, 5],  [-3.0, 2, 15], [-3.0, 2, 35], [-3.0, 2, 50]])
        y_init, noise = self.initial_search(x_init)

        if self.silent is False:
            print("\nInitial search results:")
            print("lr\tnl\ths\toutput")
            for i in range(len(x_init)):
                print("%.5f\t%2d\t%2d\t%.4f" % (10**x_init[i][0], x_init[i][1], x_init[i][2], y_init[i][0]))
            print("Noise estimate:", noise)
            print('\n')
            print('Primary optimization:')
            print('--------------------\n')
            print('Learning rate   |   n_layers   |   hidden vector size |  avg CV loss  ')
            print('======================================================================')

        optimizer = BayesianOptimization(f=self.compute_cv_loss,
                                         domain=self.bds,
                                         model_type='GP',
                                         acquisition_type='EI',
                                         acquisition_jitter=0.05,
                                         X=x_init,
                                         Y=y_init,
                                         noise_var=noise,
                                         maximize=False)

        optimizer.run_optimization(max_iter=self.max_iterations)

        ins = optimizer.get_evaluations()[0]
        outs = optimizer.get_evaluations()[1].flatten()

        if self.silent is False:
            print("\nThe optimal hyperparameters are:\nlr = %.5f\nnl = %d\nhs = %d" %
                  (10**optimizer.x_opt[0], optimizer.x_opt[1], optimizer.x_opt[2]))
            print()

        return optimizer.x_opt