Source code for parrot.brnn_plot

"""
Plot training results for regression and classification tasks on both 
sequence-mapped and residue-mapped data.

.............................................................................
idptools-parrot was developed by the Holehouse lab
     Original release ---- 2020

Question/comments/concerns? Raise an issue on github:
https://github.com/idptools/parrot

Licensed under the MIT license. 
"""

import numpy as np
import torch
import itertools
from scipy.stats import linregress, pearsonr, spearmanr
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import precision_recall_curve, average_precision_score
from sklearn.metrics import f1_score, matthews_corrcoef, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sn
import pandas as pd

from parrot import encode_sequence


[docs]def training_loss(train_loss, val_loss, output_file_prefix=''): """Plot training and validation loss per epoch Figure is saved to file at "<output_file_prefix>_train_val_loss.png". Parameters ---------- train_loss : list training loss across each epoch val_loss : list validation loss across each epoch output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_train_val_loss.png" """ fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(6, 6)) props = dict(boxstyle='round', facecolor='gainsboro', alpha=0.5) num_epochs = len(train_loss) # Loss per epoch training_loss, = ax.plot(np.arange(1, num_epochs+1), train_loss, label='Train') validation_loss, = ax.plot(np.arange(1, num_epochs+1), val_loss, label='Val') ax.set_xlabel("Epoch") ax.set_ylabel("Avg loss") ax.set_title("Training and testing loss per epoch") ax.legend(handles=[training_loss, validation_loss], fontsize=14, facecolor='gainsboro', edgecolor='slategray') if num_epochs < 21: ax.set_xticks(np.arange(2, num_epochs+1, 2)) elif num_epochs < 66: ax.set_xticks(np.arange(5, num_epochs+1, 5)) elif num_epochs < 151: ax.set_xticks(np.arange(10, num_epochs+1, 10)) else: ax.set_xticks(np.arange(50, num_epochs+1, 50)) plt.savefig(output_file_prefix + '_train_val_loss.png') plt.clf()
[docs]def sequence_regression_scatterplot(true, predicted, output_file_prefix=''): """Create a scatterplot for a sequence-mapped values regression problem Figure is saved to file at "<output_file_prefix>_seq_scatterplot.png". Parameters ---------- true : list of PyTorch FloatTensors A list where each item is a [1 x 1] tensor with the true regression value of a particular sequence predicted : list of PyTorch FloatTensors A list where each item is a [1 x 1] tensor with the regression prediction for a particular sequence output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_seq_scatterplot.png" """ true_list = [] pred_list = [] for item in true: true_list.append(item.cpu().numpy()[0][0]) for item in predicted: pred_list.append(item.cpu().numpy()[0][0]) plt.scatter(true_list, pred_list) edge_vals = [0.9*min(min(true_list), min(pred_list)), 1.1*max(max(true_list), max(pred_list))] plt.xlim(edge_vals) plt.ylim(edge_vals) plt.plot(edge_vals, edge_vals, 'k--') plt.xlabel('True') plt.ylabel('Predicted') slope, intercept, r_value, p_value, std_err = linregress(true_list, pred_list) plt.title('Testing accuracy: R^2=%.3f' % (r_value**2)) plt.savefig(output_file_prefix + '_seq_scatterplot.png')
[docs]def residue_regression_scatterplot(true, predicted, output_file_prefix=''): """Create a scatterplot for a residue-mapped values regression problem Each sequence is plotted with a unique marker-color combination, up to 70 different sequences. Figure is saved to file at "<output_file_prefix>_res_scatterplot.png". Parameters ---------- true : list of PyTorch FloatTensors A list where each item is a [1 x len(sequence)] tensor with the true regression values of each residue in a sequence predicted : list of PyTorch FloatTensors A list where each item is a [1 x len(sequence)] tensor with the regression predictions for each residue in a sequence output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_res_scatterplot.png" """ true_list = [] pred_list = [] marker = itertools.cycle(('>', '+', '.', 'o', '*', 'v', 'D')) for item in true: single_frag = item.cpu().numpy()[0].flatten() true_list.append(list(single_frag)) for item in predicted: single_frag = item.cpu().numpy()[0].flatten() pred_list.append(list(single_frag)) for i in range(len(true_list)): plt.scatter(true_list[i], pred_list[i], s=6, marker=next(marker)) plt.figure(1) left, right = plt.xlim() bottom, top = plt.ylim() edge_vals = [min(left, bottom), max(right, top)] plt.xlim(edge_vals) plt.ylim(edge_vals) plt.plot(edge_vals, edge_vals, 'k--') plt.xlabel('True') plt.ylabel('Predicted') slope, intercept, r_value, p_value, std_err = linregress(sum(true_list, []), sum(pred_list, [])) plt.title('Testing accuracy: R^2=%.3f' % (r_value**2)) plt.savefig(output_file_prefix + '_res_scatterplot.png')
[docs]def plot_roc_curve(true_classes, predicted_class_probs, num_classes, output_file_prefix=''): """Create an ROC curve for a sequence classification problem Figure is saved to file at "<output_file_prefix>_ROC_curve.png". Parameters ---------- true_classes : list of PyTorch IntTensors A list where each item is a [1 x 1] tensor with the true class label of a particular sequence predicted_class_probs : list of PyTorch FloatTensors A list where each item is a [1 x num_classes] tensor of the probabilities of assignment to each class num_classes : int Number of distinct data classes output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_ROC_curve.png" """ y_test = np.zeros((len(true_classes), num_classes), dtype=int) for i in range(len(true_classes)): label = true_classes[i].numpy()[0] y_test[i, label] = 1 y_score = np.vstack(predicted_class_probs) # Compute ROC curve and ROC area for each class fpr = dict() tpr = dict() roc_auc = dict() for c in range(num_classes): fpr[c], tpr[c], _ = roc_curve(y_test[:, c], y_score[:, c]) roc_auc[c] = auc(fpr[c], tpr[c]) plt.figure() if num_classes > 2: # Compute micro-average ROC curve and ROC area (if multiclass) fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) # Plot all ROC curves plt.plot(fpr["micro"], tpr["micro"], label='Average (area = {0:0.2f})' ''.format(roc_auc["micro"]), color='deeppink', linestyle=':', linewidth=4) for c in range(num_classes): plt.plot(fpr[c], tpr[c], lw=2, label='Class {0} (area = {1:0.2f})' ''.format(c, roc_auc[c])) elif num_classes == 2: # If binary classification # Plot only one curve (doesn't matter which one, they are symmetric) plt.plot(fpr[1], tpr[1], lw=2, label='Binary class (area = {0:0.2f})' ''.format(roc_auc[1])) plt.plot([0, 1], [0, 1], 'k--', lw=2) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('False Positive Rate') plt.ylabel('True Positive Rate') plt.title('Receiver operating characteristic') plt.legend(loc="lower right", fontsize=8) plt.savefig(output_file_prefix + '_ROC_curve.png')
[docs]def plot_precision_recall_curve(true_classes, predicted_class_probs, num_classes, output_file_prefix=''): """Create an PR curve for a sequence classification problem Figure is saved to file at "<output_file_prefix>_PR_curve.png". Parameters ---------- true_classes : list of PyTorch IntTensors A list where each item is a [1 x 1] tensor with the true class label of a particular sequence predicted_class_probs : list of PyTorch FloatTensors A list where each item is a [1 x num_classes] tensor of the probabilities of assignment to each class num_classes : int Number of distinct data classes output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_PR_curve.png" """ y_test = np.zeros((len(true_classes), num_classes), dtype=int) for i in range(len(true_classes)): label = true_classes[i].numpy()[0] y_test[i, label] = 1 y_score = np.vstack(predicted_class_probs) # For each class precision = dict() recall = dict() average_precision = dict() for i in range(num_classes): precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i]) average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i]) # A "micro-average": quantifying score on all classes jointly precision["micro"], recall["micro"], _ = precision_recall_curve(y_test.ravel(), y_score.ravel()) average_precision["micro"] = average_precision_score(y_test, y_score, average="micro") # Plot plt.figure() plt.plot(recall["micro"], precision["micro"], color='deeppink', linestyle=':', linewidth=4, label='Average (area = {0:0.2f})' ''.format(average_precision["micro"])) for c in range(num_classes): plt.plot(recall[c], precision[c], lw=2, label='Class {0} (area = {1:0.2f})' ''.format(c, average_precision[c])) plt.xlim([0.0, 1.0]) plt.ylim([0.0, 1.05]) plt.xlabel('Recall') plt.ylabel('Precision') plt.title('Precision-Recall') plt.legend() plt.savefig(output_file_prefix + '_PR_curve.png')
[docs]def confusion_matrix(true_classes, predicted_classes, num_classes, output_file_prefix=''): """Create a confusion matrix for a sequence classification problem Figure is saved to file at "<output_file_prefix>_seq_CM.png". Parameters ---------- true_classes : list of PyTorch IntTensors A list where each item is a [1 x 1] tensor with the true class label of a particular sequence predicted_classes : list of PyTorch FloatTensors A list where each item is a [1 x num_classes] tensor prediction of the class label for a particular sequence num_classes : int Number of distinct data classes output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_seq_CM.png" """ cm = np.zeros((num_classes, num_classes)) for i in range(len(true_classes)): cm[np.argmax(predicted_classes[i][0].cpu().numpy()), true_classes[i][0]] += 1 df_cm = pd.DataFrame(cm, range(num_classes), range(num_classes)) sn.set(font_scale=1.4) # for label size sn.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={"size": 16}) # font size plt.xlabel('True labels') plt.ylabel('Predicted labels') plt.title('Test set confusion matrix') plt.tight_layout() plt.savefig(output_file_prefix + '_seq_CM.png')
[docs]def res_confusion_matrix(true_classes, predicted_classes, num_classes, output_file_prefix=''): """Create a confusion matrix for a residue classification problem Figure is saved to file at "<output_file_prefix>_res_CM.png". Parameters ---------- true_classes : list of PyTorch IntTensors A list where each item is a [1 x len(sequence)] tensor with the true class label of the residues in a particular sequence predicted_classes : list of PyTorch FloatTensors A list where each item is a [1 x num_classes x len(sequence)] tensor with predictions of the class label for each residue in a particular sequence num_classes : int Number of distinct data classes output_file_prefix : str, optional File to which the plot will be saved as "<output_file_prefix>_res_CM.png" """ true_list = [] pred_list = [] for item in true_classes: single_frag = list(item[0].cpu().numpy().flatten()) true_list = true_list + single_frag for item in predicted_classes: single_frag = item[0].permute(1, 0).cpu().numpy() for residue in single_frag: pred_list.append(np.argmax(residue)) cm = np.zeros((num_classes, num_classes)) for i in range(len(true_list)): cm[pred_list[i], true_list[i]] += 1 df_cm = pd.DataFrame(cm, range(num_classes), range(num_classes)) sn.set(font_scale=1.4) # for label size sn.heatmap(df_cm, cmap='Blues', annot=True, annot_kws={"size": 16}) # font size plt.xlabel('True labels') plt.ylabel('Predicted labels') plt.title('Test set confusion matrix') plt.tight_layout() plt.savefig(output_file_prefix + '_res_CM.png')
[docs]def write_performance_metrics(sequence_data, dtype, problem_type, prob_class, output_file_prefix=''): """Writes a short text file describing performance on a variety of metrics Writes different output depending on whether a classification or regression task is specified. Also produces unique output if in probabilistic classification mode. File is saved to "<output_file_prefix>_performance_stats.txt". Parameters ---------- sequence_data : list of lists Details of the output predictions for each of the sequences in the test set. Each inner list represents a sample in the test set, with the format: [sequence_vector, true_value, predicted_value, sequence_ID] dtype : str The format of values in the dataset. Should be 'sequence' for datasets with a single value (or class label) per sequence, or 'residues' for datasets with values (or class labels) for every residue in a sequence. problem_type : str The machine learning task--should be either 'regression' or 'classification'. prob_class : bool Flag indicating if probabilistic classification was specified by the user. output_file_prefix : str Path and filename prefix to which the test set predictions will be saved. Final file path is "<output_file_prefix>_performance_stats.txt" """ true_vals = [l[1] for l in sequence_data] pred_vals = [l[2] for l in sequence_data] perform_metrics = {} if dtype == 'residues': true_vals = np.hstack(true_vals) pred_vals = np.hstack(pred_vals) if problem_type == 'classification': # Take care of probabilistic-classification case first if prob_class: # Reformat pred_vals = np.vstack(pred_vals) true_vals_array = np.zeros((len(true_vals), len(pred_vals[0])), dtype=int) for i in range(len(true_vals)): true_vals_array[i, true_vals[i]] = 1 # AUROC, AUPRC perform_metrics['Area under Precision-Recall curve'] = round( average_precision_score(true_vals_array, pred_vals, average="micro"), 3) fpr, tpr, _ = roc_curve(true_vals_array.ravel(), pred_vals.ravel()) perform_metrics["Area under ROC"] = round(auc(fpr, tpr), 3) # Change probs to discrete classes pred_vals = np.argmax(pred_vals, axis=1) # Then take care of general classification stats: accuracy, F1, MCC perform_metrics['Matthews Correlation Coef'] = round( matthews_corrcoef(true_vals, pred_vals), 3) perform_metrics['F1 Score'] = round( f1_score(true_vals, pred_vals, average='weighted'), 3) perform_metrics['Accuracy'] = round(accuracy_score(true_vals, pred_vals), 3) elif problem_type == 'regression': # Pearson R, Spearman R pears_r, p_val = pearsonr(true_vals, pred_vals) perform_metrics['Pearson R'] = round(pears_r, 3) spearman_r, p_val = spearmanr(true_vals, pred_vals) perform_metrics['Spearman R'] = round(spearman_r, 3) # Write performance metrics to file with open(output_file_prefix + '_performance_stats.txt', 'w') as f: for key, value in perform_metrics.items(): outstr = '%s : %.3f\n' % (key, value) f.write(outstr)
[docs]def output_predictions_to_file(sequence_data, excludeSeqID, encoding_scheme, probabilistic_class, encoder=None, output_file_prefix=''): """Output sequences, their true values, and their predicted values to a file Used on the output of the test_unlabeled_data() function in the train_network module in order to detail the performance of the trained network on the test set. Produces the file "test_set_predictions.tsv" in output_dir. Each pair of lines in this tsvfile corresponds to a particular test set sequence, with the first containing the true data values, and the second line having the predicted data values. Parameters ---------- sequence_data : list of lists Details of the output predictions for each of the sequences in the test set. Each inner list represents a sample in the test set, with the format: [sequence_vector, true_value, predicted_value, sequence_ID] excludeSeqID : bool Boolean indicating whether or not each line in `tsvfile` has a sequence ID (default is False) encoding_scheme : str Description of how an amino acid sequence should be encoded as a numeric vector. Providing a string other than 'onehot', 'biophysics', or 'user' will produce unintended consequences. probabilistic_class : bool Flag indicating if probabilistic classification was specified by the user. If True, instead of class labels, predictions will be output as probabilities of each class. encoder: UserEncoder object, optional If encoding_scheme is 'user', encoder should be a UserEncoder object that can convert amino acid sequences to numeric vectors. If encoding_scheme is not 'user', use None. output_file_prefix : str Path and filename prefix to which the test set predictions will be saved. Final file path is "<output_file_prefix>_predictions.tsv" """ seq_vectors = [] true_vals = [] pred_vals = [] names = [] count = 0 for sequence in sequence_data: seq_vector, true_val, pred_val, name = sequence seq_vectors.append(seq_vector) true_vals.append(true_val) pred_vals.append(pred_val) if excludeSeqID: names.append('test' + str(count)) count += 1 else: names.append(name) # Decode the sequence vectors if encoding_scheme == 'onehot': sequences = encode_sequence.rev_one_hot(seq_vectors) elif encoding_scheme == 'biophysics': sequences = encode_sequence.rev_biophysics(seq_vectors) else: sequences = encoder.decode(seq_vectors) # Write to file with open(output_file_prefix + '_predictions.tsv', 'w') as tsvfile: for i in range(len(names)): # Adjust formatting for residues or sequence data if isinstance(true_vals[i], np.ndarray): true_vals_format = ' '.join(true_vals[i].astype(str)) pred_vals_format = ' '.join(pred_vals[i].astype(str)) elif probabilistic_class: true_vals_format = true_vals[i] pred_vals_format = ' '.join(np.around(pred_vals[i], decimals=4).astype(str)) else: true_vals_format = true_vals[i] pred_vals_format = pred_vals[i] ''' Format: NAME_TRUE SEQUENCE TRUE_VALUE(S) NAME_PRED SEQUENCE PRED_VALUE(S) ''' output_str = "%s_TRUE %s %s\n" % (names[i], sequences[i], true_vals_format) output_str = output_str + "%s_PRED %s %s\n" % (names[i], sequences[i], pred_vals_format) tsvfile.write(output_str)