Source code for parrot.encode_sequence

"""
File containing functions for encoding a string of amino acids into a numeric vector.

.............................................................................
parrot was developed by the Holehouse lab
     Original release ---- 2020

Question/comments/concerns? Raise an issue on github:
https://github.com/idptools/parrot

Licensed under the MIT license. 
"""

import sys
import os

import numpy as np
import torch

ONE_HOT = { 'A': 0, 'C': 1, 'D': 2, 'E': 3, 'F': 4, 'G': 5, 'H': 6, 
            'I': 7, 'K': 8, 'L': 9, 'M': 10, 'N': 11, 'P': 12, 'Q': 13, 
            'R': 14, 'S': 15, 'T': 16, 'V': 17, 'W': 18, 'Y': 19 }

[docs]def one_hot(seq): """Convert an amino acid sequence to a PyTorch tensor of one-hot vectors Each amino acid is represented by a length 20 vector with a single 1 and 19 0's Inputing a sequence with a nono-canonical amino acid letter will cause the program to exit. E.g. Glutamic acid (E) is encoded: [0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0] Parameters ---------- seq : str An uppercase sequence of amino acids (single letter code) Returns ------- torch.IntTensor a PyTorch tensor representing the encoded sequence """ l = len(seq) m = np.zeros((l, 20)) try: for i in range(l): m[i, ONE_HOT[seq[i]]] = 1 except: error_str = 'Invalid amino acid detected: ' + seq[i] raise ValueError(error_str) return torch.from_numpy(m)
[docs]def rev_one_hot(seq_vectors): """Decode a list of one-hot sequence vectors into amino acid sequences Parameters ---------- seq_vectors : list of numpy arrays A list containing sequence vectors Returns ------- list Strings of amino acid sequences """ REV_ONE_HOT = 'ACDEFGHIKLMNPQRSTVWY' sequences = [] for seq_vector in seq_vectors: seq = [] for residue in seq_vector: seq.append(REV_ONE_HOT[np.argmax(residue)]) sequences.append("".join(seq)) return sequences
############ Biophysical scale encoding ############# # Map each amino acid to a vector of biophysical properties # 0: Hydrophobicity # 1: Charge # 2: pI # 3: Molecular weight (g/mol) # 4: Aromatic amino acid # 5: Capable of hydrogen bonding # 6: Side chain SASA (measured from ACE-XX-NME dipeptide) # 7: Backbone SASA (measured from ACE-XX-NME dipeptide) # 8: Free energy of solvation BIOPHYSICS = { 'A': [1.8, 0, 6.0, 89.1, 0, 0, 75.8, 76.1, 1.9], 'C': [2.5, 0, 5.1, 121.2, 0, 0, 115.4, 67.9, -1.2], 'D': [-3.5, -1, 2.8, 133.1, 0, 1, 130.3, 71.8, -107.3], 'E': [-3.5, -1, 3.2, 147.1, 0, 1, 161.8, 68.1, -107.3], 'F': [2.8, 0, 5.5, 165.2, 1, 0, 209.4, 66.0, -0.8], 'G': [-0.4, 0, 6.0, 75.1, 0, 0, 0.0, 115.0, 0.0], 'H': [-3.2, 1, 7.6, 155.2, 0, 1, 180.8, 67.5, -52.7], # Avg of HIP and HIE 'I': [4.5, 0, 6.0, 131.2, 0, 0, 172.7, 60.3, 2.2], 'K': [-3.9, 1, 9.7, 146.2, 0, 1, 205.9, 68.7, -100.9], 'L': [3.8, 0, 6.0, 131.2, 0, 0, 172.0, 64.5, 2.3], 'M': [1.9, 0, 5.7, 149.2, 0, 0, 184.8, 67.8, -1.4], 'N': [-3.5, 0, 5.4, 132.1, 0, 1, 142.7, 66.8, -9.7], 'P': [-1.6, 0, 6.3, 115.1, 0, 0, 134.3, 55.8, 2.0], 'Q': [-3.5, 0, 5.7, 146.2, 0, 1, 173.3, 66.6, -9.4], 'R': [-4.5, 1, 10.8, 174.2, 0, 1, 236.5, 66.7, -100.9], 'S': [-0.8, 0, 5.7, 105.1, 0, 1, 95.9, 72.9, -5.1], 'T': [-0.7, 0, 5.6, 119.1, 0, 1, 130.9, 64.1, -5.0], 'V': [4.2, 0, 6.0, 117.1, 0, 0, 143.1, 61.7, 2.0], 'W': [-0.9, 0, 5.9, 204.2, 1, 1, 254.6, 64.3, -5.9], 'Y': [-1.3, 0, 5.7, 181.2, 1, 1, 222.5, 71.9, -6.1] }
[docs]def biophysics(seq): """Convert an amino acid sequence to a PyTorch tensor with biophysical encoding Each amino acid is represented by a length 9 vector with each value representing a biophysical property. The nine encoded biophysical scales are Kyte-Doolittle hydrophobicity, charge, isoelectric point, molecular weight, aromaticity, h-bonding ability, side chain solvent accessible surface area, backbone SASA, and free energy of solvation. Inputing a sequence with a nono-canonical amino acid letter will cause the program to exit. E.g. Glutamic acid (E) is: [-3.5, -1, 3.2, 147.1, 0, 1, 161.8, 68.1, -107.3] Parameters ---------- seq : str An uppercase sequence of amino acids (single letter code) Returns ------- torch.FloatTensor a PyTorch tensor representing the encoded sequence """ l = len(seq) m = np.zeros((l, len(BIOPHYSICS['A']))) try: for i in range(l): m[i] = BIOPHYSICS[seq[i]] except: error_str = 'Invalid amino acid detected: ' + seq[i] raise ValueError(error_str) return torch.from_numpy(m)
[docs]def rev_biophysics(seq_vectors): """Decode a list of biophysically-encoded sequence vectors into amino acid sequences Parameters ---------- seq_vectors : list of numpy arrays A list containing sequence vectors Returns ------- list Strings of amino acid sequences """ REV_BIOPHYSICS = {} for key, value in BIOPHYSICS.items(): REV_BIOPHYSICS[str(value[6])] = key sequences = [] for seq_vector in seq_vectors: seq = [] for residue in seq_vector: seq.append(REV_BIOPHYSICS[str(residue[6])]) sequences.append("".join(seq)) return sequences
################## User-specified encoding ####################
[docs]def parse_encode_file(file): """Helper function to convert an encoding file into key:value dictionary""" with open(file) as f: lines = [x.strip().split() for x in f] l = len(lines[0]) - 1 d = {} for line in lines: d[line[0]] = line[1:] if len(line) - 1 != l: raise ValueError('Vectors in encoding file do not have same length.') return d, l
[docs]class UserEncoder(): """User-specified amino acid-to-vector encoding scheme object Attributes ---------- encode_file : str A path to a file that describes the encoding scheme encode_dict : dict A dictionary that maps each amino acid to a numeric vector input_size : int The length of the encoding vector used for each amino acid """ def __init__(self, encode_file): """ Parameters ---------- encode_file : str A path to a file that describes the encoding scheme """ self.encode_file = os.path.abspath(encode_file) if not os.path.isfile(self.encode_file): raise FileNotFoundError('Encoding file does not exist.') self.encode_dict, self.input_size = parse_encode_file(self.encode_file) def __len__(self): """Get length of encoding scheme""" return self.input_size
[docs] def encode(self, seq): """Convert an amino acid sequence into this encoding scheme Parameters ---------- seq : str An uppercase sequence of amino acids (single letter code) Returns ------- torch.FloatTensor a PyTorch tensor representing the encoded sequence """ l = len(seq) m = np.zeros((l, self.input_size)) try: for i in range(l): m[i] = self.encode_dict[seq[i]] except: error_str = 'Invalid amino acid detected: ' + seq[i] raise ValueError(error_str) return torch.from_numpy(m)
[docs] def decode(self, seq_vectors): """Converts a list of sequence vectors back to a list of protein sequences Parameters ---------- seq_vectors : list of numpy arrays A list containing sequence vectors Returns ------- list Strings of amino acid sequences """ # Create a reverse of the encode_dict using hashing rev_dict = {} for key, value in self.encode_dict.items(): value = np.array(value, dtype=np.float32) rev_dict[hash(tuple(value))] = key sequences = [] for seq_vector in seq_vectors: seq = [] for residue in seq_vector: seq.append(rev_dict[hash(tuple(residue))]) sequences.append("".join(seq)) return sequences