ai-content-maker/.venv/Lib/site-packages/transformers/models/esm/modeling_esmfold.py

2323 lines
85 KiB
Python
Raw Permalink Normal View History

2024-05-03 04:18:51 +03:00
# coding=utf-8
# Copyright 2022 Meta and The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import math
import sys
from dataclasses import dataclass
from functools import partial
from typing import Callable, Dict, List, Optional, Sequence, Tuple, Union
import numpy as np
import torch
import torch.nn as nn
from torch.nn import LayerNorm
from ...integrations.deepspeed import is_deepspeed_available
from ...modeling_outputs import ModelOutput
from ...utils import (
ContextManagers,
add_start_docstrings,
add_start_docstrings_to_model_forward,
is_scipy_available,
logging,
replace_return_docstrings,
)
from .configuration_esm import EsmConfig
from .modeling_esm import ESM_START_DOCSTRING, EsmModel, EsmPreTrainedModel
from .openfold_utils import (
OFProtein,
Rigid,
Rotation,
atom14_to_atom37,
chunk_layer,
compute_predicted_aligned_error,
compute_tm,
frames_and_literature_positions_to_atom14_pos,
make_atom14_masks,
residue_constants,
to_pdb,
torsion_angles_to_frames,
)
logger = logging.get_logger(__name__)
_CHECKPOINT_FOR_DOC = "facebook/esmfold_v1"
_CONFIG_FOR_DOC = "EsmConfig"
@dataclass
class EsmForProteinFoldingOutput(ModelOutput):
"""
Output type of [`EsmForProteinFoldingOutput`].
Args:
frames (`torch.FloatTensor`):
Output frames.
sidechain_frames (`torch.FloatTensor`):
Output sidechain frames.
unnormalized_angles (`torch.FloatTensor`):
Predicted unnormalized backbone and side chain torsion angles.
angles (`torch.FloatTensor`):
Predicted backbone and side chain torsion angles.
positions (`torch.FloatTensor`):
Predicted positions of the backbone and side chain atoms.
states (`torch.FloatTensor`):
Hidden states from the protein folding trunk.
s_s (`torch.FloatTensor`):
Per-residue embeddings derived by concatenating the hidden states of each layer of the ESM-2 LM stem.
s_z (`torch.FloatTensor`):
Pairwise residue embeddings.
distogram_logits (`torch.FloatTensor`):
Input logits to the distogram used to compute residue distances.
lm_logits (`torch.FloatTensor`):
Logits output by the ESM-2 protein language model stem.
aatype (`torch.FloatTensor`):
Input amino acids (AlphaFold2 indices).
atom14_atom_exists (`torch.FloatTensor`):
Whether each atom exists in the atom14 representation.
residx_atom14_to_atom37 (`torch.FloatTensor`):
Mapping between atoms in the atom14 and atom37 representations.
residx_atom37_to_atom14 (`torch.FloatTensor`):
Mapping between atoms in the atom37 and atom14 representations.
atom37_atom_exists (`torch.FloatTensor`):
Whether each atom exists in the atom37 representation.
residue_index (`torch.FloatTensor`):
The index of each residue in the protein chain. Unless internal padding tokens are used, this will just be
a sequence of integers from 0 to `sequence_length`.
lddt_head (`torch.FloatTensor`):
Raw outputs from the lddt head used to compute plddt.
plddt (`torch.FloatTensor`):
Per-residue confidence scores. Regions of low confidence may indicate areas where the model's prediction is
uncertain, or where the protein structure is disordered.
ptm_logits (`torch.FloatTensor`):
Raw logits used for computing ptm.
ptm (`torch.FloatTensor`):
TM-score output representing the model's high-level confidence in the overall structure.
aligned_confidence_probs (`torch.FloatTensor`):
Per-residue confidence scores for the aligned structure.
predicted_aligned_error (`torch.FloatTensor`):
Predicted error between the model's prediction and the ground truth.
max_predicted_aligned_error (`torch.FloatTensor`):
Per-sample maximum predicted error.
"""
frames: torch.FloatTensor = None
sidechain_frames: torch.FloatTensor = None
unnormalized_angles: torch.FloatTensor = None
angles: torch.FloatTensor = None
positions: torch.FloatTensor = None
states: torch.FloatTensor = None
s_s: torch.FloatTensor = None
s_z: torch.FloatTensor = None
distogram_logits: torch.FloatTensor = None
lm_logits: torch.FloatTensor = None
aatype: torch.FloatTensor = None
atom14_atom_exists: torch.FloatTensor = None
residx_atom14_to_atom37: torch.FloatTensor = None
residx_atom37_to_atom14: torch.FloatTensor = None
atom37_atom_exists: torch.FloatTensor = None
residue_index: torch.FloatTensor = None
lddt_head: torch.FloatTensor = None
plddt: torch.FloatTensor = None
ptm_logits: torch.FloatTensor = None
ptm: torch.FloatTensor = None
aligned_confidence_probs: torch.FloatTensor = None
predicted_aligned_error: torch.FloatTensor = None
max_predicted_aligned_error: torch.FloatTensor = None
ESMFOLD_INPUTS_DOCSTRING = r"""
Args:
input_ids (`torch.LongTensor` of shape `({0})`):
Indices of input sequence tokens in the vocabulary.
Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
[`PreTrainedTokenizer.__call__`] for details.
[What are input IDs?](../glossary#input-ids)
attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
- 1 for tokens that are **not masked**,
- 0 for tokens that are **masked**.
[What are attention masks?](../glossary#attention-mask)
position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
config.max_position_embeddings - 1]`.
[What are position IDs?](../glossary#position-ids)
masking_pattern (`torch.LongTensor` of shape `({0})`, *optional*):
Locations of tokens to mask during training as a form of regularization. Mask values selected in `[0, 1]`.
num_recycles (`int`, *optional*, defaults to `None`):
Number of times to recycle the input sequence. If `None`, defaults to `config.num_recycles`. "Recycling"
consists of passing the output of the folding trunk back in as input to the trunk. During training, the
number of recycles should vary with each batch, to ensure that the model learns to output valid predictions
after each recycle. During inference, num_recycles should be set to the highest value that the model was
trained with for maximum accuracy. Accordingly, when this value is set to `None`, config.max_recycles is
used.
"""
def is_fp16_enabled():
# Autocast world
fp16_enabled = torch.get_autocast_gpu_dtype() == torch.float16
fp16_enabled = fp16_enabled and torch.is_autocast_enabled()
return fp16_enabled
def is_deepspeed_initialized():
if is_deepspeed_available():
return False
else:
try:
import deepspeed
# This is not available in all DeepSpeed versions.
return deepspeed.utils.is_initialized()
except Exception:
return False
def collate_dense_tensors(samples: List[torch.Tensor], pad_v: float = 0) -> torch.Tensor:
"""
Takes a list of tensors with the following dimensions:
[(d_11, ..., d_1K),
(d_21, ..., d_2K), ..., (d_N1, ..., d_NK)]
and stack + pads them into a single tensor of:
(N, max_i=1,N { d_i1 }, ..., max_i=1,N {diK})
"""
if len(samples) == 0:
return torch.Tensor()
if len({x.dim() for x in samples}) != 1:
raise RuntimeError(f"Samples has varying dimensions: {[x.dim() for x in samples]}")
(device,) = tuple({x.device for x in samples}) # assumes all on same device
max_shape = [max(lst) for lst in zip(*[x.shape for x in samples])]
result = torch.empty(len(samples), *max_shape, dtype=samples[0].dtype, device=device)
result.fill_(pad_v)
for i in range(len(samples)):
result_i = result[i]
t = samples[i]
result_i[tuple(slice(0, k) for k in t.shape)] = t
return result
def flatten_final_dims(t: torch.Tensor, no_dims: int):
return t.reshape(t.shape[:-no_dims] + (-1,))
def permute_final_dims(tensor: torch.Tensor, inds: List[int]):
zero_index = -1 * len(inds)
first_inds = list(range(len(tensor.shape[:zero_index])))
return tensor.permute(first_inds + [zero_index + i for i in inds])
def dict_multimap(fn, dicts):
first = dicts[0]
new_dict = {}
for k, v in first.items():
all_v = [d[k] for d in dicts]
if isinstance(v, dict):
new_dict[k] = dict_multimap(fn, all_v)
else:
new_dict[k] = fn(all_v)
return new_dict
def trunc_normal_init_(weights, scale=1.0, fan="fan_in"):
shape = weights.shape
scale = scale / max(1, shape[1])
if not is_scipy_available():
logger.warning(
"This init requires scipy, but scipy was not found, default to an approximation that might not be"
" equivalent."
)
std = math.sqrt(scale)
torch.nn.init.normal_(weights, std=std).clamp(min=0.0, max=2.0 * std)
else:
from scipy.stats import truncnorm
std = math.sqrt(scale) / truncnorm.std(a=-2, b=2, loc=0, scale=1)
samples = truncnorm.rvs(a=-2, b=2, loc=0, scale=std, size=weights.numel())
samples = np.reshape(samples, shape)
weights.copy_(torch.tensor(samples, device=weights.device))
def ipa_point_weights_init_(weights):
with torch.no_grad():
softplus_inverse_1 = 0.541324854612918
weights.fill_(softplus_inverse_1)
class EsmFoldLinear(nn.Linear):
"""
A Linear layer with built-in nonstandard initializations. Called just like torch.nn.Linear.
Implements the initializers in 1.11.4, plus some additional ones found in the code.
"""
def __init__(
self,
in_dim: int,
out_dim: int,
bias: bool = True,
init: str = "default",
init_fn: Optional[Callable[[torch.Tensor, torch.Tensor], None]] = None,
):
"""
Args:
in_dim:
The final dimension of inputs to the layer
out_dim:
The final dimension of layer outputs
bias:
Whether to learn an additive bias. True by default
init:
The initializer to use. Choose from:
"default": LeCun fan-in truncated normal initialization "relu": He initialization w/ truncated normal
distribution "glorot": Fan-average Glorot uniform initialization "gating": Weights=0, Bias=1 "normal":
Normal initialization with std=1/sqrt(fan_in) "final": Weights=0, Bias=0
Overridden by init_fn if the latter is not None.
init_fn:
A custom initializer taking weight and bias as inputs. Overrides init if not None.
"""
super().__init__(in_dim, out_dim, bias=bias)
if bias:
with torch.no_grad():
self.bias.fill_(0)
self.init = init
self.init_fn = init_fn
if init not in ["default", "relu", "glorot", "gating", "normal", "final"]:
raise ValueError("Invalid init string.")
class EsmFoldLayerNorm(nn.Module):
def __init__(self, c_in, eps=1e-5):
super().__init__()
self.c_in = (c_in,)
self.eps = eps
self.weight = nn.Parameter(torch.ones(c_in))
self.bias = nn.Parameter(torch.zeros(c_in))
def forward(self, x):
d = x.dtype
if d is torch.bfloat16 and not is_deepspeed_initialized():
with torch.cuda.amp.autocast(enabled=False):
out = nn.functional.layer_norm(x, self.c_in, self.weight.to(dtype=d), self.bias.to(dtype=d), self.eps)
else:
out = nn.functional.layer_norm(x, self.c_in, self.weight, self.bias, self.eps)
return out
@torch.jit.ignore
def softmax_no_cast(t: torch.Tensor, dim: int = -1) -> torch.Tensor:
"""
Softmax, but without automatic casting to fp32 when the input is of type bfloat16
"""
d = t.dtype
if d is torch.bfloat16 and not is_deepspeed_initialized():
with torch.cuda.amp.autocast(enabled=False):
s = torch.nn.functional.softmax(t, dim=dim)
else:
s = torch.nn.functional.softmax(t, dim=dim)
return s
class EsmFoldAttention(nn.Module):
"""
Standard multi-head attention using AlphaFold's default layer initialization. Allows multiple bias vectors.
"""
def __init__(
self,
c_q: int,
c_k: int,
c_v: int,
c_hidden: int,
no_heads: int,
gating: bool = True,
):
"""
Args:
c_q:
Input dimension of query data
c_k:
Input dimension of key data
c_v:
Input dimension of value data
c_hidden:
Per-head hidden dimension
no_heads:
Number of attention heads
gating:
Whether the output should be gated using query data
"""
super().__init__()
self.c_q = c_q
self.c_k = c_k
self.c_v = c_v
self.c_hidden = c_hidden
self.no_heads = no_heads
self.gating = gating
# DISCREPANCY: c_hidden is not the per-head channel dimension, as
# stated in the supplement, but the overall channel dimension.
self.linear_q = EsmFoldLinear(self.c_q, self.c_hidden * self.no_heads, bias=False, init="glorot")
self.linear_k = EsmFoldLinear(self.c_k, self.c_hidden * self.no_heads, bias=False, init="glorot")
self.linear_v = EsmFoldLinear(self.c_v, self.c_hidden * self.no_heads, bias=False, init="glorot")
self.linear_o = EsmFoldLinear(self.c_hidden * self.no_heads, self.c_q, init="final")
self.linear_g = None
if self.gating:
self.linear_g = EsmFoldLinear(self.c_q, self.c_hidden * self.no_heads, init="gating")
self.sigmoid = nn.Sigmoid()
def _prep_qkv(self, q_x: torch.Tensor, kv_x: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
# [*, Q/K/V, H * C_hidden]
q = self.linear_q(q_x)
k = self.linear_k(kv_x)
v = self.linear_v(kv_x)
# [*, Q/K, H, C_hidden]
q = q.view(q.shape[:-1] + (self.no_heads, -1))
k = k.view(k.shape[:-1] + (self.no_heads, -1))
v = v.view(v.shape[:-1] + (self.no_heads, -1))
# [*, H, Q/K, C_hidden]
q = q.transpose(-2, -3)
k = k.transpose(-2, -3)
v = v.transpose(-2, -3)
q /= math.sqrt(self.c_hidden)
return q, k, v
def _wrap_up(self, o: torch.Tensor, q_x: torch.Tensor) -> torch.Tensor:
if self.linear_g is not None:
g = self.sigmoid(self.linear_g(q_x))
# [*, Q, H, C_hidden]
g = g.view(g.shape[:-1] + (self.no_heads, -1))
o = o * g
# [*, Q, H * C_hidden]
o = flatten_final_dims(o, 2)
# [*, Q, C_q]
o = self.linear_o(o)
return o
def forward(
self,
q_x: torch.Tensor,
kv_x: torch.Tensor,
biases: Optional[List[torch.Tensor]] = None,
use_memory_efficient_kernel: bool = False,
use_lma: bool = False,
lma_q_chunk_size: int = 1024,
lma_kv_chunk_size: int = 4096,
use_flash: bool = False,
flash_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
"""
Args:
q_x:
[*, Q, C_q] query data
kv_x:
[*, K, C_k] key data
biases:
List of biases that broadcast to [*, H, Q, K]
use_memory_efficient_kernel:
Whether to use a custom memory-efficient attention kernel. This should be the default choice for most.
If none of the "use_<...>" flags are True, a stock PyTorch implementation is used instead
use_lma:
Whether to use low-memory attention (Staats & Rabe 2021). If none of the "use_<...>" flags are True, a
stock PyTorch implementation is used instead
lma_q_chunk_size:
Query chunk size (for LMA)
lma_kv_chunk_size:
Key/Value chunk size (for LMA)
Returns
[*, Q, C_q] attention update
"""
if use_lma and (lma_q_chunk_size is None or lma_kv_chunk_size is None):
raise ValueError("If use_lma is specified, lma_q_chunk_size and lma_kv_chunk_size must be provided")
if use_flash and biases is not None:
raise ValueError("use_flash is incompatible with the bias option. For masking, use flash_mask instead")
attn_options = [use_memory_efficient_kernel, use_lma, use_flash]
if sum(attn_options) > 1:
raise ValueError("Choose at most one alternative attention algorithm")
if biases is None:
biases = []
# [*, H, Q/K, C_hidden]
query, key, value = self._prep_qkv(q_x, kv_x)
key = permute_final_dims(key, (1, 0))
# [*, H, Q, K]
output = torch.matmul(query, key)
for b in biases:
output += b
output = softmax_no_cast(output, -1)
# [*, H, Q, C_hidden]
output = torch.matmul(output, value)
output = output.transpose(-2, -3)
output = self._wrap_up(output, q_x)
return output
class EsmFoldTriangleAttention(nn.Module):
def __init__(self, c_in, c_hidden, no_heads, starting=True, inf=1e9):
"""
Args:
c_in:
Input channel dimension
c_hidden:
Overall hidden channel dimension (not per-head)
no_heads:
Number of attention heads
"""
super().__init__()
self.c_in = c_in
self.c_hidden = c_hidden
self.no_heads = no_heads
self.starting = starting
self.inf = inf
self.layer_norm = LayerNorm(self.c_in)
self.linear = EsmFoldLinear(c_in, self.no_heads, bias=False, init="normal")
self.mha = EsmFoldAttention(self.c_in, self.c_in, self.c_in, self.c_hidden, self.no_heads)
@torch.jit.ignore
def _chunk(
self,
x: torch.Tensor,
biases: List[torch.Tensor],
chunk_size: int,
use_memory_efficient_kernel: bool = False,
use_lma: bool = False,
inplace_safe: bool = False,
) -> torch.Tensor:
"triangle! triangle!"
mha_inputs = {
"q_x": x,
"kv_x": x,
"biases": biases,
}
return chunk_layer(
partial(self.mha, use_memory_efficient_kernel=use_memory_efficient_kernel, use_lma=use_lma),
mha_inputs,
chunk_size=chunk_size,
no_batch_dims=len(x.shape[:-2]),
_out=x if inplace_safe else None,
)
def forward(
self,
x: torch.Tensor,
mask: Optional[torch.Tensor] = None,
chunk_size: Optional[int] = None,
use_memory_efficient_kernel: bool = False,
use_lma: bool = False,
inplace_safe: bool = False,
) -> torch.Tensor:
"""
Args:
x:
[*, I, J, C_in] input tensor (e.g. the pair representation)
Returns:
[*, I, J, C_in] output tensor
"""
if mask is None:
# [*, I, J]
mask = x.new_ones(
x.shape[:-1],
)
if not self.starting:
x = x.transpose(-2, -3)
mask = mask.transpose(-1, -2)
# [*, I, J, C_in]
x = self.layer_norm(x)
# [*, I, 1, 1, J]
mask_bias = (self.inf * (mask - 1))[..., :, None, None, :]
# [*, H, I, J]
triangle_bias = permute_final_dims(self.linear(x), (2, 0, 1))
# [*, 1, H, I, J]
triangle_bias = triangle_bias.unsqueeze(-4)
biases = [mask_bias, triangle_bias]
if chunk_size is not None:
x = self._chunk(
x,
biases,
chunk_size,
use_memory_efficient_kernel=use_memory_efficient_kernel,
use_lma=use_lma,
inplace_safe=inplace_safe,
)
else:
x = self.mha(
q_x=x, kv_x=x, biases=biases, use_memory_efficient_kernel=use_memory_efficient_kernel, use_lma=use_lma
)
if not self.starting:
x = x.transpose(-2, -3)
return x
class EsmFoldTriangleMultiplicativeUpdate(nn.Module):
"""
Implements Algorithms 11 and 12.
"""
def __init__(self, config, _outgoing=True):
super().__init__()
c_hidden = config.pairwise_state_dim
self._outgoing = _outgoing
self.linear_a_p = EsmFoldLinear(c_hidden, c_hidden)
self.linear_a_g = EsmFoldLinear(c_hidden, c_hidden, init="gating")
self.linear_b_p = EsmFoldLinear(c_hidden, c_hidden)
self.linear_b_g = EsmFoldLinear(c_hidden, c_hidden, init="gating")
self.linear_g = EsmFoldLinear(c_hidden, c_hidden, init="gating")
self.linear_z = EsmFoldLinear(c_hidden, c_hidden, init="final")
self.layer_norm_in = LayerNorm(c_hidden)
self.layer_norm_out = LayerNorm(c_hidden)
self.sigmoid = nn.Sigmoid()
def _combine_projections(
self, a: torch.Tensor, b: torch.Tensor, _inplace_chunk_size: Optional[int] = None
) -> torch.Tensor:
if self._outgoing:
a = permute_final_dims(a, (2, 0, 1))
b = permute_final_dims(b, (2, 1, 0))
else:
a = permute_final_dims(a, (2, 1, 0))
b = permute_final_dims(b, (2, 0, 1))
if _inplace_chunk_size is not None:
# To be replaced by torch vmap
for i in range(0, a.shape[-3], _inplace_chunk_size):
a_chunk = a[..., i : i + _inplace_chunk_size, :, :]
b_chunk = b[..., i : i + _inplace_chunk_size, :, :]
a[..., i : i + _inplace_chunk_size, :, :] = torch.matmul(
a_chunk,
b_chunk,
)
p = a
else:
p = torch.matmul(a, b)
return permute_final_dims(p, (1, 2, 0))
def _inference_forward(
self,
z: torch.Tensor,
mask: Optional[torch.Tensor] = None,
inplace_chunk_size: Optional[int] = None,
with_add: bool = True,
):
"""
Args:
z:
A [*, N, N, C_z] pair representation
mask:
A [*, N, N] pair mask
inplace_chunk_size:
Size of chunks used in the main computation. Increase to trade memory for speed.
with_add:
If True, z is overwritten with (z + update). Otherwise, it is overwritten with (update).
Returns:
A reference to the overwritten z
More memory-efficient, inference-only version of the forward function. Uses in-place operations, fusion of the
addition that happens after this module in the Evoformer, a smidge of recomputation, and a cache of overwritten
values to lower peak memory consumption of this module from 5x the size of the input tensor z to 2.5x its size.
Useful for inference on extremely long sequences.
It works as follows. We will make reference to variables used in the default forward implementation below.
Naively, triangle multiplication attention requires the manifestation of 5 tensors the size of z: 1) z, the
"square" input tensor, 2) a, the first projection of z, 3) b, the second projection of b, 4) g, a z-sized mask,
and 5) a z-sized tensor for intermediate computations. For large N, this is prohibitively expensive; for
N=4000, for example, z is more than 8GB alone. To avoid this problem, we compute b, g, and all intermediate
tensors in small chunks, noting that the chunks required to compute a chunk of the output depend only on the
tensor a and corresponding vertical and horizontal chunks of z. This suggests an algorithm that loops over
pairs of chunks of z: hereafter "columns" and "rows" of z, even though each "column" and "row" in fact contains
inplace_chunk_size contiguous true columns and rows of z. Writing output chunks to a new tensor would bring
total memory consumption down to 3x the size of z. However, more memory can be saved by writing output chunks
directly to z in-place. WLOG, we choose to write output chunks vertically, overwriting the ith "column" of z at
the end of the ith iteration of the main loop. Despite this overwriting, the ith column is always one column
ahead of previously overwritten columns and can be recovered directly from z. After the first iteration,
however, the ith row of z is always at least partially overwritten. For this reason, we introduce the z-cache,
a tensor one-half the size of z. The z-cache initially contains the left half (2nd and 3rd quadrants) of z. For
0 < i < N/2, the missing left part of the ith row of z is recovered from this cache at the beginning of the ith
iteration. Once i exceeds n/2, the cache is "reoriented" to encompass the 3rd and 4th quadrants of z instead.
Though the 3rd quadrant of the original z is entirely overwritten at this point, it can be recovered from the
z-cache itself. Thereafter, the ith row of z can be recovered in its entirety from the reoriented z-cache.
After the final iteration, z has been completely overwritten and contains the triangular multiplicative update.
If with_add is True, it instead contains the sum of z and the triangular multiplicative update. In either case,
peak memory consumption is just 2.5x the size of z, disregarding memory used for chunks and other small
variables.
"""
if mask is None:
mask = z.new_ones(z.shape[:-1])
mask = mask.unsqueeze(-1)
def compute_projection_helper(pair, mask, a=True):
if a:
linear_g = self.linear_a_g
linear_p = self.linear_a_p
else:
linear_g = self.linear_b_g
linear_p = self.linear_b_p
pair = self.layer_norm_in(pair)
p = linear_g(pair)
p.sigmoid_()
p *= linear_p(pair)
p *= mask
p = permute_final_dims(p, (2, 0, 1))
return p
def compute_projection(pair, mask, a=True, chunked=True):
need_transpose = self._outgoing ^ a
if not chunked:
p = compute_projection_helper(pair, mask, a)
if need_transpose:
p = p.transpose(-1, -2)
else:
# This computation is chunked so as not to exceed our 2.5x
# budget with a large intermediate tensor
linear_g = self.linear_a_g if a else self.linear_b_g
c = linear_g.bias.shape[-1]
out_shape = pair.shape[:-3] + (c,) + pair.shape[-3:-1]
p = pair.new_zeros(out_shape)
for i in range(0, pair.shape[-3], inplace_chunk_size):
pair_chunk = pair[..., i : i + inplace_chunk_size, :, :]
pair_chunk = compute_projection_helper(
pair[..., i : i + inplace_chunk_size, :, :],
mask[..., i : i + inplace_chunk_size, :, :],
a,
)
if need_transpose:
pair_chunk = pair_chunk.transpose(-1, -2)
p[..., i : i + inplace_chunk_size] = pair_chunk
else:
p[..., i : i + inplace_chunk_size, :] = pair_chunk
del pair_chunk
return p
# We start by fully manifesting a. In addition to the input, this
# brings total memory consumption to 2x z (disregarding size of chunks)
# [*, N, N, c]
a = compute_projection(z, mask, True, chunked=True)
if inplace_chunk_size is not None:
n = a.shape[-1]
half_n = n // 2 + n % 2
row_dim = -3
col_dim = -2
b_chunk_dim = row_dim if self._outgoing else col_dim
def empty_slicer(t):
return [slice(None) for _ in t.shape]
def slice_tensor(t, start, end, dim):
# Slices start:end from the dim dimension of t
s = empty_slicer(t)
s[dim] = slice(start, end)
return t[s]
def flip_z_cache_(z_cache, z):
# "Reorient" the z_cache (see below), filling it with quadrants
# 3---recovered from the z_cache---and 4---recovered from z---
# of the input tensor z.
quadrant_3 = slice_tensor(z_cache, half_n, None, row_dim)
z_cache = z_cache.transpose(row_dim, col_dim)
# If n is odd, we need to shrink the z_cache by one row
z_cache = z_cache[..., : (n // 2), :, :]
# Move the 3rd quadrant of z into the
first_half_slicer = empty_slicer(z_cache)
first_half_slicer[col_dim] = slice(0, half_n)
z_cache[first_half_slicer] = quadrant_3
# Get the fourth quadrant of z
quadrant_4 = slice_tensor(z, half_n, None, row_dim)
quadrant_4 = slice_tensor(quadrant_4, half_n, None, col_dim)
# Insert said quadrant into the rotated z-cache
quadrant_3_slicer = empty_slicer(z_cache)
quadrant_3_slicer[col_dim] = slice(half_n, None)
z_cache[quadrant_3_slicer] = quadrant_4
return z_cache
# Initialize the z cache to the left half of z.
z_cache_shape = list(z.shape)
z_cache_shape[col_dim] = half_n
z_cache = z.new_zeros(z_cache_shape)
z_cache_slicer = empty_slicer(z_cache)
z_cache_slicer[col_dim] = slice(0, half_n)
z_cache.copy_(z[z_cache_slicer])
z_cache_rotated = False
# We need to reorient the z-cache at the halfway point, and we
# don't want a single chunk to straddle that point. We contract one
# of the chunks in the middle to address that problem.
i_range = list(range(0, half_n, inplace_chunk_size))
initial_offsets = [i_2 - i_1 for i_1, i_2 in zip(i_range, i_range[1:] + [half_n])]
after_half = list(range(half_n, n, inplace_chunk_size))
after_half_offsets = [inplace_chunk_size for _ in after_half]
combined_range_with_offsets = zip(i_range + after_half, initial_offsets + after_half_offsets)
for i, offset in combined_range_with_offsets:
if not z_cache_rotated and i >= half_n:
z_cache = flip_z_cache_(z_cache, z)
z_cache_rotated = True
z_chunk_b = slice_tensor(z, i, i + offset, b_chunk_dim)
mask_chunk = slice_tensor(mask, i, i + offset, b_chunk_dim)
z_chunk_b = z_chunk_b.clone()
if b_chunk_dim == col_dim:
z_chunk_b = slice_tensor(z, i, i + offset, col_dim)
else: # b_chunk_dim == row_dim
# In this case, the b-dimension (b_chunk_dim) is partially
# overwritten at the end of each iteration. We need to
# restore the missing component from the z-cache.
if not z_cache_rotated:
z_chunk_slicer = empty_slicer(z_chunk_b)
z_chunk_slicer[col_dim] = slice(0, half_n)
z_chunk_b[z_chunk_slicer] = slice_tensor(z_cache, i, i + offset, row_dim)
else:
z_cache_offset = i - half_n
z_chunk_b = slice_tensor(z_cache, z_cache_offset, z_cache_offset + offset, row_dim)
b_chunk = compute_projection(z_chunk_b, mask_chunk, a=False, chunked=False)
del z_chunk_b
x_chunk = torch.matmul(a, b_chunk)
x_chunk = permute_final_dims(x_chunk, (1, 2, 0))
x_chunk = self.layer_norm_out(x_chunk)
x_chunk = self.linear_z(x_chunk)
# The g dimension (col_dim) is parallel to and ahead of the
# overwrites in z. We can extract the g chunk normally.
z_chunk_g = slice_tensor(z, i, i + offset, col_dim)
g_chunk = self.linear_g(self.layer_norm_in(z_chunk_g))
g_chunk.sigmoid_()
del z_chunk_g
x_chunk *= g_chunk
# Write the columns into z in-place
z_slicer = empty_slicer(z)
z_slicer[col_dim] = slice(i, i + offset)
if with_add:
z[z_slicer] += x_chunk
else:
z[z_slicer] = x_chunk
else:
b = compute_projection(z, mask, False, False)
x = torch.matmul(a, b)
x = self.layer_norm_out(x)
x = self.linear_z(x)
g = self.linear_g(z)
g.sigmoid_()
x *= g
if with_add:
z += x
else:
z = x
return z
def forward(
self,
z: torch.Tensor,
mask: Optional[torch.Tensor] = None,
inplace_safe: bool = False,
_add_with_inplace: bool = False,
_inplace_chunk_size: Optional[int] = 256,
) -> torch.Tensor:
"""
Args:
x:
[*, N_res, N_res, C_z] input tensor
mask:
[*, N_res, N_res] input mask
Returns:
[*, N_res, N_res, C_z] output tensor
"""
if inplace_safe:
x = self._inference_forward(
z,
mask,
inplace_chunk_size=_inplace_chunk_size,
with_add=_add_with_inplace,
)
return x
if mask is None:
mask = z.new_ones(z.shape[:-1])
mask = mask.unsqueeze(-1)
z = self.layer_norm_in(z)
a = mask
a = a * self.sigmoid(self.linear_a_g(z))
a = a * self.linear_a_p(z)
b = mask
b = b * self.sigmoid(self.linear_b_g(z))
b = b * self.linear_b_p(z)
if is_fp16_enabled():
with torch.cuda.amp.autocast(enabled=False):
x = self._combine_projections(a.float(), b.float())
else:
x = self._combine_projections(a, b)
del a, b
x = self.layer_norm_out(x)
x = self.linear_z(x)
g = self.sigmoid(self.linear_g(z))
x = x * g
return x
class EsmFoldPreTrainedModel(EsmPreTrainedModel):
"""
An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
models.
"""
# Subclass `EsMPreTrainedModel` to deal with special init
def _init_weights(self, module):
"""Initialize the weights"""
if isinstance(module, EsmFoldLinear):
with torch.no_grad():
if module.init_fn is not None:
module.init_fn(module.weight, module.bias)
elif module.init == "default":
trunc_normal_init_(module.weight, scale=1.0)
elif module.init == "relu":
trunc_normal_init_(module.weight, scale=2.0)
elif module.init == "glorot":
nn.init.xavier_uniform_(module.weight, gain=1)
elif module.init == "gating":
module.weight.fill_(0.0)
if module.bias:
module.bias.fill_(1.0)
elif module.init == "normal":
torch.nn.init.kaiming_normal_(module.weight, nonlinearity="linear")
elif module.init == "final":
module.weight.fill_(0.0)
elif isinstance(module, EsmFoldInvariantPointAttention):
ipa_point_weights_init_(module.head_weights)
elif isinstance(module, EsmFoldTriangularSelfAttentionBlock):
torch.nn.init.zeros_(module.tri_mul_in.linear_z.weight)
torch.nn.init.zeros_(module.tri_mul_in.linear_z.bias)
torch.nn.init.zeros_(module.tri_mul_out.linear_z.weight)
torch.nn.init.zeros_(module.tri_mul_out.linear_z.bias)
torch.nn.init.zeros_(module.tri_att_start.mha.linear_o.weight)
torch.nn.init.zeros_(module.tri_att_start.mha.linear_o.bias)
torch.nn.init.zeros_(module.tri_att_end.mha.linear_o.weight)
torch.nn.init.zeros_(module.tri_att_end.mha.linear_o.bias)
torch.nn.init.zeros_(module.sequence_to_pair.o_proj.weight)
torch.nn.init.zeros_(module.sequence_to_pair.o_proj.bias)
torch.nn.init.zeros_(module.pair_to_sequence.linear.weight)
torch.nn.init.zeros_(module.seq_attention.o_proj.weight)
torch.nn.init.zeros_(module.seq_attention.o_proj.bias)
torch.nn.init.zeros_(module.mlp_seq.mlp[-2].weight)
torch.nn.init.zeros_(module.mlp_seq.mlp[-2].bias)
torch.nn.init.zeros_(module.mlp_pair.mlp[-2].weight)
torch.nn.init.zeros_(module.mlp_pair.mlp[-2].bias)
else:
super()._init_weights(module)
class EsmFoldSelfAttention(nn.Module):
def __init__(self, embed_dim, num_heads, head_width, gated=False):
super().__init__()
assert embed_dim == num_heads * head_width
self.embed_dim = embed_dim
self.num_heads = num_heads
self.head_width = head_width
self.proj = nn.Linear(embed_dim, embed_dim * 3, bias=False)
self.o_proj = nn.Linear(embed_dim, embed_dim, bias=True)
self.gated = gated
if gated:
self.g_proj = nn.Linear(embed_dim, embed_dim)
torch.nn.init.zeros_(self.g_proj.weight)
torch.nn.init.ones_(self.g_proj.bias)
self.rescale_factor = self.head_width**-0.5
torch.nn.init.zeros_(self.o_proj.bias)
def forward(self, x, mask=None, bias=None, indices=None):
"""
Basic self attention with optional mask and external pairwise bias. To handle sequences of different lengths,
use mask.
Inputs:
x: batch of input sequneces (.. x L x C) mask: batch of boolean masks where 1=valid, 0=padding position (..
x L_k) bias: batch of scalar pairwise attention biases (.. x Lq x Lk x num_heads)
Outputs:
sequence projection (B x L x embed_dim), attention maps (B x L x L x num_heads)
"""
t = self.proj(x).view(*x.shape[:2], self.num_heads, -1)
t = t.permute(0, 2, 1, 3)
q, k, v = t.chunk(3, dim=-1)
q = self.rescale_factor * q
a = torch.einsum("...qc,...kc->...qk", q, k)
# Add external attention bias.
if bias is not None:
a = a + bias.permute(0, 3, 1, 2)
# Do not attend to padding tokens.
if mask is not None:
mask = mask[:, None, None]
a = a.masked_fill(mask == False, -np.inf) # noqa: E712
a = nn.functional.softmax(a, dim=-1)
y = torch.einsum("...hqk,...hkc->...qhc", a, v)
y = y.reshape(*y.shape[:2], -1)
if self.gated:
y = self.g_proj(x).sigmoid() * y
y = self.o_proj(y)
return y, a.permute(0, 3, 1, 2)
class EsmFoldDropout(nn.Module):
"""
Implementation of dropout with the ability to share the dropout mask along a particular dimension.
"""
def __init__(self, r: float, batch_dim: Union[int, List[int]]):
super().__init__()
self.r = r
if isinstance(batch_dim, int):
batch_dim = [batch_dim]
self.batch_dim = batch_dim
self.dropout = nn.Dropout(self.r)
def forward(self, x: torch.Tensor) -> torch.Tensor:
shape = list(x.shape)
if self.batch_dim is not None:
for bd in self.batch_dim:
shape[bd] = 1
return x * self.dropout(x.new_ones(shape))
class EsmFoldSequenceToPair(nn.Module):
def __init__(self, sequence_state_dim, inner_dim, pairwise_state_dim):
super().__init__()
self.layernorm = nn.LayerNorm(sequence_state_dim)
self.proj = nn.Linear(sequence_state_dim, inner_dim * 2, bias=True)
self.o_proj = nn.Linear(2 * inner_dim, pairwise_state_dim, bias=True)
torch.nn.init.zeros_(self.proj.bias)
torch.nn.init.zeros_(self.o_proj.bias)
def forward(self, sequence_state):
"""
Inputs:
sequence_state: B x L x sequence_state_dim
Output:
pairwise_state: B x L x L x pairwise_state_dim
Intermediate state:
B x L x L x 2*inner_dim
"""
assert len(sequence_state.shape) == 3
s = self.layernorm(sequence_state)
s = self.proj(s)
q, k = s.chunk(2, dim=-1)
prod = q[:, None, :, :] * k[:, :, None, :]
diff = q[:, None, :, :] - k[:, :, None, :]
x = torch.cat([prod, diff], dim=-1)
x = self.o_proj(x)
return x
class EsmFoldPairToSequence(nn.Module):
def __init__(self, pairwise_state_dim, num_heads):
super().__init__()
self.layernorm = nn.LayerNorm(pairwise_state_dim)
self.linear = nn.Linear(pairwise_state_dim, num_heads, bias=False)
def forward(self, pairwise_state):
"""
Inputs:
pairwise_state: B x L x L x pairwise_state_dim
Output:
pairwise_bias: B x L x L x num_heads
"""
assert len(pairwise_state.shape) == 4
z = self.layernorm(pairwise_state)
pairwise_bias = self.linear(z)
return pairwise_bias
class EsmFoldResidueMLP(nn.Module):
def __init__(self, embed_dim, inner_dim, dropout=0):
super().__init__()
self.mlp = nn.Sequential(
nn.LayerNorm(embed_dim),
nn.Linear(embed_dim, inner_dim),
nn.ReLU(),
nn.Linear(inner_dim, embed_dim),
nn.Dropout(dropout),
)
def forward(self, x):
return x + self.mlp(x)
class EsmFoldTriangularSelfAttentionBlock(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
sequence_state_dim = config.sequence_state_dim
pairwise_state_dim = config.pairwise_state_dim
sequence_num_heads = sequence_state_dim // config.sequence_head_width
pairwise_num_heads = pairwise_state_dim // config.pairwise_head_width
self.layernorm_1 = nn.LayerNorm(sequence_state_dim)
self.sequence_to_pair = EsmFoldSequenceToPair(sequence_state_dim, pairwise_state_dim // 2, pairwise_state_dim)
self.pair_to_sequence = EsmFoldPairToSequence(pairwise_state_dim, sequence_num_heads)
self.seq_attention = EsmFoldSelfAttention(
sequence_state_dim, sequence_num_heads, config.sequence_head_width, gated=True
)
self.tri_mul_out = EsmFoldTriangleMultiplicativeUpdate(config, _outgoing=True)
self.tri_mul_in = EsmFoldTriangleMultiplicativeUpdate(config, _outgoing=False)
self.tri_att_start = EsmFoldTriangleAttention(
pairwise_state_dim, config.pairwise_head_width, pairwise_num_heads, inf=1e9, starting=True
)
self.tri_att_end = EsmFoldTriangleAttention(
pairwise_state_dim, config.pairwise_head_width, pairwise_num_heads, inf=1e9, starting=False
)
self.mlp_seq = EsmFoldResidueMLP(sequence_state_dim, 4 * sequence_state_dim, dropout=config.dropout)
self.mlp_pair = EsmFoldResidueMLP(pairwise_state_dim, 4 * pairwise_state_dim, dropout=config.dropout)
self.drop = nn.Dropout(config.dropout)
self.row_drop = EsmFoldDropout(config.dropout * 2, 2)
self.col_drop = EsmFoldDropout(config.dropout * 2, 1)
def forward(self, sequence_state, pairwise_state, mask=None, chunk_size=None, **__kwargs):
"""
Inputs:
sequence_state: B x L x sequence_state_dim pairwise_state: B x L x L x pairwise_state_dim mask: B x L boolean
tensor of valid positions
Output:
sequence_state: B x L x sequence_state_dim pairwise_state: B x L x L x pairwise_state_dim
"""
if len(sequence_state.shape) != 3:
raise ValueError(f"`sequence_state` should be a 3d-tensor, got {len(sequence_state.shape)} dims.")
if len(pairwise_state.shape) != 4:
raise ValueError(f"`pairwise_state` should be a 4d-tensor, got {len(pairwise_state.shape)} dims.")
if mask is not None and len(mask.shape) != 2:
raise ValueError(f"`mask` should be a 2d-tensor, got {len(mask.shape)} dims.")
batch_dim, seq_dim, sequence_state_dim = sequence_state.shape
pairwise_state_dim = pairwise_state.shape[3]
if sequence_state_dim != self.config.sequence_state_dim:
raise ValueError(
"`sequence_state` last dimension should be equal to `self.sequence_state_dim`. Got "
f"{sequence_state_dim} != {self.config.sequence_state_dim}."
)
if pairwise_state_dim != self.config.pairwise_state_dim:
raise ValueError(
"`pairwise_state` last dimension should be equal to `self.pairwise_state_dim`. Got "
f"{pairwise_state_dim} != {self.config.pairwise_state_dim}."
)
if batch_dim != pairwise_state.shape[0]:
raise ValueError(
f"`sequence_state` and `pairwise_state` have inconsistent batch size: {batch_dim} != "
f"{pairwise_state.shape[0]}."
)
if seq_dim != pairwise_state.shape[1] or seq_dim != pairwise_state.shape[2]:
raise ValueError(
f"`sequence_state` and `pairwise_state` have inconsistent sequence length: {seq_dim} != "
f"{pairwise_state.shape[1]} or {pairwise_state.shape[2]}."
)
# Update sequence state
bias = self.pair_to_sequence(pairwise_state)
# Self attention with bias + mlp.
y = self.layernorm_1(sequence_state)
y, _ = self.seq_attention(y, mask=mask, bias=bias)
sequence_state = sequence_state + self.drop(y)
sequence_state = self.mlp_seq(sequence_state)
# Update pairwise state
pairwise_state = pairwise_state + self.sequence_to_pair(sequence_state)
# Axial attention with triangular bias.
tri_mask = mask.unsqueeze(2) * mask.unsqueeze(1) if mask is not None else None
pairwise_state = pairwise_state + self.row_drop(self.tri_mul_out(pairwise_state, mask=tri_mask))
pairwise_state = pairwise_state + self.col_drop(self.tri_mul_in(pairwise_state, mask=tri_mask))
pairwise_state = pairwise_state + self.row_drop(
self.tri_att_start(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
)
pairwise_state = pairwise_state + self.col_drop(
self.tri_att_end(pairwise_state, mask=tri_mask, chunk_size=chunk_size)
)
# MLP over pairs.
pairwise_state = self.mlp_pair(pairwise_state)
return sequence_state, pairwise_state
class EsmCategoricalMixture:
def __init__(self, param, bins=50, start=0, end=1):
# All tensors are of shape ..., bins.
self.logits = param
bins = torch.linspace(start, end, bins + 1, device=self.logits.device, dtype=self.logits.dtype)
self.v_bins = (bins[:-1] + bins[1:]) / 2
def log_prob(self, true):
# Shapes are:
# self.probs: ... x bins
# true : ...
true_index = (true.unsqueeze(-1) - self.v_bins[[None] * true.ndim]).abs().argmin(-1)
nll = self.logits.log_softmax(-1)
return torch.take_along_dim(nll, true_index.unsqueeze(-1), dim=-1).squeeze(-1)
def mean(self):
return (self.logits.softmax(-1) @ self.v_bins.unsqueeze(1)).squeeze(-1)
def categorical_lddt(logits, bins=50):
# Logits are ..., 37, bins.
return EsmCategoricalMixture(logits, bins=bins).mean()
def get_axial_mask(mask):
"""
Helper to convert B x L mask of valid positions to axial mask used in row column attentions.
Input:
mask: B x L tensor of booleans
Output:
mask: B x L x L tensor of booleans
"""
if mask is None:
return None
if len(mask.shape) != 2:
raise ValueError(f"`mask` should be a 2d-tensor, got {len(mask.shape)} dims.")
batch_dim, seq_dim = mask.shape
m = mask.unsqueeze(1).expand(batch_dim, seq_dim, seq_dim)
m = m.reshape(batch_dim * seq_dim, seq_dim)
return m
class EsmFoldRelativePosition(nn.Module):
def __init__(self, config):
super().__init__()
self.bins = config.position_bins
# Note an additional offset is used so that the 0th position
# is reserved for masked pairs.
self.embedding = torch.nn.Embedding(2 * self.bins + 2, config.pairwise_state_dim)
def forward(self, residue_index, mask=None):
"""
Input:
residue_index: B x L tensor of indices (dytpe=torch.long) mask: B x L tensor of booleans
Output:
pairwise_state: B x L x L x pairwise_state_dim tensor of embeddings
"""
if residue_index.dtype != torch.long:
raise ValueError(f"`residue_index` has dtype {residue_index.dtype}, it should be `torch.long`.")
if mask is not None and residue_index.shape != mask.shape:
raise ValueError(
f"`residue_index` and `mask` have inconsistent shapes: {residue_index.shape} != {mask.shape}."
)
diff = residue_index[:, None, :] - residue_index[:, :, None]
diff = diff.clamp(-self.bins, self.bins)
diff = diff + self.bins + 1 # Add 1 to adjust for padding index.
if mask is not None:
mask = mask[:, None, :] * mask[:, :, None]
diff[mask == False] = 0 # noqa: E712
output = self.embedding(diff)
return output
class EsmFoldAngleResnetBlock(nn.Module):
def __init__(self, config):
super().__init__()
self.linear_1 = EsmFoldLinear(config.resnet_dim, config.resnet_dim, init="relu")
self.linear_2 = EsmFoldLinear(config.resnet_dim, config.resnet_dim, init="final")
self.relu = nn.ReLU()
def forward(self, a: torch.Tensor) -> torch.Tensor:
s_initial = a
a = self.relu(a)
a = self.linear_1(a)
a = self.relu(a)
a = self.linear_2(a)
return a + s_initial
class EsmFoldAngleResnet(nn.Module):
"""
Implements Algorithm 20, lines 11-14
"""
def __init__(self, config):
super().__init__()
self.config = config
self.linear_in = EsmFoldLinear(config.sequence_dim, config.resnet_dim)
self.linear_initial = EsmFoldLinear(config.sequence_dim, config.resnet_dim)
self.layers = nn.ModuleList()
for _ in range(config.num_resnet_blocks):
layer = EsmFoldAngleResnetBlock(config)
self.layers.append(layer)
self.linear_out = EsmFoldLinear(config.resnet_dim, config.num_angles * 2)
self.relu = nn.ReLU()
def forward(self, s: torch.Tensor, s_initial: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
s:
[*, C_hidden] single embedding
s_initial:
[*, C_hidden] single embedding as of the start of the StructureModule
Returns:
[*, no_angles, 2] predicted angles
"""
# NOTE: The ReLU's applied to the inputs are absent from the supplement
# pseudocode but present in the source. For maximal compatibility with
# the pretrained weights, I'm going with the source.
# [*, C_hidden]
s_initial = self.relu(s_initial)
s_initial = self.linear_initial(s_initial)
s = self.relu(s)
s = self.linear_in(s)
s = s + s_initial
for l in self.layers:
s = l(s)
s = self.relu(s)
# [*, no_angles * 2]
s = self.linear_out(s)
# [*, no_angles, 2]
s = s.view(s.shape[:-1] + (-1, 2))
unnormalized_s = s
norm_denom = torch.sqrt(
torch.clamp(
torch.sum(s**2, dim=-1, keepdim=True),
min=self.config.epsilon,
)
)
s = s / norm_denom
return unnormalized_s, s
class EsmFoldInvariantPointAttention(nn.Module):
"""
Implements Algorithm 22.
"""
def __init__(self, config):
super().__init__()
self.config = config
c_s = config.sequence_dim
c_z = config.pairwise_dim
self.hidden_dim = config.ipa_dim
self.num_heads = config.num_heads_ipa
self.num_qk_points = config.num_qk_points
self.num_v_points = config.num_v_points
# These linear layers differ from their specifications in the
# supplement. There, they lack bias and use Glorot initialization.
# Here as in the official source, they have bias and use the default
# Lecun initialization.
hc = config.ipa_dim * config.num_heads_ipa
self.linear_q = EsmFoldLinear(c_s, hc)
self.linear_kv = EsmFoldLinear(c_s, 2 * hc)
hpq = config.num_heads_ipa * config.num_qk_points * 3
self.linear_q_points = EsmFoldLinear(c_s, hpq)
hpkv = config.num_heads_ipa * (config.num_qk_points + config.num_v_points) * 3
self.linear_kv_points = EsmFoldLinear(c_s, hpkv)
self.linear_b = EsmFoldLinear(c_z, config.num_heads_ipa)
self.head_weights = nn.Parameter(torch.zeros((config.num_heads_ipa)))
concat_out_dim = config.num_heads_ipa * (c_z + config.ipa_dim + config.num_v_points * 4)
self.linear_out = EsmFoldLinear(concat_out_dim, c_s, init="final")
self.softmax = nn.Softmax(dim=-1)
self.softplus = nn.Softplus()
def forward(
self,
s: torch.Tensor,
z: Optional[torch.Tensor],
r: Rigid,
mask: torch.Tensor,
_offload_inference: bool = False,
_z_reference_list: Optional[Sequence[torch.Tensor]] = None,
) -> torch.Tensor:
"""
Args:
s:
[*, N_res, C_s] single representation
z:
[*, N_res, N_res, C_z] pair representation
r:
[*, N_res] transformation object
mask:
[*, N_res] mask
Returns:
[*, N_res, C_s] single representation update
"""
z = [z]
#######################################
# Generate scalar and point activations
#######################################
# [*, N_res, H * C_hidden]
q = self.linear_q(s)
kv = self.linear_kv(s)
# [*, N_res, H, C_hidden]
q = q.view(q.shape[:-1] + (self.num_heads, -1))
# [*, N_res, H, 2 * C_hidden]
kv = kv.view(kv.shape[:-1] + (self.num_heads, -1))
# [*, N_res, H, C_hidden]
k, v = torch.split(kv, self.hidden_dim, dim=-1)
# [*, N_res, H * P_q * 3]
q_pts = self.linear_q_points(s)
# This is kind of clunky, but it's how the original does it
# [*, N_res, H * P_q, 3]
q_pts = torch.split(q_pts, q_pts.shape[-1] // 3, dim=-1)
q_pts = torch.stack(q_pts, dim=-1)
q_pts = r[..., None].apply(q_pts)
# [*, N_res, H, P_q, 3]
q_pts = q_pts.view(q_pts.shape[:-2] + (self.num_heads, self.num_qk_points, 3))
# [*, N_res, H * (P_q + P_v) * 3]
kv_pts = self.linear_kv_points(s)
# [*, N_res, H * (P_q + P_v), 3]
kv_pts = torch.split(kv_pts, kv_pts.shape[-1] // 3, dim=-1)
kv_pts = torch.stack(kv_pts, dim=-1)
kv_pts = r[..., None].apply(kv_pts)
# [*, N_res, H, (P_q + P_v), 3]
kv_pts = kv_pts.view(kv_pts.shape[:-2] + (self.num_heads, -1, 3))
# [*, N_res, H, P_q/P_v, 3]
k_pts, v_pts = torch.split(kv_pts, [self.num_qk_points, self.num_v_points], dim=-2)
##########################
# Compute attention scores
##########################
# [*, N_res, N_res, H]
b = self.linear_b(z[0])
if _offload_inference:
assert sys.getrefcount(z[0]) == 2
z[0] = z[0].cpu()
# [*, H, N_res, N_res]
if is_fp16_enabled():
with torch.cuda.amp.autocast(enabled=False):
a = torch.matmul(
permute_final_dims(q.float(), (1, 0, 2)), # [*, H, N_res, C_hidden]
permute_final_dims(k.float(), (1, 2, 0)), # [*, H, C_hidden, N_res]
)
else:
a = torch.matmul(
permute_final_dims(q, (1, 0, 2)), # [*, H, N_res, C_hidden]
permute_final_dims(k, (1, 2, 0)), # [*, H, C_hidden, N_res]
)
a *= math.sqrt(1.0 / (3 * self.hidden_dim))
a += math.sqrt(1.0 / 3) * permute_final_dims(b, (2, 0, 1))
# [*, N_res, N_res, H, P_q, 3]
pt_att = q_pts.unsqueeze(-4) - k_pts.unsqueeze(-5)
pt_att = pt_att**2
# [*, N_res, N_res, H, P_q]
pt_att = sum(torch.unbind(pt_att, dim=-1))
head_weights = self.softplus(self.head_weights).view(*((1,) * len(pt_att.shape[:-2]) + (-1, 1)))
head_weights = head_weights * math.sqrt(1.0 / (3 * (self.num_qk_points * 9.0 / 2)))
pt_att = pt_att * head_weights
# [*, N_res, N_res, H]
pt_att = torch.sum(pt_att, dim=-1) * (-0.5)
# [*, N_res, N_res]
square_mask = mask.unsqueeze(-1) * mask.unsqueeze(-2)
square_mask = self.config.inf * (square_mask - 1)
# [*, H, N_res, N_res]
pt_att = permute_final_dims(pt_att, (2, 0, 1))
a = a + pt_att
a = a + square_mask.unsqueeze(-3)
a = self.softmax(a)
################
# Compute output
################
# [*, N_res, H, C_hidden]
o = torch.matmul(a, v.transpose(-2, -3).to(dtype=a.dtype)).transpose(-2, -3)
# [*, N_res, H * C_hidden]
o = flatten_final_dims(o, 2)
# [*, H, 3, N_res, P_v]
o_pt = torch.sum(
(a[..., None, :, :, None] * permute_final_dims(v_pts, (1, 3, 0, 2))[..., None, :, :]),
dim=-2,
)
# [*, N_res, H, P_v, 3]
o_pt = permute_final_dims(o_pt, (2, 0, 3, 1))
o_pt = r[..., None, None].invert_apply(o_pt)
# [*, N_res, H * P_v]
o_pt_norm = flatten_final_dims(torch.sqrt(torch.sum(o_pt**2, dim=-1) + self.config.epsilon), 2)
# [*, N_res, H * P_v, 3]
o_pt = o_pt.reshape(*o_pt.shape[:-3], -1, 3)
if _offload_inference:
z[0] = z[0].to(o_pt.device)
# [*, N_res, H, C_z]
o_pair = torch.matmul(a.transpose(-2, -3), z[0].to(dtype=a.dtype))
# [*, N_res, H * C_z]
o_pair = flatten_final_dims(o_pair, 2)
# [*, N_res, C_s]
s = self.linear_out(
torch.cat((o, *torch.unbind(o_pt, dim=-1), o_pt_norm, o_pair), dim=-1).to(dtype=z[0].dtype)
)
return s
class EsmFoldBackboneUpdate(nn.Module):
"""
Implements part of Algorithm 23.
"""
def __init__(self, config):
super().__init__()
self.linear = EsmFoldLinear(config.sequence_dim, 6, init="final")
def forward(self, s: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
"""
Args:
[*, N_res, C_s] single representation
Returns:
[*, N_res, 6] update vector
"""
# [*, 6]
update = self.linear(s)
return update
class EsmFoldStructureModuleTransitionLayer(nn.Module):
def __init__(self, config):
super().__init__()
self.linear_1 = EsmFoldLinear(config.sequence_dim, config.sequence_dim, init="relu")
self.linear_2 = EsmFoldLinear(config.sequence_dim, config.sequence_dim, init="relu")
self.linear_3 = EsmFoldLinear(config.sequence_dim, config.sequence_dim, init="final")
self.relu = nn.ReLU()
def forward(self, s):
s_initial = s
s = self.linear_1(s)
s = self.relu(s)
s = self.linear_2(s)
s = self.relu(s)
s = self.linear_3(s)
s = s + s_initial
return s
class EsmFoldStructureModuleTransition(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
self.layers = nn.ModuleList()
for _ in range(config.num_transition_layers):
l = EsmFoldStructureModuleTransitionLayer(config)
self.layers.append(l)
self.dropout = nn.Dropout(config.dropout_rate)
self.layer_norm = LayerNorm(config.sequence_dim)
def forward(self, s):
for l in self.layers:
s = l(s)
s = self.dropout(s)
s = self.layer_norm(s)
return s
class EsmFoldStructureModule(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
# Buffers to be lazily initialized later
# self.default_frames
# self.group_idx
# self.atom_mask
# self.lit_positions
self.layer_norm_s = LayerNorm(config.sequence_dim)
self.layer_norm_z = LayerNorm(config.pairwise_dim)
self.linear_in = EsmFoldLinear(config.sequence_dim, config.sequence_dim)
self.ipa = EsmFoldInvariantPointAttention(config)
self.ipa_dropout = nn.Dropout(config.dropout_rate)
self.layer_norm_ipa = LayerNorm(config.sequence_dim)
self.transition = EsmFoldStructureModuleTransition(config)
self.bb_update = EsmFoldBackboneUpdate(config)
self.angle_resnet = EsmFoldAngleResnet(config)
def forward(
self,
evoformer_output_dict,
aatype,
mask=None,
_offload_inference=False,
):
"""
Args:
evoformer_output_dict:
Dictionary containing:
"single":
[*, N_res, C_s] single representation
"pair":
[*, N_res, N_res, C_z] pair representation
aatype:
[*, N_res] amino acid indices
mask:
Optional [*, N_res] sequence mask
Returns:
A dictionary of outputs
"""
s = evoformer_output_dict["single"]
if mask is None:
# [*, N]
mask = s.new_ones(s.shape[:-1])
# [*, N, C_s]
s = self.layer_norm_s(s)
# [*, N, N, C_z]
z = self.layer_norm_z(evoformer_output_dict["pair"])
z_reference_list = None
if _offload_inference:
assert sys.getrefcount(evoformer_output_dict["pair"]) == 2
evoformer_output_dict["pair"] = evoformer_output_dict["pair"].cpu()
z_reference_list = [z]
z = None
# [*, N, C_s]
s_initial = s
s = self.linear_in(s)
# [*, N]
rigids = Rigid.identity(
s.shape[:-1],
s.dtype,
s.device,
self.training,
fmt="quat",
)
outputs = []
for i in range(self.config.num_blocks):
# [*, N, C_s]
s = s + self.ipa(
s,
z,
rigids,
mask,
_offload_inference=_offload_inference,
_z_reference_list=z_reference_list,
)
s = self.ipa_dropout(s)
s = self.layer_norm_ipa(s)
s = self.transition(s)
# [*, N]
rigids = rigids.compose_q_update_vec(self.bb_update(s))
# To hew as closely as possible to AlphaFold, we convert our
# quaternion-based transformations to rotation-matrix ones
# here
backb_to_global = Rigid(
Rotation(rot_mats=rigids.get_rots().get_rot_mats(), quats=None),
rigids.get_trans(),
)
backb_to_global = backb_to_global.scale_translation(self.config.trans_scale_factor)
# [*, N, 7, 2]
unnormalized_angles, angles = self.angle_resnet(s, s_initial)
all_frames_to_global = self.torsion_angles_to_frames(backb_to_global, angles, aatype)
pred_xyz = self.frames_and_literature_positions_to_atom14_pos(all_frames_to_global, aatype)
scaled_rigids = rigids.scale_translation(self.config.trans_scale_factor)
preds = {
"frames": scaled_rigids.to_tensor_7(),
"sidechain_frames": all_frames_to_global.to_tensor_4x4(),
"unnormalized_angles": unnormalized_angles,
"angles": angles,
"positions": pred_xyz,
"states": s,
}
outputs.append(preds)
rigids = rigids.stop_rot_gradient()
del z, z_reference_list
if _offload_inference:
evoformer_output_dict["pair"] = evoformer_output_dict["pair"].to(s.device)
outputs = dict_multimap(torch.stack, outputs)
outputs["single"] = s
return outputs
def _init_residue_constants(self, float_dtype, device):
if not hasattr(self, "default_frames"):
self.register_buffer(
"default_frames",
torch.tensor(
residue_constants.restype_rigid_group_default_frame,
dtype=float_dtype,
device=device,
requires_grad=False,
),
persistent=False,
)
if not hasattr(self, "group_idx"):
self.register_buffer(
"group_idx",
torch.tensor(
residue_constants.restype_atom14_to_rigid_group,
device=device,
requires_grad=False,
),
persistent=False,
)
if not hasattr(self, "atom_mask"):
self.register_buffer(
"atom_mask",
torch.tensor(
residue_constants.restype_atom14_mask,
dtype=float_dtype,
device=device,
requires_grad=False,
),
persistent=False,
)
if not hasattr(self, "lit_positions"):
self.register_buffer(
"lit_positions",
torch.tensor(
residue_constants.restype_atom14_rigid_group_positions,
dtype=float_dtype,
device=device,
requires_grad=False,
),
persistent=False,
)
def torsion_angles_to_frames(self, r, alpha, f):
# Lazily initialize the residue constants on the correct device
self._init_residue_constants(alpha.dtype, alpha.device)
# Separated purely to make testing less annoying
return torsion_angles_to_frames(r, alpha, f, self.default_frames)
def frames_and_literature_positions_to_atom14_pos(self, r, f): # [*, N, 8] # [*, N]
# Lazily initialize the residue constants on the correct device
self._init_residue_constants(r.get_rots().dtype, r.get_rots().device)
return frames_and_literature_positions_to_atom14_pos(
r,
f,
self.default_frames,
self.group_idx,
self.atom_mask,
self.lit_positions,
)
class EsmFoldingTrunk(nn.Module):
def __init__(self, config):
super().__init__()
self.config = config
c_s = config.sequence_state_dim
c_z = config.pairwise_state_dim
self.pairwise_positional_embedding = EsmFoldRelativePosition(config)
self.blocks = nn.ModuleList([EsmFoldTriangularSelfAttentionBlock(config) for _ in range(config.num_blocks)])
self.recycle_bins = 15
self.recycle_s_norm = nn.LayerNorm(c_s)
self.recycle_z_norm = nn.LayerNorm(c_z)
self.recycle_disto = nn.Embedding(self.recycle_bins, c_z)
self.recycle_disto.weight[0].detach().zero_()
self.structure_module = EsmFoldStructureModule(config.structure_module)
self.trunk2sm_s = nn.Linear(c_s, config.structure_module.sequence_dim)
self.trunk2sm_z = nn.Linear(c_z, config.structure_module.pairwise_dim)
self.chunk_size = config.chunk_size
def set_chunk_size(self, chunk_size):
# This parameter means the axial attention will be computed
# in a chunked manner. This should make the memory used more or less O(L) instead of O(L^2).
# It's equivalent to running a for loop over chunks of the dimension we're iterative over,
# where the chunk_size is the size of the chunks, so 128 would mean to parse 128-length chunks.
self.chunk_size = chunk_size
def forward(self, seq_feats, pair_feats, true_aa, residx, mask, no_recycles):
"""
Inputs:
seq_feats: B x L x C tensor of sequence features pair_feats: B x L x L x C tensor of pair features residx: B
x L long tensor giving the position in the sequence mask: B x L boolean tensor indicating valid residues
Output:
predicted_structure: B x L x (num_atoms_per_residue * 3) tensor wrapped in a Coordinates object
"""
device = seq_feats.device
s_s_0 = seq_feats
s_z_0 = pair_feats
if no_recycles is None:
no_recycles = self.config.max_recycles
else:
if no_recycles < 0:
raise ValueError("Number of recycles must not be negative.")
no_recycles += 1 # First 'recycle' is just the standard forward pass through the model.
def trunk_iter(s, z, residx, mask):
z = z + self.pairwise_positional_embedding(residx, mask=mask)
for block in self.blocks:
s, z = block(s, z, mask=mask, residue_index=residx, chunk_size=self.chunk_size)
return s, z
s_s = s_s_0
s_z = s_z_0
recycle_s = torch.zeros_like(s_s)
recycle_z = torch.zeros_like(s_z)
recycle_bins = torch.zeros(*s_z.shape[:-1], device=device, dtype=torch.int64)
for recycle_idx in range(no_recycles):
with ContextManagers([] if recycle_idx == no_recycles - 1 else [torch.no_grad()]):
# === Recycling ===
recycle_s = self.recycle_s_norm(recycle_s.detach()).to(device)
recycle_z = self.recycle_z_norm(recycle_z.detach()).to(device)
recycle_z += self.recycle_disto(recycle_bins.detach()).to(device)
s_s, s_z = trunk_iter(s_s_0 + recycle_s, s_z_0 + recycle_z, residx, mask)
# === Structure module ===
structure = self.structure_module(
{"single": self.trunk2sm_s(s_s), "pair": self.trunk2sm_z(s_z)},
true_aa,
mask.float(),
)
recycle_s = s_s
recycle_z = s_z
# Distogram needs the N, CA, C coordinates, and bin constants same as alphafold.
recycle_bins = EsmFoldingTrunk.distogram(
structure["positions"][-1][:, :, :3],
3.375,
21.375,
self.recycle_bins,
)
structure["s_s"] = s_s
structure["s_z"] = s_z
return structure
@staticmethod
def distogram(coords, min_bin, max_bin, num_bins):
# Coords are [... L x 3 x 3], where it's [N, CA, C] x 3 coordinates.
boundaries = torch.linspace(
min_bin,
max_bin,
num_bins - 1,
device=coords.device,
)
boundaries = boundaries**2
N, CA, C = [x.squeeze(-2) for x in coords.chunk(3, dim=-2)]
# Infer CB coordinates.
b = CA - N
c = C - CA
a = b.cross(c, dim=-1)
CB = -0.58273431 * a + 0.56802827 * b - 0.54067466 * c + CA
dists = (CB[..., None, :, :] - CB[..., :, None, :]).pow(2).sum(dim=-1, keepdims=True)
bins = torch.sum(dists > boundaries, dim=-1) # [..., L, L]
return bins
# TODO Add information to the docstring about any methods that convert to PDB format, or otherwise prepare
# the outputs for downstream use.
@add_start_docstrings(
"""
ESMForProteinFolding is the HuggingFace port of the original ESMFold model. It consists of an ESM-2 "stem" followed
by a protein folding "head", although unlike most other output heads, this "head" is similar in size and runtime to
the rest of the model combined! It outputs a dictionary containing predicted structural information about the input
protein(s).
""",
ESM_START_DOCSTRING,
)
class EsmForProteinFolding(EsmPreTrainedModel):
_no_split_modules = ["EsmFoldStructureModule", "EsmFoldTriangularSelfAttentionBlock"]
def __init__(self, config):
super().__init__(config)
self.config = config
self.distogram_bins = 64
self.esm = EsmModel(config, add_pooling_layer=False)
self.esm.requires_grad_(False)
if self.config.esmfold_config.fp16_esm:
self.esm.half()
self.esm_feats = self.config.hidden_size
self.esm_attns = self.config.num_hidden_layers * self.config.num_attention_heads
self.esm_layers = self.config.num_hidden_layers
self.register_buffer("af2_to_esm", self._af2_to_esm_from_vocab_list(config.vocab_list))
self.esm_s_combine = nn.Parameter(torch.zeros(self.esm_layers + 1))
trunk_config = self.config.esmfold_config.trunk
c_s = trunk_config.sequence_state_dim
c_z = trunk_config.pairwise_state_dim
self.esm_s_mlp = nn.Sequential(
LayerNorm(self.esm_feats),
nn.Linear(self.esm_feats, c_s),
nn.ReLU(),
nn.Linear(c_s, c_s),
)
# 0 is padding, N is unknown residues, N + 1 is mask.
self.n_tokens_embed = residue_constants.restype_num + 3
self.pad_idx = 0
self.unk_idx = self.n_tokens_embed - 2
self.mask_idx = self.n_tokens_embed - 1
self.esm_dict_cls_idx = self.config.vocab_list.index("<cls>")
self.esm_dict_mask_idx = self.config.vocab_list.index("<mask>")
self.esm_dict_eos_idx = self.config.vocab_list.index("<eos>")
self.esm_dict_padding_idx = self.config.vocab_list.index("<pad>")
if self.config.esmfold_config.embed_aa:
self.embedding = nn.Embedding(self.n_tokens_embed, c_s, padding_idx=0)
self.trunk = EsmFoldingTrunk(trunk_config)
self.distogram_head = nn.Linear(c_z, self.distogram_bins)
self.ptm_head = nn.Linear(c_z, self.distogram_bins)
self.lm_head = nn.Linear(c_s, self.n_tokens_embed)
self.lddt_bins = 50
structure_module_config = trunk_config.structure_module
self.lddt_head = nn.Sequential(
nn.LayerNorm(structure_module_config.sequence_dim),
nn.Linear(structure_module_config.sequence_dim, self.config.esmfold_config.lddt_head_hid_dim),
nn.Linear(self.config.esmfold_config.lddt_head_hid_dim, self.config.esmfold_config.lddt_head_hid_dim),
nn.Linear(self.config.esmfold_config.lddt_head_hid_dim, 37 * self.lddt_bins),
)
@staticmethod
def _af2_to_esm_from_vocab_list(vocab_list: List[str]) -> torch.Tensor:
# Remember that t is shifted from residue_constants by 1 (0 is padding).
esm_reorder = [vocab_list.index("<pad>")] + [vocab_list.index(v) for v in residue_constants.restypes_with_x]
return torch.tensor(esm_reorder)
@add_start_docstrings_to_model_forward(ESMFOLD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
@replace_return_docstrings(output_type=EsmForProteinFoldingOutput, config_class=EsmConfig)
def forward(
self,
input_ids: torch.Tensor,
attention_mask: Optional[torch.Tensor] = None,
position_ids: Optional[torch.Tensor] = None,
masking_pattern: Optional[torch.Tensor] = None,
num_recycles: Optional[int] = None,
) -> EsmForProteinFoldingOutput:
r"""
Returns:
Example:
```python
>>> from transformers import AutoTokenizer, EsmForProteinFolding
>>> model = EsmForProteinFolding.from_pretrained("facebook/esmfold_v1")
>>> tokenizer = AutoTokenizer.from_pretrained("facebook/esmfold_v1")
>>> inputs = tokenizer(["MLKNVQVQLV"], return_tensors="pt", add_special_tokens=False) # A tiny random peptide
>>> outputs = model(**inputs)
>>> folded_positions = outputs.positions
```
"""
cfg = self.config.esmfold_config
aa = input_ids # B x L
B = aa.shape[0]
L = aa.shape[1]
device = input_ids.device
if attention_mask is None:
attention_mask = torch.ones_like(aa, device=device)
if position_ids is None:
position_ids = torch.arange(L, device=device).expand_as(input_ids)
# === ESM ===
esmaa = self.af2_idx_to_esm_idx(aa, attention_mask)
if masking_pattern is not None:
masked_aa, esmaa, mlm_targets = self.bert_mask(aa, esmaa, attention_mask, masking_pattern)
else:
masked_aa = aa
mlm_targets = None
# We get sequence and pair representations from whatever version of ESM /
# configuration we are using. The sequence representation esm_s is always
# present. The pair embedding esm_z may be present depending on the
# configuration of the model. If esm_z is not used by the model then it
# is returned as None here.
esm_s = self.compute_language_model_representations(esmaa)
# Convert esm_s and esm_z, if present, to the precision used by the trunk and
# the structure module. These tensors may be a lower precision if, for example,
# we're running the language model in fp16 precision.
esm_s = esm_s.to(self.esm_s_combine.dtype)
if cfg.esm_ablate_sequence:
esm_s = esm_s * 0
esm_s = esm_s.detach()
# === preprocessing ===
esm_s = (self.esm_s_combine.softmax(0).unsqueeze(0) @ esm_s).squeeze(2)
s_s_0 = self.esm_s_mlp(esm_s)
s_z_0 = s_s_0.new_zeros(B, L, L, cfg.trunk.pairwise_state_dim)
if self.config.esmfold_config.embed_aa:
s_s_0 += self.embedding(masked_aa)
structure: dict = self.trunk(s_s_0, s_z_0, aa, position_ids, attention_mask, no_recycles=num_recycles)
# Documenting what we expect:
structure = {
k: v
for k, v in structure.items()
if k
in [
"s_z",
"s_s",
"frames",
"sidechain_frames",
"unnormalized_angles",
"angles",
"positions",
"states",
]
}
# Add BERT mask for the loss to use, if available.
if mlm_targets:
structure["mlm_targets"] = mlm_targets
disto_logits = self.distogram_head(structure["s_z"])
disto_logits = (disto_logits + disto_logits.transpose(1, 2)) / 2
structure["distogram_logits"] = disto_logits
lm_logits = self.lm_head(structure["s_s"])
structure["lm_logits"] = lm_logits
structure["aatype"] = aa
make_atom14_masks(structure)
# Of course, this doesn't respect the true mask because it doesn't know about it...
# We're not going to properly mask change of index tensors:
# "residx_atom14_to_atom37",
# "residx_atom37_to_atom14",
for k in [
"atom14_atom_exists",
"atom37_atom_exists",
]:
structure[k] *= attention_mask.unsqueeze(-1)
structure["residue_index"] = position_ids
lddt_head = self.lddt_head(structure["states"]).reshape(structure["states"].shape[0], B, L, -1, self.lddt_bins)
structure["lddt_head"] = lddt_head
plddt = categorical_lddt(lddt_head[-1], bins=self.lddt_bins)
structure["plddt"] = plddt
ptm_logits = self.ptm_head(structure["s_z"])
structure["ptm_logits"] = ptm_logits
structure["ptm"] = compute_tm(ptm_logits, max_bin=31, no_bins=self.distogram_bins)
structure.update(compute_predicted_aligned_error(ptm_logits, max_bin=31, no_bins=self.distogram_bins))
return EsmForProteinFoldingOutput(**structure)
def af2_idx_to_esm_idx(self, aa, mask):
# avoid indexing on different devices
if self.af2_to_esm.device != aa.device:
self.af2_to_esm = self.af2_to_esm.to(aa.device)
aa = (aa + 1).masked_fill(mask != 1, 0)
return self.af2_to_esm[aa]
def compute_language_model_representations(self, esmaa: torch.Tensor) -> torch.Tensor:
device = next(self.parameters()).device
B, L = esmaa.shape # B = batch size, L = sequence length.
if self.config.esmfold_config.bypass_lm:
esm_s = torch.zeros(B, L, self.esm_s_combine.size[0], -1, self.esm_feats, device=device)
return esm_s
bosi, eosi = self.esm_dict_cls_idx, self.esm_dict_eos_idx
bos = esmaa.new_full((B, 1), bosi)
eos = esmaa.new_full((B, 1), self.esm_dict_padding_idx)
esmaa = torch.cat([bos, esmaa, eos], dim=1)
# Use the first padding index as eos during inference.
esmaa[range(B), (esmaa != 1).sum(1)] = eosi
# _, esm_z, esm_s = self.esm(esmaa, return_pairs=self.config.esmfold_config.use_esm_attn_map)
# Because we do not support use_esm_attn_map in the HF port as it is not used in any public models,
# esm_z is always None
esm_hidden_states = self.esm(esmaa, attention_mask=esmaa != 1, output_hidden_states=True)["hidden_states"]
esm_s = torch.stack(esm_hidden_states, dim=2)
esm_s = esm_s[:, 1:-1] # B, L, nLayers, C
return esm_s
def bert_mask(self, aa, esmaa, mask, pattern):
new_aa = aa.clone()
target = aa.clone()
new_esmaa = esmaa.clone()
new_aa[pattern == 1] = self.mask_idx
target[pattern != 1] = 0
new_esmaa[pattern == 1] = self.esm_dict_mask_idx
return new_aa, new_esmaa, target
@torch.no_grad()
def infer(
self,
seqs: Union[str, List[str]],
position_ids=None,
):
if isinstance(seqs, str):
lst = [seqs]
else:
lst = seqs
# Returns the raw outputs of the model given an input sequence.
device = next(self.parameters()).device
aatype = collate_dense_tensors(
[
torch.from_numpy(
residue_constants.sequence_to_onehot(
sequence=seq,
mapping=residue_constants.restype_order_with_x,
map_unknown_to_x=True,
)
)
.to(device)
.argmax(dim=1)
for seq in lst
]
) # B=1 x L
mask = collate_dense_tensors([aatype.new_ones(len(seq)) for seq in lst])
position_ids = (
torch.arange(aatype.shape[1], device=device).expand(len(lst), -1)
if position_ids is None
else position_ids.to(device)
)
if position_ids.ndim == 1:
position_ids = position_ids.unsqueeze(0)
return self.forward(
aatype,
mask,
position_ids=position_ids,
)
@staticmethod
def output_to_pdb(output: Dict) -> List[str]:
"""Returns the pbd (file) string from the model given the model output."""
output = {k: v.to("cpu").numpy() for k, v in output.items()}
pdbs = []
final_atom_positions = atom14_to_atom37(output["positions"][-1], output)
final_atom_mask = output["atom37_atom_exists"]
for i in range(output["aatype"].shape[0]):
aa = output["aatype"][i]
pred_pos = final_atom_positions[i]
mask = final_atom_mask[i]
resid = output["residue_index"][i] + 1
pred = OFProtein(
aatype=aa,
atom_positions=pred_pos,
atom_mask=mask,
residue_index=resid,
b_factors=output["plddt"][i],
)
pdbs.append(to_pdb(pred))
return pdbs
def infer_pdb(self, seqs, *args, **kwargs) -> str:
"""Returns the pdb (file) string from the model given an input sequence."""
assert isinstance(seqs, str)
output = self.infer(seqs, *args, **kwargs)
return self.output_to_pdb(output)[0]
def infer_pdbs(self, seqs: List[str], *args, **kwargs) -> List[str]:
"""Returns the pdb (file) string from the model given an input sequence."""
output = self.infer(seqs, *args, **kwargs)
return self.output_to_pdb(output)