# coding=utf-8 # Copyright 2023 Google AI and The HuggingFace Team. All rights reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. """ PyTorch OWLv2 model.""" import warnings from dataclasses import dataclass from functools import lru_cache from typing import Any, Dict, Optional, Tuple, Union import torch import torch.utils.checkpoint from torch import Tensor, nn from ...activations import ACT2FN from ...modeling_attn_mask_utils import _create_4d_causal_attention_mask, _prepare_4d_attention_mask from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling from ...modeling_utils import PreTrainedModel from ...utils import ( ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, is_vision_available, logging, replace_return_docstrings, ) from .configuration_owlv2 import Owlv2Config, Owlv2TextConfig, Owlv2VisionConfig if is_vision_available(): from transformers.image_transforms import center_to_corners_format logger = logging.get_logger(__name__) _CHECKPOINT_FOR_DOC = "google/owlv2-base-patch16-ensemble" # See all Owlv2 models at https://huggingface.co/models?filter=owlv2 from ..deprecated._archive_maps import OWLV2_PRETRAINED_MODEL_ARCHIVE_LIST # noqa: F401, E402 # Copied from transformers.models.clip.modeling_clip.contrastive_loss with clip->owlv2 def contrastive_loss(logits: torch.Tensor) -> torch.Tensor: return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device)) # Copied from transformers.models.clip.modeling_clip.clip_loss with clip->owlv2 def owlv2_loss(similarity: torch.Tensor) -> torch.Tensor: caption_loss = contrastive_loss(similarity) image_loss = contrastive_loss(similarity.t()) return (caption_loss + image_loss) / 2.0 @dataclass class Owlv2Output(ModelOutput): """ Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`): Contrastive loss for image-text similarity. logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`): The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text similarity scores. logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`): The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image similarity scores. text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`Owlv2VisionModel`]. text_model_output (Tuple[`BaseModelOutputWithPooling`]): The output of the [`Owlv2TextModel`]. vision_model_output (`BaseModelOutputWithPooling`): The output of the [`Owlv2VisionModel`]. """ loss: Optional[torch.FloatTensor] = None logits_per_image: torch.FloatTensor = None logits_per_text: torch.FloatTensor = None text_embeds: torch.FloatTensor = None image_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: return tuple( self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() for k in self.keys() ) # Copied from transformers.models.detr.modeling_detr._upcast def _upcast(t: Tensor) -> Tensor: # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type if t.is_floating_point(): return t if t.dtype in (torch.float32, torch.float64) else t.float() else: return t if t.dtype in (torch.int32, torch.int64) else t.int() # Copied from transformers.models.detr.modeling_detr.box_area def box_area(boxes: Tensor) -> Tensor: """ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates. Args: boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`): Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1 < x2` and `0 <= y1 < y2`. Returns: `torch.FloatTensor`: a tensor containing the area for each box. """ boxes = _upcast(boxes) return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1]) # Copied from transformers.models.detr.modeling_detr.box_iou def box_iou(boxes1, boxes2): area1 = box_area(boxes1) area2 = box_area(boxes2) left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2] right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2] width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2] inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M] union = area1[:, None] + area2 - inter iou = inter / union return iou, union # Copied from transformers.models.detr.modeling_detr.generalized_box_iou def generalized_box_iou(boxes1, boxes2): """ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format. Returns: `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2) """ # degenerate boxes gives inf / nan results # so do an early check if not (boxes1[:, 2:] >= boxes1[:, :2]).all(): raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}") if not (boxes2[:, 2:] >= boxes2[:, :2]).all(): raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}") iou, union = box_iou(boxes1, boxes2) top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2]) bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:]) width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2] area = width_height[:, :, 0] * width_height[:, :, 1] return iou - (area - union) / area @dataclass class Owlv2ObjectDetectionOutput(ModelOutput): """ Output type of [`Owlv2ForObjectDetection`]. Args: loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)): Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized scale-invariant IoU loss. loss_dict (`Dict`, *optional*): A dictionary containing the individual losses. Useful for logging. logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`): Classification logits (including no-object) for all queries. objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`): The objectness logits of all image patches. OWL-ViT represents images as a set of image patches where the total number of patches is (image_size / patch_size)**2. pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`]. image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`): Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image embeddings for each patch. class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`): Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total number of patches is (image_size / patch_size)**2. text_model_output (Tuple[`BaseModelOutputWithPooling`]): The output of the [`Owlv2TextModel`]. vision_model_output (`BaseModelOutputWithPooling`): The output of the [`Owlv2VisionModel`]. """ loss: Optional[torch.FloatTensor] = None loss_dict: Optional[Dict] = None logits: torch.FloatTensor = None objectness_logits: torch.FloatTensor = None pred_boxes: torch.FloatTensor = None text_embeds: torch.FloatTensor = None image_embeds: torch.FloatTensor = None class_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: return tuple( self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() for k in self.keys() ) @dataclass # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTImageGuidedObjectDetectionOutput with OwlViT->Owlv2,OWL-ViT->OWLv2 class Owlv2ImageGuidedObjectDetectionOutput(ModelOutput): """ Output type of [`Owlv2ForObjectDetection.image_guided_detection`]. Args: logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`): Classification logits (including no-object) for all queries. target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual target image in the batch (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`): Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These values are normalized in [0, 1], relative to the size of each individual query image in the batch (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the unnormalized bounding boxes. image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`): Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image embeddings for each patch. query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`): Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image embeddings for each patch. class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`): Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total number of patches is (image_size / patch_size)**2. text_model_output (Tuple[`BaseModelOutputWithPooling`]): The output of the [`Owlv2TextModel`]. vision_model_output (`BaseModelOutputWithPooling`): The output of the [`Owlv2VisionModel`]. """ logits: torch.FloatTensor = None image_embeds: torch.FloatTensor = None query_image_embeds: torch.FloatTensor = None target_pred_boxes: torch.FloatTensor = None query_pred_boxes: torch.FloatTensor = None class_embeds: torch.FloatTensor = None text_model_output: BaseModelOutputWithPooling = None vision_model_output: BaseModelOutputWithPooling = None def to_tuple(self) -> Tuple[Any]: return tuple( self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple() for k in self.keys() ) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionEmbeddings with OwlViT->Owlv2 class Owlv2VisionEmbeddings(nn.Module): def __init__(self, config: Owlv2VisionConfig): super().__init__() self.config = config self.embed_dim = config.hidden_size self.class_embedding = nn.Parameter(torch.randn(config.hidden_size)) self.patch_embedding = nn.Conv2d( in_channels=config.num_channels, out_channels=self.embed_dim, kernel_size=config.patch_size, stride=config.patch_size, bias=False, ) self.num_patches = (config.image_size // config.patch_size) ** 2 self.num_positions = self.num_patches + 1 self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: batch_size = pixel_values.shape[0] patch_embeds = self.patch_embedding(pixel_values) # shape = [batch_size, num_channels, height, width] patch_embeds = patch_embeds.flatten(2).transpose(1, 2) class_embeds = self.class_embedding.expand(batch_size, 1, -1) embeddings = torch.cat([class_embeds, patch_embeds], dim=1) embeddings = embeddings + self.position_embedding(self.position_ids) return embeddings # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextEmbeddings with OwlViT->Owlv2 class Owlv2TextEmbeddings(nn.Module): def __init__(self, config: Owlv2TextConfig): super().__init__() self.token_embedding = nn.Embedding(config.vocab_size, config.hidden_size) self.position_embedding = nn.Embedding(config.max_position_embeddings, config.hidden_size) # position_ids (1, len position emb) is contiguous in memory and exported when serialized self.register_buffer( "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False ) def forward( self, input_ids: Optional[torch.LongTensor] = None, position_ids: Optional[torch.LongTensor] = None, inputs_embeds: Optional[torch.FloatTensor] = None, ) -> torch.Tensor: seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2] if position_ids is None: position_ids = self.position_ids[:, :seq_length] if inputs_embeds is None: inputs_embeds = self.token_embedding(input_ids) position_embeddings = self.position_embedding(position_ids) embeddings = inputs_embeds + position_embeddings return embeddings # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTAttention with OwlViT->Owlv2 class Owlv2Attention(nn.Module): """Multi-headed attention from 'Attention Is All You Need' paper""" def __init__(self, config): super().__init__() self.config = config self.embed_dim = config.hidden_size self.num_heads = config.num_attention_heads self.head_dim = self.embed_dim // self.num_heads if self.head_dim * self.num_heads != self.embed_dim: raise ValueError( f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" f" {self.num_heads})." ) self.scale = self.head_dim**-0.5 self.dropout = config.attention_dropout self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int): return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous() def forward( self, hidden_states: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = False, ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: """Input shape: Batch x Time x Channel""" bsz, tgt_len, embed_dim = hidden_states.size() # get query proj query_states = self.q_proj(hidden_states) * self.scale key_states = self._shape(self.k_proj(hidden_states), -1, bsz) value_states = self._shape(self.v_proj(hidden_states), -1, bsz) proj_shape = (bsz * self.num_heads, -1, self.head_dim) query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape) key_states = key_states.view(*proj_shape) value_states = value_states.view(*proj_shape) src_len = key_states.size(1) attn_weights = torch.bmm(query_states, key_states.transpose(1, 2)) if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len): raise ValueError( f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is" f" {attn_weights.size()}" ) # apply the causal_attention_mask first if causal_attention_mask is not None: if causal_attention_mask.size() != (bsz, 1, tgt_len, src_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is" f" {causal_attention_mask.size()}" ) attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + causal_attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) if attention_mask is not None: if attention_mask.size() != (bsz, 1, tgt_len, src_len): raise ValueError( f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}" ) attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) attn_weights = nn.functional.softmax(attn_weights, dim=-1) if output_attentions: # this operation is a bit akward, but it's required to # make sure that attn_weights keeps its gradient. # In order to do so, attn_weights have to reshaped # twice and have to be reused in the following attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len) else: attn_weights_reshaped = None attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) # For int8 compatibility, sometimes the `attn_probs` are in `fp32` attn_probs = attn_probs.to(value_states.dtype) attn_output = torch.bmm(attn_probs, value_states) if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim): raise ValueError( f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is" f" {attn_output.size()}" ) attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) attn_output = attn_output.transpose(1, 2) attn_output = attn_output.reshape(bsz, tgt_len, embed_dim) attn_output = self.out_proj(attn_output) return attn_output, attn_weights_reshaped # Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Owlv2 class Owlv2MLP(nn.Module): def __init__(self, config): super().__init__() self.config = config self.activation_fn = ACT2FN[config.hidden_act] self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: hidden_states = self.fc1(hidden_states) hidden_states = self.activation_fn(hidden_states) hidden_states = self.fc2(hidden_states) return hidden_states # Copied from transformers.models.clip.modeling_clip.CLIPEncoderLayer with CLIP->Owlv2 class Owlv2EncoderLayer(nn.Module): def __init__(self, config: Owlv2Config): super().__init__() self.embed_dim = config.hidden_size self.self_attn = Owlv2Attention(config) self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) self.mlp = Owlv2MLP(config) self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) def forward( self, hidden_states: torch.Tensor, attention_mask: torch.Tensor, causal_attention_mask: torch.Tensor, output_attentions: Optional[bool] = False, ) -> Tuple[torch.FloatTensor]: """ Args: hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)` attention_mask (`torch.FloatTensor`): attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values. `(config.encoder_attention_heads,)`. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. """ residual = hidden_states hidden_states = self.layer_norm1(hidden_states) hidden_states, attn_weights = self.self_attn( hidden_states=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, ) hidden_states = residual + hidden_states residual = hidden_states hidden_states = self.layer_norm2(hidden_states) hidden_states = self.mlp(hidden_states) hidden_states = residual + hidden_states outputs = (hidden_states,) if output_attentions: outputs += (attn_weights,) return outputs # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTPreTrainedModel with OwlViT->Owlv2,owlvit->owlv2 class Owlv2PreTrainedModel(PreTrainedModel): """ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained models. """ config_class = Owlv2Config base_model_prefix = "owlv2" supports_gradient_checkpointing = True _no_split_modules = ["Owlv2EncoderLayer"] def _init_weights(self, module): """Initialize the weights""" factor = self.config.initializer_factor if isinstance(module, Owlv2TextEmbeddings): module.token_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) module.position_embedding.weight.data.normal_(mean=0.0, std=factor * 0.02) elif isinstance(module, Owlv2VisionEmbeddings): factor = self.config.initializer_factor nn.init.normal_(module.class_embedding, mean=0.0, std=module.embed_dim**-0.5 * factor) nn.init.normal_(module.patch_embedding.weight, std=module.config.initializer_range * factor) nn.init.normal_(module.position_embedding.weight, std=module.config.initializer_range * factor) elif isinstance(module, Owlv2Attention): factor = self.config.initializer_factor in_proj_std = (module.embed_dim**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor out_proj_std = (module.embed_dim**-0.5) * factor nn.init.normal_(module.q_proj.weight, std=in_proj_std) nn.init.normal_(module.k_proj.weight, std=in_proj_std) nn.init.normal_(module.v_proj.weight, std=in_proj_std) nn.init.normal_(module.out_proj.weight, std=out_proj_std) elif isinstance(module, Owlv2MLP): factor = self.config.initializer_factor in_proj_std = (module.config.hidden_size**-0.5) * ((2 * module.config.num_hidden_layers) ** -0.5) * factor fc_std = (2 * module.config.hidden_size) ** -0.5 * factor nn.init.normal_(module.fc1.weight, std=fc_std) nn.init.normal_(module.fc2.weight, std=in_proj_std) elif isinstance(module, Owlv2Model): nn.init.normal_( module.text_projection.weight, std=module.text_embed_dim**-0.5 * self.config.initializer_factor, ) nn.init.normal_( module.visual_projection.weight, std=module.vision_embed_dim**-0.5 * self.config.initializer_factor, ) if isinstance(module, nn.LayerNorm): module.bias.data.zero_() module.weight.data.fill_(1.0) if isinstance(module, nn.Linear) and module.bias is not None: module.bias.data.zero_() OWLV2_START_DOCSTRING = r""" This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads etc.) This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior. Parameters: config ([`Owvl2Config`]): Model configuration class with all the parameters of the model. Initializing with a config file does not load the weights associated with the model, only the configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights. """ OWLV2_TEXT_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ OWLV2_VISION_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ OWLV2_INPUTS_DOCSTRING = r""" Args: input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids) pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) return_loss (`bool`, *optional*): Whether or not to return the contrastive loss. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_base_image_embeds (`bool`, *optional*): Whether or not to return the base image embeddings. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*): Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input IDs?](../glossary#input-ids). attention_mask (`torch.Tensor` of shape `(batch_size, num_max_text_queries, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) output_hidden_states (`bool`, *optional*): Whether or not to return the last hidden state. See `text_model_last_hidden_state` and `vision_model_last_hidden_state` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING = r""" Args: pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values. query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`): Pixel values of query image(s) to be detected. Pass in one query image per target image. output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTEncoder with OwlViT->Owlv2 class Owlv2Encoder(nn.Module): """ Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a [`Owlv2EncoderLayer`]. Args: config: Owlv2Config """ def __init__(self, config: Owlv2Config): super().__init__() self.layers = nn.ModuleList([Owlv2EncoderLayer(config) for _ in range(config.num_hidden_layers)]) self.gradient_checkpointing = False def forward( self, inputs_embeds, attention_mask: Optional[torch.Tensor] = None, causal_attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutput]: r""" Args: inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`). attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): Causal mask for the text model. Mask values selected in `[0, 1]`: - 1 for tokens that are **not masked**, - 0 for tokens that are **masked**. [What are attention masks?](../glossary#attention-mask) output_attentions (`bool`, *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned tensors for more detail. output_hidden_states (`bool`, *optional*): Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for more detail. return_dict (`bool`, *optional*): Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict encoder_states = () if output_hidden_states else None all_attentions = () if output_attentions else None hidden_states = inputs_embeds for encoder_layer in self.layers: if output_hidden_states: encoder_states = encoder_states + (hidden_states,) if self.gradient_checkpointing and self.training: layer_outputs = self._gradient_checkpointing_func( encoder_layer.__call__, hidden_states, attention_mask, causal_attention_mask, output_attentions, ) else: layer_outputs = encoder_layer( hidden_states, attention_mask, causal_attention_mask, output_attentions=output_attentions, ) hidden_states = layer_outputs[0] if output_attentions: all_attentions = all_attentions + (layer_outputs[1],) if output_hidden_states: encoder_states = encoder_states + (hidden_states,) if not return_dict: return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) return BaseModelOutput( last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions ) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextTransformer with OWLVIT->OWLV2,OwlViT->Owlv2 class Owlv2TextTransformer(nn.Module): def __init__(self, config: Owlv2TextConfig): super().__init__() self.config = config embed_dim = config.hidden_size self.embeddings = Owlv2TextEmbeddings(config) self.encoder = Owlv2Encoder(config) self.final_layer_norm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) @add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2TextConfig) def forward( self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, position_ids: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict input_shape = input_ids.size() input_ids = input_ids.view(-1, input_shape[-1]) hidden_states = self.embeddings(input_ids=input_ids, position_ids=position_ids) # num_samples, seq_len = input_shape where num_samples = batch_size * num_max_text_queries # OWLV2's text model uses causal mask, prepare it here. # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 causal_attention_mask = _create_4d_causal_attention_mask( input_shape, hidden_states.dtype, device=hidden_states.device ) # expand attention_mask if attention_mask is not None: # [num_samples, seq_len] -> [num_samples, 1, tgt_seq_len, src_seq_len] attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype) encoder_outputs = self.encoder( inputs_embeds=hidden_states, attention_mask=attention_mask, causal_attention_mask=causal_attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] last_hidden_state = self.final_layer_norm(last_hidden_state) # take features from the end of tokens embedding (end of token is the highest number in each sequence) # casting to torch.int for onnx compatibility: argmax doesn't support int64 inputs with opset 14 pooled_output = last_hidden_state[ torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), input_ids.to(torch.int).argmax(dim=-1).to(last_hidden_state.device), ] if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTTextModel with google/owlvit-base-patch32->google/owlv2-base-patch16, OWLVIT->OWLV2,OwlViT->Owlv2 class Owlv2TextModel(Owlv2PreTrainedModel): config_class = Owlv2TextConfig def __init__(self, config: Owlv2TextConfig): super().__init__(config) self.text_model = Owlv2TextTransformer(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.text_model.embeddings.token_embedding def set_input_embeddings(self, value): self.text_model.embeddings.token_embedding = value @add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2TextConfig) def forward( self, input_ids: torch.Tensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: Examples: ```python >>> from transformers import AutoProcessor, Owlv2TextModel >>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16") >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16") >>> inputs = processor( ... text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt" ... ) >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled (EOS token) states ```""" # Get embeddings for all text queries in all batch samples return self.text_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionTransformer with OWLVIT->OWLV2,OwlViT->Owlv2 class Owlv2VisionTransformer(nn.Module): def __init__(self, config: Owlv2VisionConfig): super().__init__() self.config = config self.embeddings = Owlv2VisionEmbeddings(config) self.pre_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) self.encoder = Owlv2Encoder(config) self.post_layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) @add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2VisionConfig) def forward( self, pixel_values: torch.FloatTensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: """ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict # Cast the input to the expected `dtype` expected_input_dtype = self.embeddings.patch_embedding.weight.dtype pixel_values = pixel_values.to(expected_input_dtype) hidden_states = self.embeddings(pixel_values) hidden_states = self.pre_layernorm(hidden_states) encoder_outputs = self.encoder( inputs_embeds=hidden_states, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) last_hidden_state = encoder_outputs[0] pooled_output = last_hidden_state[:, 0, :] pooled_output = self.post_layernorm(pooled_output) if not return_dict: return (last_hidden_state, pooled_output) + encoder_outputs[1:] return BaseModelOutputWithPooling( last_hidden_state=last_hidden_state, pooler_output=pooled_output, hidden_states=encoder_outputs.hidden_states, attentions=encoder_outputs.attentions, ) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTVisionModel with OWLVIT->OWLV2,OwlViT->Owlv2,google/owlvit-base-patch32->google/owlv2-base-patch16 class Owlv2VisionModel(Owlv2PreTrainedModel): config_class = Owlv2VisionConfig main_input_name = "pixel_values" def __init__(self, config: Owlv2VisionConfig): super().__init__(config) self.vision_model = Owlv2VisionTransformer(config) # Initialize weights and apply final processing self.post_init() def get_input_embeddings(self) -> nn.Module: return self.vision_model.embeddings.patch_embedding @add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=Owlv2VisionConfig) def forward( self, pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, BaseModelOutputWithPooling]: r""" Returns: Examples: ```python >>> from PIL import Image >>> import requests >>> from transformers import AutoProcessor, Owlv2VisionModel >>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16") >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> inputs = processor(images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> last_hidden_state = outputs.last_hidden_state >>> pooled_output = outputs.pooler_output # pooled CLS states ```""" return self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) @add_start_docstrings(OWLV2_START_DOCSTRING) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTModel with google/owlvit-base-patch32->google/owlv2-base-patch16-ensemble, OWLVIT->OWLV2,OwlViT->Owlv2,owlvit->owlv2,OWL-ViT->OWLv2 class Owlv2Model(Owlv2PreTrainedModel): config_class = Owlv2Config def __init__(self, config: Owlv2Config): super().__init__(config) if not isinstance(config.text_config, Owlv2TextConfig): raise ValueError( "config.text_config is expected to be of type Owlv2TextConfig but is of type" f" {type(config.text_config)}." ) if not isinstance(config.vision_config, Owlv2VisionConfig): raise ValueError( "config.vision_config is expected to be of type Owlv2VisionConfig but is of type" f" {type(config.vision_config)}." ) text_config = config.text_config vision_config = config.vision_config self.projection_dim = config.projection_dim self.text_embed_dim = text_config.hidden_size self.vision_embed_dim = vision_config.hidden_size self.text_model = Owlv2TextTransformer(text_config) self.vision_model = Owlv2VisionTransformer(vision_config) self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False) self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False) self.logit_scale = nn.Parameter(torch.tensor(config.logit_scale_init_value)) # Initialize weights and apply final processing self.post_init() @add_start_docstrings_to_model_forward(OWLV2_TEXT_INPUTS_DOCSTRING) def get_text_features( self, input_ids: Optional[torch.Tensor] = None, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`]. Examples: ```python >>> from transformers import AutoProcessor, Owlv2Model >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble") >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> inputs = processor( ... text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt" ... ) >>> text_features = model.get_text_features(**inputs) ```""" # Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components. return_dict = return_dict if return_dict is not None else self.config.use_return_dict # Get embeddings for all text queries in all batch samples text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask, return_dict=return_dict) pooled_output = text_output[1] text_features = self.text_projection(pooled_output) return text_features @add_start_docstrings_to_model_forward(OWLV2_VISION_INPUTS_DOCSTRING) def get_image_features( self, pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> torch.FloatTensor: r""" Returns: image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying the projection layer to the pooled output of [`Owlv2VisionModel`]. Examples: ```python >>> from PIL import Image >>> import requests >>> from transformers import AutoProcessor, Owlv2Model >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble") >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> inputs = processor(images=image, return_tensors="pt") >>> image_features = model.get_image_features(**inputs) ```""" # Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) pooled_output = vision_outputs[1] image_features = self.visual_projection(pooled_output) return image_features @add_start_docstrings_to_model_forward(OWLV2_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Owlv2Output, config_class=Owlv2Config) def forward( self, input_ids: Optional[torch.LongTensor] = None, pixel_values: Optional[torch.FloatTensor] = None, attention_mask: Optional[torch.Tensor] = None, return_loss: Optional[bool] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_base_image_embeds: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Union[Tuple, Owlv2Output]: r""" Returns: Examples: ```python >>> from PIL import Image >>> import requests >>> from transformers import AutoProcessor, Owlv2Model >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble") >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt") >>> outputs = model(**inputs) >>> logits_per_image = outputs.logits_per_image # this is the image-text similarity score >>> probs = logits_per_image.softmax(dim=1) # we can take the softmax to get the label probabilities ```""" # Use OWLv2 model's config for some fields (if specified) instead of those of vision & text components. output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.use_return_dict vision_outputs = self.vision_model( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) # Get embeddings for all text queries in all batch samples text_outputs = self.text_model( input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=return_dict, ) text_embeds = text_outputs[1] text_embeds = self.text_projection(text_embeds) image_embeds = vision_outputs[1] image_embeds = self.visual_projection(image_embeds) # normalized features image_embeds = image_embeds / torch.linalg.norm(image_embeds, ord=2, dim=-1, keepdim=True) text_embeds_norm = text_embeds / torch.linalg.norm(text_embeds, ord=2, dim=-1, keepdim=True) # cosine similarity as logits and set it on the correct device logit_scale = self.logit_scale.exp().to(image_embeds.device) logits_per_text = torch.matmul(text_embeds_norm, image_embeds.t()) * logit_scale logits_per_image = logits_per_text.t() loss = None if return_loss: loss = owlv2_loss(logits_per_text) if return_base_image_embeds: warnings.warn( "`return_base_image_embeds` is deprecated and will be removed in v4.27 of Transformers, one can" " obtain the base (unprojected) image embeddings from outputs.vision_model_output.", FutureWarning, ) last_hidden_state = vision_outputs[0] image_embeds = self.vision_model.post_layernorm(last_hidden_state) else: text_embeds = text_embeds_norm if not return_dict: output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs) return ((loss,) + output) if loss is not None else output return Owlv2Output( loss=loss, logits_per_image=logits_per_image, logits_per_text=logits_per_text, text_embeds=text_embeds, image_embeds=image_embeds, text_model_output=text_outputs, vision_model_output=vision_outputs, ) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTBoxPredictionHead with OwlViT->Owlv2 class Owlv2BoxPredictionHead(nn.Module): def __init__(self, config: Owlv2Config, out_dim: int = 4): super().__init__() width = config.vision_config.hidden_size self.dense0 = nn.Linear(width, width) self.dense1 = nn.Linear(width, width) self.gelu = nn.GELU() self.dense2 = nn.Linear(width, out_dim) def forward(self, image_features: torch.Tensor) -> torch.FloatTensor: output = self.dense0(image_features) output = self.gelu(output) output = self.dense1(output) output = self.gelu(output) output = self.dense2(output) return output # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTClassPredictionHead with OwlViT->Owlv2 class Owlv2ClassPredictionHead(nn.Module): def __init__(self, config: Owlv2Config): super().__init__() out_dim = config.text_config.hidden_size self.query_dim = config.vision_config.hidden_size self.dense0 = nn.Linear(self.query_dim, out_dim) self.logit_shift = nn.Linear(self.query_dim, 1) self.logit_scale = nn.Linear(self.query_dim, 1) self.elu = nn.ELU() def forward( self, image_embeds: torch.FloatTensor, query_embeds: Optional[torch.FloatTensor], query_mask: Optional[torch.Tensor], ) -> Tuple[torch.FloatTensor]: image_class_embeds = self.dense0(image_embeds) if query_embeds is None: device = image_class_embeds.device batch_size, num_patches = image_class_embeds.shape[:2] pred_logits = torch.zeros((batch_size, num_patches, self.query_dim)).to(device) return (pred_logits, image_class_embeds) # Normalize image and text features image_class_embeds = image_class_embeds / (torch.linalg.norm(image_class_embeds, dim=-1, keepdim=True) + 1e-6) query_embeds = query_embeds / (torch.linalg.norm(query_embeds, dim=-1, keepdim=True) + 1e-6) # Get class predictions pred_logits = torch.einsum("...pd,...qd->...pq", image_class_embeds, query_embeds) # Apply a learnable shift and scale to logits logit_shift = self.logit_shift(image_embeds) logit_scale = self.logit_scale(image_embeds) logit_scale = self.elu(logit_scale) + 1 pred_logits = (pred_logits + logit_shift) * logit_scale if query_mask is not None: if query_mask.ndim > 1: query_mask = torch.unsqueeze(query_mask, dim=-2) pred_logits = pred_logits.to(torch.float64) pred_logits = torch.where(query_mask == 0, -1e6, pred_logits) pred_logits = pred_logits.to(torch.float32) return (pred_logits, image_class_embeds) class Owlv2ForObjectDetection(Owlv2PreTrainedModel): config_class = Owlv2Config def __init__(self, config: Owlv2Config): super().__init__(config) self.owlv2 = Owlv2Model(config) self.class_head = Owlv2ClassPredictionHead(config) self.box_head = Owlv2BoxPredictionHead(config) self.objectness_head = Owlv2BoxPredictionHead(config, out_dim=1) self.layer_norm = nn.LayerNorm(config.vision_config.hidden_size, eps=config.vision_config.layer_norm_eps) self.sigmoid = nn.Sigmoid() self.sqrt_num_patches = config.vision_config.image_size // config.vision_config.patch_size self.box_bias = self.compute_box_bias(self.sqrt_num_patches) @staticmethod # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.normalize_grid_corner_coordinates def normalize_grid_corner_coordinates(num_patches: int) -> torch.Tensor: # Create grid coordinates using torch x_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32) y_coordinates = torch.arange(1, num_patches + 1, dtype=torch.float32) xx, yy = torch.meshgrid(x_coordinates, y_coordinates, indexing="xy") # Stack the coordinates and divide by num_patches box_coordinates = torch.stack((xx, yy), dim=-1) box_coordinates /= num_patches # Flatten (h, w, 2) -> (h*w, 2) box_coordinates = box_coordinates.view(-1, 2) return box_coordinates def objectness_predictor(self, image_features: torch.FloatTensor) -> torch.FloatTensor: """Predicts the probability that each image feature token is an object. Args: image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`)): Features extracted from the image. Returns: Objectness scores. """ image_features = image_features.detach() objectness_logits = self.objectness_head(image_features) objectness_logits = objectness_logits[..., 0] return objectness_logits @lru_cache(maxsize=2) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.compute_box_bias def compute_box_bias(self, num_patches: int, feature_map: Optional[torch.FloatTensor] = None) -> torch.Tensor: if feature_map is not None: raise ValueError("feature_map has been deprecated as an input. Please pass in num_patches instead") # The box center is biased to its position on the feature grid box_coordinates = self.normalize_grid_corner_coordinates(num_patches) box_coordinates = torch.clip(box_coordinates, 0.0, 1.0) # Unnormalize xy box_coord_bias = torch.log(box_coordinates + 1e-4) - torch.log1p(-box_coordinates + 1e-4) # The box size is biased to the patch size box_size = torch.full_like(box_coord_bias, 1.0 / num_patches) box_size_bias = torch.log(box_size + 1e-4) - torch.log1p(-box_size + 1e-4) # Compute box bias box_bias = torch.cat([box_coord_bias, box_size_bias], dim=-1) return box_bias # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.box_predictor def box_predictor( self, image_feats: torch.FloatTensor, feature_map: torch.FloatTensor, ) -> torch.FloatTensor: """ Args: image_feats: Features extracted from the image, returned by the `image_text_embedder` method. feature_map: A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method. Returns: pred_boxes: List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary. """ # Bounding box detection head [batch_size, num_boxes, 4]. pred_boxes = self.box_head(image_feats) # Compute the location of each token on the grid and use it to compute a bias for the bbox prediction box_bias = self.box_bias.to(feature_map.device) pred_boxes += box_bias pred_boxes = self.sigmoid(pred_boxes) return pred_boxes # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.class_predictor def class_predictor( self, image_feats: torch.FloatTensor, query_embeds: Optional[torch.FloatTensor] = None, query_mask: Optional[torch.Tensor] = None, ) -> Tuple[torch.FloatTensor]: """ Args: image_feats: Features extracted from the `image_text_embedder`. query_embeds: Text query embeddings. query_mask: Must be provided with query_embeddings. A mask indicating which query embeddings are valid. """ (pred_logits, image_class_embeds) = self.class_head(image_feats, query_embeds, query_mask) return (pred_logits, image_class_embeds) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.image_text_embedder with owlvit->owlv2 def image_text_embedder( self, input_ids: torch.Tensor, pixel_values: torch.FloatTensor, attention_mask: torch.Tensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[torch.FloatTensor]: # Encode text and image outputs = self.owlv2( pixel_values=pixel_values, input_ids=input_ids, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, return_dict=True, ) # Get image embeddings last_hidden_state = outputs.vision_model_output[0] image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state) # Resize class token class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape) # Merge image embedding with class tokens image_embeds = image_embeds[:, 1:, :] * class_token_out image_embeds = self.layer_norm(image_embeds) # Resize to [batch_size, num_patches, num_patches, hidden_size] new_size = ( image_embeds.shape[0], self.sqrt_num_patches, self.sqrt_num_patches, image_embeds.shape[-1], ) image_embeds = image_embeds.reshape(new_size) text_embeds = outputs[-4] return (text_embeds, image_embeds, outputs) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.image_embedder with owlvit->owlv2, OwlViTModel->Owlv2Model def image_embedder( self, pixel_values: torch.FloatTensor, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, ) -> Tuple[torch.FloatTensor]: # Get Owlv2Model vision embeddings (same as CLIP) vision_outputs = self.owlv2.vision_model(pixel_values=pixel_values, return_dict=True) # Apply post_layernorm to last_hidden_state, return non-projected output last_hidden_state = vision_outputs[0] image_embeds = self.owlv2.vision_model.post_layernorm(last_hidden_state) # Resize class token class_token_out = torch.broadcast_to(image_embeds[:, :1, :], image_embeds[:, :-1].shape) # Merge image embedding with class tokens image_embeds = image_embeds[:, 1:, :] * class_token_out image_embeds = self.layer_norm(image_embeds) # Resize to [batch_size, num_patches, num_patches, hidden_size] new_size = ( image_embeds.shape[0], self.sqrt_num_patches, self.sqrt_num_patches, image_embeds.shape[-1], ) image_embeds = image_embeds.reshape(new_size) return (image_embeds, vision_outputs) # Copied from transformers.models.owlvit.modeling_owlvit.OwlViTForObjectDetection.embed_image_query def embed_image_query( self, query_image_features: torch.FloatTensor, query_feature_map: torch.FloatTensor ) -> torch.FloatTensor: _, class_embeds = self.class_predictor(query_image_features) pred_boxes = self.box_predictor(query_image_features, query_feature_map) pred_boxes_as_corners = center_to_corners_format(pred_boxes) # Loop over query images best_class_embeds = [] best_box_indices = [] pred_boxes_device = pred_boxes_as_corners.device for i in range(query_image_features.shape[0]): each_query_box = torch.tensor([[0, 0, 1, 1]], device=pred_boxes_device) each_query_pred_boxes = pred_boxes_as_corners[i] ious, _ = box_iou(each_query_box, each_query_pred_boxes) # If there are no overlapping boxes, fall back to generalized IoU if torch.all(ious[0] == 0.0): ious = generalized_box_iou(each_query_box, each_query_pred_boxes) # Use an adaptive threshold to include all boxes within 80% of the best IoU iou_threshold = torch.max(ious) * 0.8 selected_inds = (ious[0] >= iou_threshold).nonzero() if selected_inds.numel(): selected_embeddings = class_embeds[i][selected_inds.squeeze(1)] mean_embeds = torch.mean(class_embeds[i], axis=0) mean_sim = torch.einsum("d,id->i", mean_embeds, selected_embeddings) best_box_ind = selected_inds[torch.argmin(mean_sim)] best_class_embeds.append(class_embeds[i][best_box_ind]) best_box_indices.append(best_box_ind) if best_class_embeds: query_embeds = torch.stack(best_class_embeds) box_indices = torch.stack(best_box_indices) else: query_embeds, box_indices = None, None return query_embeds, box_indices, pred_boxes @add_start_docstrings_to_model_forward(OWLV2_IMAGE_GUIDED_OBJECT_DETECTION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Owlv2ImageGuidedObjectDetectionOutput, config_class=Owlv2Config) def image_guided_detection( self, pixel_values: torch.FloatTensor, query_pixel_values: Optional[torch.FloatTensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Owlv2ImageGuidedObjectDetectionOutput: r""" Returns: Examples: ```python >>> import requests >>> from PIL import Image >>> import torch >>> import numpy as np >>> from transformers import AutoProcessor, Owlv2ForObjectDetection >>> from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg" >>> query_image = Image.open(requests.get(query_url, stream=True).raw) >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt") >>> # forward pass >>> with torch.no_grad(): ... outputs = model.image_guided_detection(**inputs) >>> # Note: boxes need to be visualized on the padded, unnormalized image >>> # hence we'll set the target image sizes (height, width) based on that >>> def get_preprocessed_image(pixel_values): ... pixel_values = pixel_values.squeeze().numpy() ... unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None] ... unnormalized_image = (unnormalized_image * 255).astype(np.uint8) ... unnormalized_image = np.moveaxis(unnormalized_image, 0, -1) ... unnormalized_image = Image.fromarray(unnormalized_image) ... return unnormalized_image >>> unnormalized_image = get_preprocessed_image(inputs.pixel_values) >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]]) >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax) >>> results = processor.post_process_image_guided_detection( ... outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes ... ) >>> i = 0 # Retrieve predictions for the first image >>> boxes, scores = results[i]["boxes"], results[i]["scores"] >>> for box, score in zip(boxes, scores): ... box = [round(i, 2) for i in box.tolist()] ... print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}") Detected similar object with confidence 0.938 at location [490.96, 109.89, 821.09, 536.11] Detected similar object with confidence 0.959 at location [8.67, 721.29, 928.68, 732.78] Detected similar object with confidence 0.902 at location [4.27, 720.02, 941.45, 761.59] Detected similar object with confidence 0.985 at location [265.46, -58.9, 1009.04, 365.66] Detected similar object with confidence 1.0 at location [9.79, 28.69, 937.31, 941.64] Detected similar object with confidence 0.998 at location [869.97, 58.28, 923.23, 978.1] Detected similar object with confidence 0.985 at location [309.23, 21.07, 371.61, 932.02] Detected similar object with confidence 0.947 at location [27.93, 859.45, 969.75, 915.44] Detected similar object with confidence 0.996 at location [785.82, 41.38, 880.26, 966.37] Detected similar object with confidence 0.998 at location [5.08, 721.17, 925.93, 998.41] Detected similar object with confidence 0.969 at location [6.7, 898.1, 921.75, 949.51] Detected similar object with confidence 0.966 at location [47.16, 927.29, 981.99, 942.14] Detected similar object with confidence 0.924 at location [46.4, 936.13, 953.02, 950.78] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.return_dict # Compute feature maps for the input and query images query_feature_map = self.image_embedder(pixel_values=query_pixel_values)[0] feature_map, vision_outputs = self.image_embedder( pixel_values=pixel_values, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) batch_size, num_patches, num_patches, hidden_dim = feature_map.shape image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim)) batch_size, num_patches, num_patches, hidden_dim = query_feature_map.shape query_image_feats = torch.reshape(query_feature_map, (batch_size, num_patches * num_patches, hidden_dim)) # Get top class embedding and best box index for each query image in batch query_embeds, best_box_indices, query_pred_boxes = self.embed_image_query(query_image_feats, query_feature_map) # Predict object classes [batch_size, num_patches, num_queries+1] (pred_logits, class_embeds) = self.class_predictor(image_feats=image_feats, query_embeds=query_embeds) # Predict object boxes target_pred_boxes = self.box_predictor(image_feats, feature_map) if not return_dict: output = ( feature_map, query_feature_map, target_pred_boxes, query_pred_boxes, pred_logits, class_embeds, vision_outputs.to_tuple(), ) output = tuple(x for x in output if x is not None) return output return Owlv2ImageGuidedObjectDetectionOutput( image_embeds=feature_map, query_image_embeds=query_feature_map, target_pred_boxes=target_pred_boxes, query_pred_boxes=query_pred_boxes, logits=pred_logits, class_embeds=class_embeds, text_model_output=None, vision_model_output=vision_outputs, ) @add_start_docstrings_to_model_forward(OWLV2_OBJECT_DETECTION_INPUTS_DOCSTRING) @replace_return_docstrings(output_type=Owlv2ObjectDetectionOutput, config_class=Owlv2Config) def forward( self, input_ids: torch.Tensor, pixel_values: torch.FloatTensor, attention_mask: Optional[torch.Tensor] = None, output_attentions: Optional[bool] = None, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None, ) -> Owlv2ObjectDetectionOutput: r""" Returns: Examples: ```python >>> import requests >>> from PIL import Image >>> import numpy as np >>> import torch >>> from transformers import AutoProcessor, Owlv2ForObjectDetection >>> from transformers.utils.constants import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble") >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble") >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg" >>> image = Image.open(requests.get(url, stream=True).raw) >>> texts = [["a photo of a cat", "a photo of a dog"]] >>> inputs = processor(text=texts, images=image, return_tensors="pt") >>> # forward pass >>> with torch.no_grad(): ... outputs = model(**inputs) >>> # Note: boxes need to be visualized on the padded, unnormalized image >>> # hence we'll set the target image sizes (height, width) based on that >>> def get_preprocessed_image(pixel_values): ... pixel_values = pixel_values.squeeze().numpy() ... unnormalized_image = (pixel_values * np.array(OPENAI_CLIP_STD)[:, None, None]) + np.array(OPENAI_CLIP_MEAN)[:, None, None] ... unnormalized_image = (unnormalized_image * 255).astype(np.uint8) ... unnormalized_image = np.moveaxis(unnormalized_image, 0, -1) ... unnormalized_image = Image.fromarray(unnormalized_image) ... return unnormalized_image >>> unnormalized_image = get_preprocessed_image(inputs.pixel_values) >>> target_sizes = torch.Tensor([unnormalized_image.size[::-1]]) >>> # Convert outputs (bounding boxes and class logits) to final bounding boxes and scores >>> results = processor.post_process_object_detection( ... outputs=outputs, threshold=0.2, target_sizes=target_sizes ... ) >>> i = 0 # Retrieve predictions for the first image for the corresponding text queries >>> text = texts[i] >>> boxes, scores, labels = results[i]["boxes"], results[i]["scores"], results[i]["labels"] >>> for box, score, label in zip(boxes, scores, labels): ... box = [round(i, 2) for i in box.tolist()] ... print(f"Detected {text[label]} with confidence {round(score.item(), 3)} at location {box}") Detected a photo of a cat with confidence 0.614 at location [512.5, 35.08, 963.48, 557.02] Detected a photo of a cat with confidence 0.665 at location [10.13, 77.94, 489.93, 709.69] ```""" output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions output_hidden_states = ( output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states ) return_dict = return_dict if return_dict is not None else self.config.return_dict # Embed images and text queries query_embeds, feature_map, outputs = self.image_text_embedder( input_ids=input_ids, pixel_values=pixel_values, attention_mask=attention_mask, output_attentions=output_attentions, output_hidden_states=output_hidden_states, ) # Text and vision model outputs text_outputs = outputs.text_model_output vision_outputs = outputs.vision_model_output batch_size, num_patches, num_patches, hidden_dim = feature_map.shape image_feats = torch.reshape(feature_map, (batch_size, num_patches * num_patches, hidden_dim)) # Reshape from [batch_size * max_text_queries, hidden_dim] -> [batch_size, max_text_queries, hidden_dim] max_text_queries = input_ids.shape[0] // batch_size query_embeds = query_embeds.reshape(batch_size, max_text_queries, query_embeds.shape[-1]) # If first token is 0, then this is a padded query [batch_size, num_queries]. input_ids = input_ids.reshape(batch_size, max_text_queries, input_ids.shape[-1]) query_mask = input_ids[..., 0] > 0 # Predict object classes [batch_size, num_patches, num_queries+1] (pred_logits, class_embeds) = self.class_predictor(image_feats, query_embeds, query_mask) # Predict objectness objectness_logits = self.objectness_predictor(image_feats) # Predict object boxes pred_boxes = self.box_predictor(image_feats, feature_map) if not return_dict: output = ( pred_logits, objectness_logits, pred_boxes, query_embeds, feature_map, class_embeds, text_outputs.to_tuple(), vision_outputs.to_tuple(), ) output = tuple(x for x in output if x is not None) return output return Owlv2ObjectDetectionOutput( image_embeds=feature_map, text_embeds=query_embeds, pred_boxes=pred_boxes, logits=pred_logits, objectness_logits=objectness_logits, class_embeds=class_embeds, text_model_output=text_outputs, vision_model_output=vision_outputs, )