ai-content-maker/.venv/Lib/site-packages/transformers/pipelines/visual_question_answering.py

152 lines
6.7 KiB
Python

from typing import Union
from ..utils import add_end_docstrings, is_torch_available, is_vision_available, logging
from .base import Pipeline, build_pipeline_init_args
if is_vision_available():
from PIL import Image
from ..image_utils import load_image
if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES
logger = logging.get_logger(__name__)
@add_end_docstrings(build_pipeline_init_args(has_tokenizer=True, has_image_processor=True))
class VisualQuestionAnsweringPipeline(Pipeline):
"""
Visual Question Answering pipeline using a `AutoModelForVisualQuestionAnswering`. This pipeline is currently only
available in PyTorch.
Example:
```python
>>> from transformers import pipeline
>>> oracle = pipeline(model="dandelin/vilt-b32-finetuned-vqa")
>>> image_url = "https://huggingface.co/datasets/Narsil/image_dummy/raw/main/lena.png"
>>> oracle(question="What is she wearing ?", image=image_url)
[{'score': 0.948, 'answer': 'hat'}, {'score': 0.009, 'answer': 'fedora'}, {'score': 0.003, 'answer': 'clothes'}, {'score': 0.003, 'answer': 'sun hat'}, {'score': 0.002, 'answer': 'nothing'}]
>>> oracle(question="What is she wearing ?", image=image_url, top_k=1)
[{'score': 0.948, 'answer': 'hat'}]
>>> oracle(question="Is this a person ?", image=image_url, top_k=1)
[{'score': 0.993, 'answer': 'yes'}]
>>> oracle(question="Is this a man ?", image=image_url, top_k=1)
[{'score': 0.996, 'answer': 'no'}]
```
Learn more about the basics of using a pipeline in the [pipeline tutorial](../pipeline_tutorial)
This visual question answering pipeline can currently be loaded from [`pipeline`] using the following task
identifiers: `"visual-question-answering", "vqa"`.
The models that this pipeline can use are models that have been fine-tuned on a visual question answering task. See
the up-to-date list of available models on
[huggingface.co/models](https://huggingface.co/models?filter=visual-question-answering).
"""
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.check_model_type(MODEL_FOR_VISUAL_QUESTION_ANSWERING_MAPPING_NAMES)
def _sanitize_parameters(self, top_k=None, padding=None, truncation=None, timeout=None, **kwargs):
preprocess_params, postprocess_params = {}, {}
if padding is not None:
preprocess_params["padding"] = padding
if truncation is not None:
preprocess_params["truncation"] = truncation
if timeout is not None:
preprocess_params["timeout"] = timeout
if top_k is not None:
postprocess_params["top_k"] = top_k
return preprocess_params, {}, postprocess_params
def __call__(self, image: Union["Image.Image", str], question: str = None, **kwargs):
r"""
Answers open-ended questions about images. The pipeline accepts several types of inputs which are detailed
below:
- `pipeline(image=image, question=question)`
- `pipeline({"image": image, "question": question})`
- `pipeline([{"image": image, "question": question}])`
- `pipeline([{"image": image, "question": question}, {"image": image, "question": question}])`
Args:
image (`str`, `List[str]`, `PIL.Image` or `List[PIL.Image]`):
The pipeline handles three types of images:
- A string containing a http link pointing to an image
- A string containing a local path to an image
- An image loaded in PIL directly
The pipeline accepts either a single image or a batch of images. If given a single image, it can be
broadcasted to multiple questions.
question (`str`, `List[str]`):
The question(s) asked. If given a single question, it can be broadcasted to multiple images.
top_k (`int`, *optional*, defaults to 5):
The number of top labels that will be returned by the pipeline. If the provided number is higher than
the number of labels available in the model configuration, it will default to the number of labels.
timeout (`float`, *optional*, defaults to None):
The maximum time in seconds to wait for fetching images from the web. If None, no timeout is set and
the call may block forever.
Return:
A dictionary or a list of dictionaries containing the result. The dictionaries contain the following keys:
- **label** (`str`) -- The label identified by the model.
- **score** (`int`) -- The score attributed by the model for that label.
"""
if isinstance(image, (Image.Image, str)) and isinstance(question, str):
inputs = {"image": image, "question": question}
else:
"""
Supports the following format
- {"image": image, "question": question}
- [{"image": image, "question": question}]
- Generator and datasets
"""
inputs = image
results = super().__call__(inputs, **kwargs)
return results
def preprocess(self, inputs, padding=False, truncation=False, timeout=None):
image = load_image(inputs["image"], timeout=timeout)
model_inputs = self.tokenizer(
inputs["question"], return_tensors=self.framework, padding=padding, truncation=truncation
)
image_features = self.image_processor(images=image, return_tensors=self.framework)
model_inputs.update(image_features)
return model_inputs
def _forward(self, model_inputs, **generate_kwargs):
if self.model.can_generate():
model_outputs = self.model.generate(**model_inputs, **generate_kwargs)
else:
model_outputs = self.model(**model_inputs)
return model_outputs
def postprocess(self, model_outputs, top_k=5):
if self.model.can_generate():
return [
{"answer": self.tokenizer.decode(output_ids, skip_special_tokens=True).strip()}
for output_ids in model_outputs
]
else:
if top_k > self.model.config.num_labels:
top_k = self.model.config.num_labels
if self.framework == "pt":
probs = model_outputs.logits.sigmoid()[0]
scores, ids = probs.topk(top_k)
else:
raise ValueError(f"Unsupported framework: {self.framework}")
scores = scores.tolist()
ids = ids.tolist()
return [{"score": score, "answer": self.model.config.id2label[_id]} for score, _id in zip(scores, ids)]