# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
|
"""
Helpful utility functions and classes in relation to exploring API endpoints
with the aim for a user-friendly interface.
"""
||
|
import math
|
||
|
import re
|
||
|
import warnings
|
||
|
from dataclasses import dataclass
|
||
|
from typing import TYPE_CHECKING, List, Optional, Union
|
||
|
|
||
|
from ..repocard_data import ModelCardData
|
||
|
|
||
|
|
||
|
if TYPE_CHECKING:
|
||
|
from ..hf_api import ModelInfo
|
||
|
|
||
|
|
||
|
def _is_emission_within_treshold(model_info: "ModelInfo", minimum_threshold: float, maximum_threshold: float) -> bool:
|
||
|
"""Checks if a model's emission is within a given threshold.
|
||
|
|
||
|
Args:
|
||
|
model_info (`ModelInfo`):
|
||
|
A model info object containing the model's emission information.
|
||
|
minimum_threshold (`float`):
|
||
|
A minimum carbon threshold to filter by, such as 1.
|
||
|
maximum_threshold (`float`):
|
||
|
A maximum carbon threshold to filter by, such as 10.
|
||
|
|
||
|
Returns:
|
||
|
`bool`: Whether the model's emission is within the given threshold.
|
||
|
"""
|
||
|
if minimum_threshold is None and maximum_threshold is None:
|
||
|
raise ValueError("Both `minimum_threshold` and `maximum_threshold` cannot both be `None`")
|
||
|
if minimum_threshold is None:
|
||
|
minimum_threshold = -1
|
||
|
if maximum_threshold is None:
|
||
|
maximum_threshold = math.inf
|
||
|
|
||
|
card_data = getattr(model_info, "card_data", None)
|
||
|
if card_data is None or not isinstance(card_data, (dict, ModelCardData)):
|
||
|
return False
|
||
|
|
||
|
# Get CO2 emission metadata
|
||
|
emission = card_data.get("co2_eq_emissions", None)
|
||
|
if isinstance(emission, dict):
|
||
|
emission = emission["emissions"]
|
||
|
if not emission:
|
||
|
return False
|
||
|
|
||
|
# Filter out if value is missing or out of range
|
||
|
matched = re.search(r"\d+\.\d+|\d+", str(emission))
|
||
|
if matched is None:
|
||
|
return False
|
||
|
|
||
|
emission_value = float(matched.group(0))
|
||
|
return minimum_threshold <= emission_value <= maximum_threshold
|
||
|
|
||
|
|
||
|
@dataclass
class DatasetFilter:
    """
    A class that converts human-readable dataset search parameters into ones
    compatible with the REST API. For all parameters capitalization does not
    matter.

    <Tip warning={true}>

    The `DatasetFilter` class is deprecated and will be removed in huggingface_hub>=0.24. Please pass the filter parameters as keyword arguments directly to [`list_datasets`].

    </Tip>

    Args:
        author (`str`, *optional*):
            A string that can be used to identify datasets on
            the Hub by the original uploader (author or organization), such as
            `facebook` or `huggingface`.
        benchmark (`str` or `List`, *optional*):
            A string or list of strings that can be used to identify datasets on
            the Hub by their official benchmark.
        dataset_name (`str`, *optional*):
            A string or list of strings that can be used to identify datasets on
            the Hub by its name, such as `SQAC` or `wikineural`
        language_creators (`str` or `List`, *optional*):
            A string or list of strings that can be used to identify datasets on
            the Hub with how the data was curated, such as `crowdsourced` or
            `machine_generated`.
        language (`str` or `List`, *optional*):
            A string or list of strings representing a two-character language to
            filter datasets by on the Hub.
        multilinguality (`str` or `List`, *optional*):
            A string or list of strings representing a filter for datasets that
            contain multiple languages.
        size_categories (`str` or `List`, *optional*):
            A string or list of strings that can be used to identify datasets on
            the Hub by the size of the dataset such as `100K<n<1M` or
            `1M<n<10M`.
        task_categories (`str` or `List`, *optional*):
            A string or list of strings that can be used to identify datasets on
            the Hub by the designed task, such as `audio_classification` or
            `named_entity_recognition`.
        task_ids (`str` or `List`, *optional*):
            A string or list of strings that can be used to identify datasets on
            the Hub by the specific task such as `speech_emotion_recognition` or
            `paraphrase`.

    Examples:

    ```py
    >>> from huggingface_hub import DatasetFilter

    >>> # Using author
    >>> new_filter = DatasetFilter(author="facebook")

    >>> # Using benchmark
    >>> new_filter = DatasetFilter(benchmark="raft")

    >>> # Using dataset_name
    >>> new_filter = DatasetFilter(dataset_name="wikineural")

    >>> # Using language_creators
    >>> new_filter = DatasetFilter(language_creators="crowdsourced")

    >>> # Using language
    >>> new_filter = DatasetFilter(language="en")

    >>> # Using multilinguality
    >>> new_filter = DatasetFilter(multilinguality="multilingual")

    >>> # Using size_categories
    >>> new_filter = DatasetFilter(size_categories="100K<n<1M")

    >>> # Using task_categories
    >>> new_filter = DatasetFilter(task_categories="audio_classification")

    >>> # Using task_ids
    >>> new_filter = DatasetFilter(task_ids="paraphrase")
    ```
    """

    author: Optional[str] = None
    benchmark: Optional[Union[str, List[str]]] = None
    dataset_name: Optional[str] = None
    language_creators: Optional[Union[str, List[str]]] = None
    language: Optional[Union[str, List[str]]] = None
    multilinguality: Optional[Union[str, List[str]]] = None
    size_categories: Optional[Union[str, List[str]]] = None
    task_categories: Optional[Union[str, List[str]]] = None
    task_ids: Optional[Union[str, List[str]]] = None

    def __post_init__(self):
        # Emit the deprecation warning on every instantiation so callers are
        # nudged toward passing these values directly to `list_datasets`.
        warnings.warn(
            "'DatasetFilter' is deprecated and will be removed in huggingface_hub>=0.24. Please pass the filter parameters as keyword arguments directly to the `list_datasets` method.",
            category=FutureWarning,
        )
|
||
|
|
||
|
|
||
|
@dataclass
class ModelFilter:
    """
    A class that converts human-readable model search parameters into ones
    compatible with the REST API. For all parameters capitalization does not
    matter.

    <Tip warning={true}>

    The `ModelFilter` class is deprecated and will be removed in huggingface_hub>=0.24. Please pass the filter parameters as keyword arguments directly to [`list_models`].

    </Tip>

    Args:
        author (`str`, *optional*):
            A string that can be used to identify models on the Hub by the
            original uploader (author or organization), such as `facebook` or
            `huggingface`.
        library (`str` or `List`, *optional*):
            A string or list of strings of foundational libraries models were
            originally trained from, such as pytorch, tensorflow, or allennlp.
        language (`str` or `List`, *optional*):
            A string or list of strings of languages, both by name and country
            code, such as "en" or "English"
        model_name (`str`, *optional*):
            A string that contain complete or partial names for models on the
            Hub, such as "bert" or "bert-base-cased"
        task (`str` or `List`, *optional*):
            A string or list of strings of tasks models were designed for, such
            as: "fill-mask" or "automatic-speech-recognition"
        tags (`str` or `List`, *optional*):
            A string tag or a list of tags to filter models on the Hub by, such
            as `text-generation` or `spacy`.
        trained_dataset (`str` or `List`, *optional*):
            A string tag or a list of string tags of the trained dataset for a
            model on the Hub.

    Examples:

    ```python
    >>> from huggingface_hub import ModelFilter

    >>> # For the author
    >>> new_filter = ModelFilter(author="facebook")

    >>> # For the library
    >>> new_filter = ModelFilter(library="pytorch")

    >>> # For the language
    >>> new_filter = ModelFilter(language="french")

    >>> # For the model_name
    >>> new_filter = ModelFilter(model_name="bert")

    >>> # For the task
    >>> new_filter = ModelFilter(task="text-classification")

    >>> # For the tags
    >>> new_filter = ModelFilter(tags="benchmark:raft")

    >>> # Related to the dataset
    >>> new_filter = ModelFilter(trained_dataset="common_voice")
    ```
    """

    author: Optional[str] = None
    library: Optional[Union[str, List[str]]] = None
    language: Optional[Union[str, List[str]]] = None
    model_name: Optional[str] = None
    task: Optional[Union[str, List[str]]] = None
    trained_dataset: Optional[Union[str, List[str]]] = None
    tags: Optional[Union[str, List[str]]] = None

    def __post_init__(self):
        # Emit the deprecation warning on every instantiation so callers are
        # nudged toward passing these values directly to `list_models`.
        warnings.warn(
            "'ModelFilter' is deprecated and will be removed in huggingface_hub>=0.24. Please pass the filter parameters as keyword arguments directly to the `list_models` method.",
            FutureWarning,
        )
|