119 lines
4.3 KiB
Python
119 lines
4.3 KiB
Python
# coding=utf-8
|
|
# Copyright 2022-present, the HuggingFace Inc. team.
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
"""Contains utilities to handle paths in Huggingface Hub."""
|
|
|
|
from fnmatch import fnmatch
|
|
from pathlib import Path
|
|
from typing import Callable, Generator, Iterable, List, Optional, TypeVar, Union
|
|
|
|
|
|
T = TypeVar("T")

# Shell-style patterns matching a `.git` folder and its content, either at the root
# of a tree or nested inside it.
IGNORE_GIT_FOLDER_PATTERNS = [".git", ".git/*", "*/.git", "**/.git/**"]


def filter_repo_objects(
    items: Iterable[T],
    *,
    allow_patterns: Optional[Union[List[str], str]] = None,
    ignore_patterns: Optional[Union[List[str], str]] = None,
    key: Optional[Callable[[T], str]] = None,
) -> Generator[T, None, None]:
    """Filter repo objects based on an allowlist and a denylist.

    Input must be a list of paths (`str` or `Path`) or a list of arbitrary objects.
    In the latter case, `key` must be provided and specifies a function of one argument
    that is used to extract a path from each element in iterable.

    Patterns are Unix shell-style wildcards which are NOT regular expressions. See
    https://docs.python.org/3/library/fnmatch.html for more details. In particular:
    - `*` also matches the path separator, so `"*.pdf"` matches `"docs/aaa.pdf"`.
    - Matching goes through `fnmatch.fnmatch`, which normalizes case the way the
      current platform does: it is case-sensitive on POSIX systems and
      case-insensitive on Windows.

    Args:
        items (`Iterable`):
            List of items to filter.
        allow_patterns (`str` or `List[str]`, *optional*):
            Patterns constituting the allowlist. If provided, item paths must match at
            least one pattern from the allowlist. An empty list therefore filters out
            every item.
        ignore_patterns (`str` or `List[str]`, *optional*):
            Patterns constituting the denylist. If provided, item paths must not match
            any patterns from the denylist.
        key (`Callable[[T], str]`, *optional*):
            Single-argument function to extract a path from each item. If not provided,
            the `items` must already be `str` or `Path`.

    Returns:
        Filtered list of objects, as a generator. Items are yielded unchanged and in
        their original order.

    Raises:
        :class:`ValueError`:
            If `key` is not provided and items are not `str` or `Path`. As this function
            is a generator, the error is raised while iterating, when the offending item
            is reached.

    Example usage with paths:
    ```python
    >>> # Filter only PDFs that are not hidden.
    >>> list(filter_repo_objects(
    ...     ["aaa.pdf", "bbb.jpg", ".ccc.pdf", ".ddd.png"],
    ...     allow_patterns=["*.pdf"],
    ...     ignore_patterns=[".*"],
    ... ))
    ["aaa.pdf"]
    ```

    Example usage with objects:
    ```python
    >>> list(filter_repo_objects(
    ...     [
    ...         CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/bbb.jpg", path_in_repo="bbb.jpg"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/.ccc.pdf", path_in_repo=".ccc.pdf"),
    ...         CommitOperationAdd(path_or_fileobj="/tmp/.ddd.png", path_in_repo=".ddd.png"),
    ...     ],
    ...     allow_patterns=["*.pdf"],
    ...     ignore_patterns=[".*"],
    ...     key=lambda x: x.path_in_repo,
    ... ))
    [CommitOperationAdd(path_or_fileobj="/tmp/aaa.pdf", path_in_repo="aaa.pdf")]
    ```
    """
    # Normalize both pattern arguments to lists once, before the loop. The patterns
    # are re-scanned for every item, so a one-shot iterable (e.g. a generator) would
    # otherwise be exhausted after the first item and silently stop filtering.
    if isinstance(allow_patterns, str):
        allow_patterns = [allow_patterns]
    elif allow_patterns is not None:
        allow_patterns = list(allow_patterns)

    if isinstance(ignore_patterns, str):
        ignore_patterns = [ignore_patterns]
    elif ignore_patterns is not None:
        ignore_patterns = list(ignore_patterns)

    if key is None:

        def _identity(item: T) -> str:
            if isinstance(item, str):
                return item
            if isinstance(item, Path):
                return str(item)
            raise ValueError(f"Please provide `key` argument in `filter_repo_objects`: `{item}` is not a string.")

        key = _identity  # Items must be `str` or `Path`, otherwise raise ValueError

    for item in items:
        path = key(item)

        # Skip if there's an allowlist and path doesn't match any
        if allow_patterns is not None and not any(fnmatch(path, r) for r in allow_patterns):
            continue

        # Skip if there's a denylist and path matches any
        if ignore_patterns is not None and any(fnmatch(path, r) for r in ignore_patterns):
            continue

        yield item
|