ai-content-maker/.venv/Lib/site-packages/TTS/utils/samplers.py

import math
import random
from typing import Callable, List, Union

from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler


class SubsetSampler(Sampler):
    """
    Samples elements sequentially from a given list of indices.

    Unlike ``SubsetRandomSampler`` the indices are yielded in exactly the
    order in which they were given, so iteration is deterministic.

    Args:
        indices (list): a sequence of indices
    """

    def __init__(self, indices):
        # `Sampler.__init__` is intentionally not called with `indices`: its
        # `data_source` argument is unused by torch and deprecated (passing it
        # emits a warning on torch >= 2.1). The base initializer keeps no
        # state, so skipping it is safe on every torch version.
        self.indices = indices

    def __iter__(self):
        """Return a fresh iterator over the stored indices, in order."""
        return iter(self.indices)

    def __len__(self):
        """Return the number of indices held by the sampler."""
        return len(self.indices)


class PerfectBatchSampler(Sampler):
    """
    Samples mini-batches of indices that are balanced across classes.

    Every batch is built in "rounds": one index is drawn from each selected
    class per round, so each selected class contributes the same number of
    items (``batch_size // num_classes_in_batch``) to a batch. Iteration stops
    as soon as one of the selected classes runs out of items.

    Args:
        dataset_items(list): dataset items to sample from. Each item must be
            indexable by ``label_key``.
        classes (list): list of classes of dataset_items to sample from. Every
            class must occur at least once in ``dataset_items`` (a ``KeyError``
            is raised otherwise).
        batch_size (int): total number of samples to be sampled in a mini-batch.
        num_classes_in_batch (int): number of distinct classes in a mini-batch.
            When it differs from ``len(classes)`` a random subset of classes is
            drawn (with ``random.sample``) for every batch.
        num_gpus (int): number of GPU in the data parallel mode.
        shuffle (bool): if True, samples randomly, otherwise samples sequentially.
        drop_last (bool): if True, drops last incomplete batch.
        label_key (str): key used to read the class label from a dataset item.
    """

    def __init__(
        self,
        dataset_items,
        classes,
        batch_size,
        num_classes_in_batch,
        num_gpus=1,
        shuffle=True,
        drop_last=False,
        label_key="class_name",
    ):
        # `Sampler.__init__` is intentionally not called with `dataset_items`:
        # its `data_source` argument is unused by torch and deprecated (passing
        # it emits a warning on torch >= 2.1). The base initializer keeps no
        # state, so skipping it is safe on every torch version.
        assert (
            batch_size % (num_classes_in_batch * num_gpus) == 0
        ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."

        # Group dataset positions by class label, preserving dataset order.
        label_indices = {}
        for idx, item in enumerate(dataset_items):
            label_indices.setdefault(item[label_key], []).append(idx)

        # One independent sampler per class, in the order given by `classes`.
        sampler_cls = SubsetRandomSampler if shuffle else SubsetSampler
        self._samplers = [sampler_cls(label_indices[key]) for key in classes]

        self._batch_size = batch_size
        self._drop_last = drop_last
        self._dp_devices = num_gpus
        self._num_classes_in_batch = num_classes_in_batch

    def _draw_active_classes(self):
        """Pick the class samplers used for the next batch.

        Returns ``None`` when every class takes part in every batch, otherwise
        a set with ``num_classes_in_batch`` randomly chosen sampler positions.
        """
        if self._num_classes_in_batch == len(self._samplers):
            return None
        return set(random.sample(range(len(self._samplers)), self._num_classes_in_batch))

    def __iter__(self):
        """Yield lists of dataset indices, one list per mini-batch."""
        active = self._draw_active_classes()
        class_iters = [iter(s) for s in self._samplers]
        batch = []
        exhausted = False

        while not exhausted:
            # One round: a single index from each active class.
            round_items = []
            for class_idx, class_iter in enumerate(class_iters):
                if active is not None and class_idx not in active:
                    continue
                item = next(class_iter, None)
                if item is None:
                    # An active class ran dry; the partial round is discarded
                    # so that batches stay balanced.
                    exhausted = True
                    break
                round_items.append(item)
            if exhausted:
                break

            batch += round_items
            if len(batch) == self._batch_size:
                yield batch
                batch = []
                if active is not None:
                    # Re-draw the participating classes for the next batch.
                    active = self._draw_active_classes()

        if self._drop_last or not batch:
            return

        # The leftover batch consists of whole rounds. Keep only as many rounds
        # as can be split evenly across the data parallel devices.
        groups = len(batch) // self._num_classes_in_batch
        usable_groups = (groups // self._dp_devices) * self._dp_devices
        tail = batch[: usable_groups * self._num_classes_in_batch]
        if tail:
            yield tail

    def __len__(self):
        """Estimate the number of batches per epoch.

        The value is derived from the smallest class only. It does not take
        ``drop_last``, the number of data parallel devices or the random class
        selection into account, so the number of batches actually yielded can
        be lower.
        """
        class_batch_size = self._batch_size // self._num_classes_in_batch
        return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers)


def identity(x):
    """Return the argument unchanged.

    Serves as the default ``sort_key`` of the samplers defined in this module.
    """
    return x


class SortedSampler(Sampler):
    """Samples elements sequentially, always in the same order.

    The positions of ``data`` are sorted once, at construction time, by the
    value ``sort_key`` returns for each element. The sort is stable, so elements
    with equal keys keep their original relative order.

    Taken from https://github.com/PetrochukM/PyTorch-NLP

    Args:
        data (iterable): Iterable data.
        sort_key (callable): Specifies a function of one argument that is used to extract a
            numerical comparison key from each list element.

    Example:
        >>> list(SortedSampler(range(10), sort_key=lambda i: -i))
        [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

    """

    def __init__(self, data, sort_key: Callable = identity):
        # `Sampler.__init__` is intentionally not called with `data`: its
        # `data_source` argument is unused by torch and deprecated (passing it
        # emits a warning on torch >= 2.1). The base initializer keeps no
        # state, so skipping it is safe on every torch version.
        self.data = data
        self.sort_key = sort_key
        # Evaluate the key exactly once per element, then order the positions.
        keys = [self.sort_key(row) for row in self.data]
        self.sorted_indexes = sorted(range(len(keys)), key=keys.__getitem__)

    def __iter__(self):
        """Return an iterator over the positions of ``data`` in sorted order."""
        return iter(self.sorted_indexes)

    def __len__(self):
        """Return the number of sampled elements."""
        # Use the materialised index list rather than `self.data`, which is
        # only required to be iterable and may not support `len()`.
        return len(self.sorted_indexes)


class BucketBatchSampler(BatchSampler):
    """Batch sampler that groups samples with similar sort keys.

    Indices drawn from ``sampler`` are first collected into large buckets. Each
    bucket is sorted by ``sort_key``, cut into mini-batches and the mini-batches
    of the bucket are then yielded in random order.

    Adapted from https://github.com/PetrochukM/PyTorch-NLP

    Args:
        sampler (torch.data.utils.sampler.Sampler):
        batch_size (int): Size of mini-batch.
        drop_last (bool): If `True` the sampler will drop the last batch if its size would be less
            than `batch_size`.
        data (list): List of data samples.
        sort_key (callable, optional): Callable to specify a comparison key for sorting.
        bucket_size_multiplier (int, optional): Buckets are of size
            `batch_size * bucket_size_multiplier`.

    Example:
        >>> sampler = WeightedRandomSampler(weights, len(weights))
        >>> sampler = BucketBatchSampler(sampler, data=data_items, batch_size=32, drop_last=True)
    """

    def __init__(
        self,
        sampler,
        data,
        batch_size,
        drop_last,
        sort_key: Union[Callable, List] = identity,
        bucket_size_multiplier=100,
    ):
        super().__init__(sampler, batch_size, drop_last)
        self.data = data
        self.sort_key = sort_key

        # A bucket never needs to be larger than the sampler itself.
        bucket_len = batch_size * bucket_size_multiplier
        if hasattr(sampler, "__len__"):
            bucket_len = min(bucket_len, len(sampler))

        # Buckets always keep their trailing, possibly shorter, chunk.
        self.bucket_sampler = BatchSampler(sampler, bucket_len, False)

    def __iter__(self):
        """Yield mini-batches of dataset indices, bucket by bucket."""
        for bucket in self.bucket_sampler:
            # Order the positions inside the bucket by the samples' sort key.
            by_key = SortedSampler([self.data[idx] for idx in bucket], self.sort_key)
            local_batches = list(BatchSampler(by_key, self.batch_size, self.drop_last))
            # Visit the bucket's mini-batches in random order and map the
            # bucket-local positions back to dataset indices.
            for local_batch in SubsetRandomSampler(local_batches):
                yield [bucket[pos] for pos in local_batch]

    def __len__(self):
        """Return the number of mini-batches for the wrapped sampler."""
        total = len(self.sampler)
        if not self.drop_last:
            return math.ceil(total / self.batch_size)
        return total // self.batch_size
first commit 2024-05-03 04:18:51 +03:00			`import math`
			`import random`
			`from typing import Callable, List, Union`

			`from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler`


			`class SubsetSampler(Sampler):`
			`"""`
			`Samples elements sequentially from a given list of indices.`

			`Args:`
			`indices (list): a sequence of indices`
			`"""`

			`def __init__(self, indices):`
			`super().__init__(indices)`
			`self.indices = indices`

			`def __iter__(self):`
			`return (self.indices[i] for i in range(len(self.indices)))`

			`def __len__(self):`
			`return len(self.indices)`


			`class PerfectBatchSampler(Sampler):`
			`"""`
			`Samples a mini-batch of indices for a balanced class batching`

			`Args:`
			`dataset_items(list): dataset items to sample from.`
			`classes (list): list of classes of dataset_items to sample from.`
			`batch_size (int): total number of samples to be sampled in a mini-batch.`
			`num_gpus (int): number of GPU in the data parallel mode.`
			`shuffle (bool): if True, samples randomly, otherwise samples sequentially.`
			`drop_last (bool): if True, drops last incomplete batch.`
			`"""`

			`def __init__(`
			`self,`
			`dataset_items,`
			`classes,`
			`batch_size,`
			`num_classes_in_batch,`
			`num_gpus=1,`
			`shuffle=True,`
			`drop_last=False,`
			`label_key="class_name",`
			`):`
			`super().__init__(dataset_items)`
			`assert (`
			`batch_size % (num_classes_in_batch * num_gpus) == 0`
			`), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."`

			`label_indices = {}`
			`for idx, item in enumerate(dataset_items):`
			`label = item[label_key]`
			`if label not in label_indices.keys():`
			`label_indices[label] = [idx]`
			`else:`
			`label_indices[label].append(idx)`

			`if shuffle:`
			`self._samplers = [SubsetRandomSampler(label_indices[key]) for key in classes]`
			`else:`
			`self._samplers = [SubsetSampler(label_indices[key]) for key in classes]`

			`self._batch_size = batch_size`
			`self._drop_last = drop_last`
			`self._dp_devices = num_gpus`
			`self._num_classes_in_batch = num_classes_in_batch`

			`def __iter__(self):`
			`batch = []`
			`if self._num_classes_in_batch != len(self._samplers):`
			`valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)`
			`else:`
			`valid_samplers_idx = None`

			`iters = [iter(s) for s in self._samplers]`
			`done = False`

			`while True:`
			`b = []`
			`for i, it in enumerate(iters):`
			`if valid_samplers_idx is not None and i not in valid_samplers_idx:`
			`continue`
			`idx = next(it, None)`
			`if idx is None:`
			`done = True`
			`break`
			`b.append(idx)`
			`if done:`
			`break`
			`batch += b`
			`if len(batch) == self._batch_size:`
			`yield batch`
			`batch = []`
			`if valid_samplers_idx is not None:`
			`valid_samplers_idx = random.sample(range(len(self._samplers)), self._num_classes_in_batch)`

			`if not self._drop_last:`
			`if len(batch) > 0:`
			`groups = len(batch) // self._num_classes_in_batch`
			`if groups % self._dp_devices == 0:`
			`yield batch`
			`else:`
			`batch = batch[: (groups // self._dp_devices) * self._dp_devices * self._num_classes_in_batch]`
			`if len(batch) > 0:`
			`yield batch`

			`def __len__(self):`
			`class_batch_size = self._batch_size // self._num_classes_in_batch`
			`return min(((len(s) + class_batch_size - 1) // class_batch_size) for s in self._samplers)`


			`def identity(x):`
			`return x`


			`class SortedSampler(Sampler):`
			`"""Samples elements sequentially, always in the same order.`

			`Taken from https://github.com/PetrochukM/PyTorch-NLP`

			`Args:`
			`data (iterable): Iterable data.`
			`sort_key (callable): Specifies a function of one argument that is used to extract a`
			`numerical comparison key from each list element.`

			`Example:`
			`>>> list(SortedSampler(range(10), sort_key=lambda i: -i))`
			`[9, 8, 7, 6, 5, 4, 3, 2, 1, 0]`

			`"""`

			`def __init__(self, data, sort_key: Callable = identity):`
			`super().__init__(data)`
			`self.data = data`
			`self.sort_key = sort_key`
			`zip_ = [(i, self.sort_key(row)) for i, row in enumerate(self.data)]`
			`zip_ = sorted(zip_, key=lambda r: r[1])`
			`self.sorted_indexes = [item[0] for item in zip_]`

			`def __iter__(self):`
			`return iter(self.sorted_indexes)`

			`def __len__(self):`
			`return len(self.data)`


			`class BucketBatchSampler(BatchSampler):`
			`"""Bucket batch sampler`

			`Adapted from https://github.com/PetrochukM/PyTorch-NLP`

			`Args:`
			`sampler (torch.data.utils.sampler.Sampler):`
			`batch_size (int): Size of mini-batch.`
			drop_last (bool): If `True` the sampler will drop the last batch if its size would be less
			than `batch_size`.
			`data (list): List of data samples.`
			`sort_key (callable, optional): Callable to specify a comparison key for sorting.`
			`bucket_size_multiplier (int, optional): Buckets are of size`
			`batch_size * bucket_size_multiplier`.

			`Example:`
			`>>> sampler = WeightedRandomSampler(weights, len(weights))`
			`>>> sampler = BucketBatchSampler(sampler, data=data_items, batch_size=32, drop_last=True)`
			`"""`

			`def __init__(`
			`self,`
			`sampler,`
			`data,`
			`batch_size,`
			`drop_last,`
			`sort_key: Union[Callable, List] = identity,`
			`bucket_size_multiplier=100,`
			`):`
			`super().__init__(sampler, batch_size, drop_last)`
			`self.data = data`
			`self.sort_key = sort_key`
			`_bucket_size = batch_size * bucket_size_multiplier`
			`if hasattr(sampler, "__len__"):`
			`_bucket_size = min(_bucket_size, len(sampler))`
			`self.bucket_sampler = BatchSampler(sampler, _bucket_size, False)`

			`def __iter__(self):`
			`for idxs in self.bucket_sampler:`
			`bucket_data = [self.data[idx] for idx in idxs]`
			`sorted_sampler = SortedSampler(bucket_data, self.sort_key)`
			`for batch_idx in SubsetRandomSampler(list(BatchSampler(sorted_sampler, self.batch_size, self.drop_last))):`
			`sorted_idxs = [idxs[i] for i in batch_idx]`
			`yield sorted_idxs`

			`def __len__(self):`
			`if self.drop_last:`
			`return len(self.sampler) // self.batch_size`
			`return math.ceil(len(self.sampler) / self.batch_size)`