202 lines
6.6 KiB
Python
202 lines
6.6 KiB
Python
import math
|
|
import random
|
|
from typing import Callable, List, Union
|
|
|
|
from torch.utils.data.sampler import BatchSampler, Sampler, SubsetRandomSampler
|
|
|
|
|
|
class SubsetSampler(Sampler):
    """
    Samples elements sequentially from a given list of indices.

    The indices are yielded in exactly the order in which they were given;
    no shuffling is applied.

    Args:
        indices (list): a sequence of indices
    """

    def __init__(self, indices):
        super().__init__(indices)
        self.indices = indices

    def __iter__(self):
        """Return an iterator over the stored indices, in their original order."""
        # Iterate the sequence directly rather than indexing through range(len(...)).
        return iter(self.indices)

    def __len__(self):
        """Return the number of stored indices."""
        return len(self.indices)
|
|
|
|
|
|
class PerfectBatchSampler(Sampler):
    """
    Samples a mini-batch of indices for a balanced class batching

    Every batch is assembled in rounds; each round takes one index from each
    participating class, so all participating classes contribute equally.
    Iteration stops as soon as one participating class runs out of indices.

    Args:
        dataset_items(list): dataset items to sample from.
        classes (list): list of classes of dataset_items to sample from.
        batch_size (int): total number of samples to be sampled in a mini-batch.
        num_classes_in_batch (int): number of classes that contribute to each mini-batch.
            When it differs from ``len(classes)``, a random subset of that many classes
            is drawn at the start and redrawn after every full batch.
        num_gpus (int): number of GPU in the data parallel mode.
        shuffle (bool): if True, samples randomly, otherwise samples sequentially.
        drop_last (bool): if True, drops last incomplete batch.
        label_key (str): key under which each dataset item stores its class label.
    """

    def __init__(
        self,
        dataset_items,
        classes,
        batch_size,
        num_classes_in_batch,
        num_gpus=1,
        shuffle=True,
        drop_last=False,
        label_key="class_name",
    ):
        super().__init__(dataset_items)
        assert (
            batch_size % (num_classes_in_batch * num_gpus) == 0
        ), "Batch size must be divisible by number of classes times the number of data parallel devices (if enabled)."

        # Group the dataset positions by class label.
        label_indices = {}
        for idx, item in enumerate(dataset_items):
            label_indices.setdefault(item[label_key], []).append(idx)

        # One per-class sampler, in the order given by `classes`.
        sampler_cls = SubsetRandomSampler if shuffle else SubsetSampler
        self._samplers = [sampler_cls(label_indices[key]) for key in classes]

        self._batch_size = batch_size
        self._drop_last = drop_last
        self._dp_devices = num_gpus
        self._num_classes_in_batch = num_classes_in_batch

    def _draw_class_subset(self):
        """Randomly pick which per-class samplers take part in the next batch."""
        return set(random.sample(range(len(self._samplers)), self._num_classes_in_batch))

    def __iter__(self):
        """Yield lists of dataset indices, one list per mini-batch."""
        # `None` means "every class takes part in every batch".
        use_subset = self._num_classes_in_batch != len(self._samplers)
        active = self._draw_class_subset() if use_subset else None

        iters = [iter(s) for s in self._samplers]
        exhausted = object()  # sentinel that cannot collide with a real index
        batch = []

        while True:
            # One round: a single index from every participating class.
            group = []
            ran_out = False
            for i, it in enumerate(iters):
                if active is not None and i not in active:
                    continue
                idx = next(it, exhausted)
                if idx is exhausted:
                    ran_out = True
                    break
                group.append(idx)
            if ran_out:
                # The partially collected round is discarded to keep classes balanced.
                break

            batch += group
            if len(batch) == self._batch_size:
                yield batch
                batch = []
                if active is not None:
                    active = self._draw_class_subset()

        if self._drop_last or not batch:
            return

        # Keep only as many rounds as can be split evenly across the devices.
        groups = len(batch) // self._num_classes_in_batch
        if groups % self._dp_devices != 0:
            usable_groups = (groups // self._dp_devices) * self._dp_devices
            batch = batch[: usable_groups * self._num_classes_in_batch]
        if batch:
            yield batch

    def __len__(self):
        """Return the number of batches produced by one pass of ``__iter__``.

        The count is exact when every class takes part in every batch. When a
        random subset of classes is drawn per batch, the real number of batches
        depends on the draws; the value returned is then a conservative estimate
        based on the smallest class.
        """
        class_batch_size = self._batch_size // self._num_classes_in_batch
        shortest = min(len(s) for s in self._samplers)
        full_batches, leftover_rounds = divmod(shortest, class_batch_size)
        # A trailing partial batch is only emitted when it is not dropped and
        # holds at least one round per data parallel device.
        if not self._drop_last and leftover_rounds >= self._dp_devices:
            return full_batches + 1
        return full_batches
|
|
|
|
|
|
def identity(x):
    """Return the argument unchanged; used as the default sort key."""
    return x
|
|
|
|
|
|
class SortedSampler(Sampler):
    """Samples elements sequentially, always in the same order.

    Taken from https://github.com/PetrochukM/PyTorch-NLP

    The order is computed once, at construction time: positions of ``data`` are
    sorted by the key of the element found at each position. Elements with
    equal keys keep their original relative order.

    Args:
        data (iterable): Iterable data.
        sort_key (callable): Specifies a function of one argument that is used to extract a
            numerical comparison key from each list element.

    Example:
        >>> list(SortedSampler(range(10), sort_key=lambda i: -i))
        [9, 8, 7, 6, 5, 4, 3, 2, 1, 0]

    """

    def __init__(self, data, sort_key: Callable = identity):
        super().__init__(data)
        self.data = data
        self.sort_key = sort_key
        # Pair every position with its key, then order the positions by key.
        keyed = [(i, self.sort_key(row)) for i, row in enumerate(self.data)]
        keyed.sort(key=lambda pair: pair[1])
        self.sorted_indexes = [position for position, _ in keyed]

    def __iter__(self):
        """Return an iterator over the positions of ``data`` in sorted-key order."""
        return iter(self.sorted_indexes)

    def __len__(self):
        """Return the number of indices this sampler yields."""
        # Count the computed indices rather than `data`, so that iterables
        # without a `__len__` (e.g. generators) are supported as well.
        return len(self.sorted_indexes)
|
|
|
|
|
|
class BucketBatchSampler(BatchSampler):
    """Bucket batch sampler

    Adapted from https://github.com/PetrochukM/PyTorch-NLP

    Indices drawn from ``sampler`` are collected into buckets. Inside each bucket
    the samples are ordered by ``sort_key`` and cut into mini-batches, and the
    mini-batches of that bucket are then emitted in random order.

    Args:
        sampler (torch.data.utils.sampler.Sampler): Base sampler that supplies the indices.
        batch_size (int): Size of mini-batch.
        drop_last (bool): If `True` the sampler will drop the last batch if its size would be less
            than `batch_size`.
        data (list): List of data samples.
        sort_key (callable, optional): Callable to specify a comparison key for sorting.
        bucket_size_multiplier (int, optional): Buckets are of size
            `batch_size * bucket_size_multiplier`.

    Example:
        >>> sampler = WeightedRandomSampler(weights, len(weights))
        >>> sampler = BucketBatchSampler(sampler, data=data_items, batch_size=32, drop_last=True)
    """

    def __init__(
        self,
        sampler,
        data,
        batch_size,
        drop_last,
        sort_key: Union[Callable, List] = identity,
        bucket_size_multiplier=100,
    ):
        super().__init__(sampler, batch_size, drop_last)
        self.data = data
        self.sort_key = sort_key

        # A bucket never needs to be larger than the sampler itself, when its size is known.
        requested = batch_size * bucket_size_multiplier
        has_length = hasattr(sampler, "__len__")
        bucket_len = min(requested, len(sampler)) if has_length else requested
        self.bucket_sampler = BatchSampler(sampler, bucket_len, False)

    def __iter__(self):
        """Yield mini-batches as lists of indices taken from the base sampler."""
        for bucket in self.bucket_sampler:
            samples = [self.data[index] for index in bucket]
            by_key = SortedSampler(samples, self.sort_key)
            # Positions here are relative to the bucket, not to `data`.
            local_batches = list(BatchSampler(by_key, self.batch_size, self.drop_last))
            for local_batch in SubsetRandomSampler(local_batches):
                # Translate bucket-relative positions back to the sampler's indices.
                yield [bucket[position] for position in local_batch]

    def __len__(self):
        """Return the number of mini-batches, honouring ``drop_last``."""
        total = len(self.sampler)
        return total // self.batch_size if self.drop_last else math.ceil(total / self.batch_size)
|