Module hub.util

Expand source code
# Overrides consumed by the pdoc documentation generator: mapping a name to
# False asks pdoc to leave that submodule/member out of the generated docs.
__pdoc__ = {
    "assert_byte_indexes": False,
    "bugout_reporter": False,
    "cache_chain": False,
    "callbacks": False,
    "check_installation": False,
    "exceptions": False,
    "from_tfds": False,
    "get_property": False,
    "get_storage_provider": False,
    "join_chunks": False,
    "keys": False,
    "path": False,
    "remove_cache": False,
    "shape": False,
    "shared_memory": False,
    "tag": False,
    "tests": False,
    "transform": False,
}

# Re-export the two helpers that make up this package's public API.
from .shuffle import shuffle
from .split import split

# Names exposed by `from hub.util import *`.
__all__ = ["shuffle", "split"]

Sub-modules

hub.util.auto
hub.util.casting
hub.util.compute
hub.util.dataset
hub.util.encoder
hub.util.iterable_ordered_dict
hub.util.prefetch_cache
hub.util.shape_interval
hub.util.storage
hub.util.threading
hub.util.version_control

Functions

def shuffle(ds)

Returns a shuffled wrapper of a given Dataset.

Expand source code
def shuffle(ds):
    """Returns a shuffled wrapper of a given Dataset.

    A random permutation of the positions ``0 .. len(ds) - 1`` is drawn and
    ``ds`` is indexed with that permutation as a plain Python list; the
    result of that indexing operation is returned unchanged.

    Args:
        ds: The Dataset to shuffle. It must support ``len()`` and indexing
            with a list of integer positions.

    Returns:
        Whatever ``ds[<list of permuted positions>]`` evaluates to.
    """
    order = np.arange(len(ds))
    # Shuffles in place, drawing from NumPy's global random state.
    np.random.shuffle(order)
    permuted_positions = order.tolist()
    return ds[permuted_positions]
def split(ds, values=[0.7, 0.2, 0.1])

Splits a Dataset into multiple datasets with the provided ratio of entries. Returns a list of datasets with length equal to the number of given values. For small datasets or many partitions, some of the returned datasets may be empty.

Args

ds
The Dataset object from which to construct the splits. If already indexed, the splits will be based off that index.
values
The proportions for the split. Together they should sum to one. Defaults to [0.7, 0.2, 0.1] for a 70% / 20% / 10% split.

Returns

List of Datasets, one for each float in the given values.

Raises

ValueError
The values must sum to 1.
Expand source code
def split(ds, values: Sequence[float] = [0.7, 0.2, 0.1]):
    """Splits a Dataset into multiple datasets with the provided ratio of entries.

    The splits are consecutive, non-overlapping slices of ``ds`` taken in
    order. Every split except the last receives ``floor(len(ds) * value)``
    entries; the last split receives all remaining entries, so rounding
    leftovers end up there. For small datasets or many partitions, some of
    the returned datasets may be empty.

    Args:
        ds: The Dataset object from which to construct the splits.
            If already indexed, the splits will be based off that index.
        values: The proportions for the split. They must be non-negative and
            together sum to one.
            Defaults to [0.7, 0.2, 0.1] for a 70% 20% 10% split.
            (The default list is only read, never mutated.)

    Returns:
        List of Datasets, one for each float in the given values.

    Raises:
        ValueError: If the values do not sum to 1, or if any value is
            negative.
    """

    if not np.isclose(sum(values), 1.0):
        raise ValueError("Given proportions must sum to 1.")
    # A negative proportion can still pass the sum check (e.g. [-0.5, 1.5])
    # but would yield negative slice bounds and silently wrong partitions.
    if any(value < 0 for value in values):
        raise ValueError("Given proportions must be non-negative.")

    length = len(ds)
    partitions = []
    start = 0
    for value in values[:-1]:
        stop = start + floor(length * value)
        partitions.append(ds[start:stop])
        start = stop
    # The final split takes whatever is left, absorbing rounding leftovers.
    partitions.append(ds[start:])

    return partitions