Module hub.util

Expand source code
# Names hidden from the generated pdoc documentation: pdoc omits every
# identifier whose ``__pdoc__`` entry is ``False``.
__pdoc__ = dict.fromkeys(
    (
        "assert_byte_indexes",
        "bugout_reporter",
        "cache_chain",
        "callbacks",
        "check_installation",
        "exceptions",
        "from_tfds",
        "get_property",
        "get_storage_provider",
        "join_chunks",
        "keys",
        "path",
        "remove_cache",
        "shape",
        "shared_memory",
        "tag",
        "tests",
        "transform",
    ),
    False,
)

from .shuffle import shuffle
from .split import split

# Names exported by ``from hub.util import *``.
__all__ = ["shuffle", "split"]

Sub-modules

hub.util.access_method
hub.util.agreement
hub.util.auto
hub.util.bugout_token
hub.util.casting
hub.util.check_latest_version
hub.util.chunk_engine
hub.util.class_label
hub.util.compression
hub.util.compute
hub.util.dataset
hub.util.delete_entry
hub.util.diff
hub.util.empty_sample
hub.util.encoder
hub.util.exif
hub.util.generate_id
hub.util.hash
hub.util.htype
hub.util.image
hub.util.invalid_view_op
hub.util.iterable_ordered_dict
hub.util.json
hub.util.link
hub.util.logging
hub.util.merge
hub.util.modified
hub.util.notebook
hub.util.pretty_print
hub.util.shape_interval
hub.util.storage
hub.util.threading
hub.util.token
hub.util.version_control
hub.util.video
hub.util.warnings

Functions

def shuffle(ds)

Returns a shuffled wrapper of a given Dataset.

Expand source code
def shuffle(ds):
    """Return a randomly reordered view of ``ds``.

    Args:
        ds: The Dataset to shuffle. It must support ``len()`` and indexing
            with a list of integer positions.

    Returns:
        The result of indexing ``ds`` with a random permutation of its
        positions.
    """
    num_entries = len(ds)
    order = np.arange(num_entries)
    # Permute the positions in place using NumPy's global random state.
    np.random.shuffle(order)
    shuffled_positions = order.tolist()
    return ds[shuffled_positions]
def split(ds, values=[0.7, 0.2, 0.1])

Splits a Dataset into multiple datasets with the provided ratio of entries. Returns a list of datasets with length equal to the number of given values. For small datasets or many partitions, some of the returned datasets may be empty.

Args

ds
The Dataset object from which to construct the splits. If already indexed, the splits will be based off that index.
values
The proportions for the split. Together they must sum to one. Defaults to [0.7, 0.2, 0.1] for a 70% / 20% / 10% split.

Returns

List of Datasets, one for each float in the given values.

Raises

ValueError
The values must sum to 1.
Expand source code
def split(ds, values: Sequence[float] = [0.7, 0.2, 0.1]):
    """Splits a Dataset into multiple datasets with the provided ratio of entries.
    Returns a list of datasets with length equal to the number of given values.
    For small datasets or many partitions, some of the returned datasets may be empty.

    Every partition except the last receives ``floor(len(ds) * value)`` entries;
    the last partition receives whatever remains, so no entry is dropped.

    Args:
        ds: The Dataset object from which to construct the splits.
            If already indexed, the splits will be based off that index.
        values: The proportions for the split. Must be non-negative and sum to one.
            Defaults to [0.7, 0.2, 0.1] for a 70% 20% 10% split

    Returns:
        List of Datasets, one for each float in the given values.

    Raises:
        ValueError: The values must sum to 1 and must not be negative.
    """

    if not np.isclose(sum(values), 1.0):
        raise ValueError("Given proportions must sum to 1.")
    # A negative proportion can still sum to 1 with the others, but would
    # yield overlapping or out-of-range slices below.
    if any(value < 0 for value in values):
        raise ValueError("Given proportions must be non-negative.")

    count = 0
    length = len(ds)
    partitions = []
    for value in values[:-1]:
        exact = length * value
        nearest = int(round(exact))
        # Binary floating point can land a hair below the intended integer
        # (e.g. 100 * 0.29 == 28.999999999999996); snap such products to that
        # integer instead of letting floor() drop a whole entry.
        if np.isclose(exact, nearest, rtol=1e-12, atol=0.0):
            amount = nearest
        else:
            amount = floor(exact)
        partitions.append(ds[count : count + amount])
        count += amount
    # The final partition absorbs the remainder left by rounding down.
    partitions.append(ds[count:])

    return partitions