Module hub.util
Expand source code
# pdoc override table: each key names a member of this package, and mapping it
# to False asks pdoc to leave that member out of the generated documentation.
__pdoc__ = {
"assert_byte_indexes": False,
"bugout_reporter": False,
"cache_chain": False,
"callbacks": False,
"check_installation": False,
"exceptions": False,
"from_tfds": False,
"get_property": False,
"get_storage_provider": False,
"join_chunks": False,
"keys": False,
"path": False,
"remove_cache": False,
"shape": False,
"shared_memory": False,
"tag": False,
"tests": False,
"transform": False,
}
from .shuffle import shuffle
from .split import split
__all__ = ["shuffle", "split"]
Sub-modules
hub.util.access_method
hub.util.agreement
hub.util.auto
hub.util.bugout_token
hub.util.casting
hub.util.check_latest_version
hub.util.chunk_engine
hub.util.class_label
hub.util.compression
hub.util.compute
hub.util.dataset
hub.util.delete_entry
hub.util.diff
hub.util.empty_sample
hub.util.encoder
hub.util.exif
hub.util.generate_id
hub.util.hash
hub.util.htype
hub.util.image
hub.util.invalid_view_op
hub.util.iterable_ordered_dict
hub.util.json
hub.util.link
hub.util.logging
hub.util.merge
hub.util.modified
hub.util.notebook
hub.util.pretty_print
hub.util.shape_interval
hub.util.storage
hub.util.threading
hub.util.token
hub.util.version_control
hub.util.video
hub.util.warnings
Functions
def shuffle(ds)
-
Returns a shuffled wrapper of a given Dataset.
Expand source code
def shuffle(ds):
    """Return the entries of ``ds`` in a random order.

    A random permutation of the positions ``0 .. len(ds) - 1`` is drawn from
    NumPy's global random state, and ``ds`` is then indexed with that
    permutation as a plain Python list.

    Args:
        ds: The Dataset to shuffle. It must support ``len()`` and indexing
            with a list of integer positions.

    Returns:
        Whatever ``ds`` yields when indexed with the permuted position list.
    """
    order = np.arange(len(ds))
    # np.random.shuffle permutes the array in place and returns None.
    np.random.shuffle(order)
    positions = order.tolist()
    return ds[positions]
def split(ds, values=[0.7, 0.2, 0.1])
-
Splits a Dataset into multiple datasets with the provided ratio of entries. Returns a list of datasets whose length equals the number of given proportions. For small datasets or many partitions, some of the returned datasets may be empty.
Args
ds
- The Dataset object from which to construct the splits. If already indexed, the splits will be based off that index.
values
- The proportions for the split. Together they should sum to one. Defaults to [0.7, 0.2, 0.1] for a 70% / 20% / 10% split.
Returns
List of Datasets, one for each float in the given values.
Raises
ValueError
- The values must sum to 1.
Expand source code
def split(ds, values: Sequence[float] = [0.7, 0.2, 0.1]):
    """Splits a Dataset into consecutive partitions of the given proportions.

    Returns a list with one dataset per entry in ``values``. Every partition
    except the last receives ``floor(len(ds) * proportion)`` entries; the last
    partition receives whatever remains. For small datasets or many
    partitions, some of the returned datasets may be empty.

    Args:
        ds: The Dataset object from which to construct the splits.
            If already indexed, the splits will be based off that index.
        values: The proportions for the split. Together they should sum to
            one. Defaults to [0.7, 0.2, 0.1] for a 70% / 20% / 10% split.
            The default list is never mutated.

    Returns:
        List of Datasets, one for each float in the given values.

    Raises:
        ValueError: The values must sum to 1.
    """
    if not np.isclose(sum(values), 1.0):
        raise ValueError("Given proportions must sum to 1.")

    length = len(ds)
    partitions = []
    count = 0
    for value in values[:-1]:
        exact = length * value
        nearest = int(round(exact))
        # Binary floats can land a hair below the intended whole number
        # (100 * 0.29 == 28.999999999999996); flooring that directly would
        # silently drop one entry from the partition. Snap to the integer when
        # the product is within rounding noise of it, otherwise floor as usual.
        if abs(exact - nearest) <= 1e-9 * max(1.0, abs(exact)):
            amount = nearest
        else:
            amount = floor(exact)
        partitions.append(ds[count : count + amount])
        count += amount

    # The final partition absorbs everything left over by the flooring above.
    partitions.append(ds[count:])
    return partitions