Package hub

Expand source code
import numpy as np

__pdoc__ = {
    "core": False,
    "api": False,
    "cli": False,
    "client": False,
    "constants": False,
    "integrations": False,
    "tests": False,
    "Dataset.clear_cache": False,
    "Dataset.delete": False,
    "Dataset.flush": False,
    "Dataset.read_only": False,
    "Dataset.size_approx": False,
    "Dataset.token": False,
    "Dataset.num_samples": False,
}

from .api.dataset import dataset
from .api.read import read
from .core.dataset import Dataset
from .core.transform import compute, compose
from .core.tensor import Tensor
from .util.bugout_reporter import hub_reporter
from .compression import SUPPORTED_COMPRESSIONS
from .htype import HTYPE_CONFIGURATIONS

compressions = list(SUPPORTED_COMPRESSIONS)
htypes = sorted(list(HTYPE_CONFIGURATIONS))
list = dataset.list
load = dataset.load
empty = dataset.empty
like = dataset.like
dataset_cl = Dataset
ingest = dataset.ingest
ingest_kaggle = dataset.ingest_kaggle
tensor = Tensor

__all__ = [
    "dataset",
    "tensor",
    "read",
    "__version__",
    "load",
    "empty",
    "compute",
    "compose",
    "like",
    "list",
    "dataset_cl",
    "ingest",
    "ingest_kaggle",
    "compressions",
    "htypes",
]

__version__ = "2.0.15"
__encoded_version__ = np.array(__version__)

hub_reporter.tags.append(f"version:{__version__}")
hub_reporter.system_report(publish=True)
hub_reporter.setup_excepthook(publish=True)

Sub-modules

hub.auto
hub.compression

Supported compressions (formats): Image: bmp, dib, gif, ico, jpeg, jp2, pcx, png, ppm, sgi, tga, tiff, webp, wmf, xbm. Audio: flac, mp3, wav …

hub.htype

"htype" is the class of a tensor: image, bounding box, generic tensor, etc …

hub.util

Functions

def compose(functions)

Takes a list of functions decorated using hub.compute and creates a pipeline that can be evaluated using .eval

Expand source code
def compose(functions: List[TransformFunction]):
    """Takes a list of functions decorated using hub.compute and creates a pipeline that can be evaluated using .eval"""
    if not functions:
        raise HubComposeEmptyListError
    for index, fn in enumerate(functions):
        if not isinstance(fn, TransformFunction):
            raise HubComposeIncompatibleFunction(index)
    return Pipeline(functions)
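For illustration, a minimal sketch of building and evaluating a pipeline. The tensor name images, the dataset variables source_ds/dest_ds and the .eval keyword arguments are assumptions for this example and may differ between versions:

>>> import numpy as np
>>> @hub.compute
... def flip(sample_in, samples_out):
...     # append a vertically flipped copy of the input image
...     samples_out.images.append(np.flip(sample_in.images.numpy(), axis=0))
>>> @hub.compute
... def scale(sample_in, samples_out, factor=2):
...     # append the input image multiplied by a constant factor
...     samples_out.images.append(sample_in.images.numpy() * factor)
>>> pipeline = hub.compose([flip(), scale(factor=3)])
>>> pipeline.eval(source_ds, dest_ds, num_workers=2)  # source_ds/dest_ds are hypothetical datasets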
def compute(fn)

Compute is a decorator for functions. The decorated function should have at least two arguments; the first two correspond to sample_in and samples_out. There can be as many other arguments as required. The output should be appended/extended to the second argument using Hub-like syntax. Any value returned by the fn will be ignored.

Example::

@hub.compute
def your_function(sample_in: Any, samples_out, your_arg0, your_arg1=0):
    samples_out.your_tensor.append(your_arg0 * your_arg1)
Expand source code
def compute(fn):
    """Compute is a decorator for functions.
    The decorated function should have at least two arguments; the first two correspond to sample_in and samples_out.
    There can be as many other arguments as required.
    The output should be appended/extended to the second argument using Hub-like syntax.
    Any value returned by the fn will be ignored.

    Example::

        @hub.compute
        def your_function(sample_in: Any, samples_out, your_arg0, your_arg1=0):
            samples_out.your_tensor.append(your_arg0 * your_arg1)
    """

    def inner(*args, **kwargs):
        return TransformFunction(fn, args, kwargs)

    return inner
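A decorated function is first called with its extra arguments to produce a TransformFunction, which is then evaluated with .eval. The tensor names and dataset variables below are hypothetical, and the exact .eval signature may vary between versions:

>>> @hub.compute
... def add_label(sample_in, samples_out, label=0):
...     # pass the image through unchanged and attach a constant label
...     samples_out.images.append(sample_in.images.numpy())
...     samples_out.labels.append(label)
>>> add_label(label=1).eval(source_ds, dest_ds, num_workers=2)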
def empty(path, overwrite=False, public=True, memory_cache_size=256, local_cache_size=0, creds=None, token=None)

Creates an empty dataset

Important

Using overwrite will delete all of your data if it exists! Be very careful when setting this parameter.

Args

path : str
The full path to the dataset. Can be:
  • a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from the command line).
  • an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required either in the environment or passed to the creds argument.
  • a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
  • a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
overwrite : bool
WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
public : bool, optional
Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
memory_cache_size : int
The size of the memory cache to be used in MB.
local_cache_size : int
The size of the local filesystem cache to be used in MB.
creds : dict, optional
A dictionary containing credentials used to access the dataset at the path. This takes precedence over credentials present in the environment. Currently only works with s3 paths. It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

Returns

Dataset object created using the arguments provided.

Raises

DatasetHandlerError
If a Dataset already exists at the given path and overwrite is False.
Expand source code
@staticmethod
def empty(
    path: str,
    overwrite: bool = False,
    public: Optional[bool] = True,
    memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
    local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
    creds: Optional[dict] = None,
    token: Optional[str] = None,
) -> Dataset:
    """Creates an empty dataset

    Important:
        Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

    Args:
        path (str): The full path to the dataset. Can be:-
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
        public (bool, optional): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
        memory_cache_size (int): The size of the memory cache to be used in MB.
        local_cache_size (int): The size of the local filesystem cache to be used in MB.
        creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
            This takes precedence over credentials present in the environment. Currently only works with s3 paths.
            It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
        token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

    Returns:
        Dataset object created using the arguments provided.

    Raises:
        DatasetHandlerError: If a Dataset already exists at the given path and overwrite is False.
    """
    if creds is None:
        creds = {}

    feature_report_path(path, "empty", {"Overwrite": overwrite})

    storage, cache_chain = get_storage_and_cache_chain(
        path=path,
        read_only=False,
        creds=creds,
        token=token,
        memory_cache_size=memory_cache_size,
        local_cache_size=local_cache_size,
    )

    if overwrite and dataset_exists(storage):
        storage.clear()
    elif dataset_exists(storage):
        raise DatasetHandlerError(
            f"A dataset already exists at the given path ({path}). If you want to create a new empty dataset, either specify another path or use overwrite=True. If you want to load the dataset that exists at this path, use hub.load() instead."
        )

    read_only = storage.read_only
    return get_dataset_instance(
        path, storage=cache_chain, read_only=read_only, public=public, token=token
    )
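A minimal sketch of creating a local dataset and populating a tensor; the path and tensor name here are illustrative, not prescribed by the API:

>>> ds = hub.empty("./my_dataset")  # use hub://username/datasetname for Hub cloud storage
>>> ds.create_tensor("images", htype="image", sample_compression="jpeg")
>>> ds.images.append(hub.read("path/to/cat.jpeg"))  # reads lazily and appends the sample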
def ingest(src, dest, images_compression='auto', dest_creds=None, progress_bar=True, summary=True, **dataset_kwargs)

Ingests a dataset from a source and stores it as a structured dataset at the destination

Note

  • Currently only local source paths and image classification datasets are supported for automatic ingestion.
  • Supported filetypes: png/jpeg/jpg.
  • All files and sub-directories with unsupported filetypes are ignored.
  • Valid source directory structures look like:
    data/
        img0.jpg
        img1.jpg
        ...

or

    data/
        class0/
            cat0.jpg
            ...
        class1/
            dog0.jpg
            ...
        ...

or

    data/
        train/
            class0/
                img0.jpg
                ...
            ...
        val/
            class0/
                img0.jpg
                ...
            ...
        ...
  • Classes defined as sub-directories can be accessed at ds["test/labels"].info.class_names.
  • Support for train and test sub-directories is present under ds["train/images"], ds["train/labels"] and ds["test/images"], ds["test/labels"].
  • Mapping filenames to classes from an external file is currently not supported.

Args

src : str
Local path to where the unstructured dataset is stored.
dest : str
Destination path where the structured dataset will be stored. Can be:
  • a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from the command line).
  • an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required either in the environment or passed to the creds argument.
  • a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
  • a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
images_compression : str
For image classification datasets, this compression will be used for the images tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
dest_creds : dict
A dictionary containing credentials used to access the destination path of the dataset.
progress_bar : bool
Enables or disables ingestion progress bar. Defaults to True.
summary : bool
If True, a summary of skipped files will be printed after completion. Defaults to True.
**dataset_kwargs
Any arguments passed here will be forwarded to the dataset creator function.

Returns

Dataset
New dataset object with structured dataset.

Raises

InvalidPathException
If the source directory does not exist.
SamePathException
If the source and destination path are same.
AutoCompressionError
If the source directory is empty or does not contain a valid extension.
InvalidFileExtension
If the most frequent file extension is found to be 'None' during auto-compression.
Expand source code
@staticmethod
def ingest(
    src: str,
    dest: str,
    images_compression: str = "auto",
    dest_creds: dict = None,
    progress_bar: bool = True,
    summary: bool = True,
    **dataset_kwargs,
) -> Dataset:
    """Ingests a dataset from a source and stores it as a structured dataset to destination

    Note:
        - Currently only local source paths and image classification datasets are supported for automatic ingestion.
        - Supported filetypes: png/jpeg/jpg.
        - All files and sub-directories with unsupported filetypes are ignored.
        - Valid source directory structures look like:

        ```
            data/
                img0.jpg
                img1.jpg
                ...

        ```
        or
        ```
            data/
                class0/
                    cat0.jpg
                    ...
                class1/
                    dog0.jpg
                    ...
                ...

        ```
        or
        ```
            data/
                train/
                    class0/
                        img0.jpg
                        ...
                    ...
                val/
                    class0/
                        img0.jpg
                        ...
                    ...
                ...
        ```

        - Classes defined as sub-directories can be accessed at `ds["test/labels"].info.class_names`.
        - Support for train and test sub-directories is present under ds["train/images"], ds["train/labels"] and ds["test/images"], ds["test/labels"]
        - Mapping filenames to classes from an external file is currently not supported.

    Args:
        src (str): Local path to where the unstructured dataset is stored.
        dest (str): Destination path where the structured dataset will be stored. Can be:-
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        images_compression (str): For image classification datasets, this compression will be used for the `images` tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
        dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
        progress_bar (bool): Enables or disables ingestion progress bar. Defaults to True.
        summary (bool): If True, a summary of skipped files will be printed after completion. Defaults to True.
        **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.

    Returns:
        Dataset: New dataset object with structured dataset.

    Raises:
        InvalidPathException: If the source directory does not exist.
        SamePathException: If the source and destination path are same.
        AutoCompressionError: If the source directory is empty or does not contain a valid extension.
        InvalidFileExtension: If the most frequent file extension is found to be 'None' during auto-compression.
    """

    feature_report_path(
        dest,
        "ingest",
        {
            "Images_Compression": images_compression,
            "Progress_Bar": progress_bar,
            "Summary": summary,
        },
    )
    if not os.path.isdir(src):
        raise InvalidPathException(src)

    if os.path.isdir(dest) and os.path.samefile(src, dest):
        raise SamePathException(src)

    if images_compression == "auto":
        images_compression = get_most_common_extension(src)
        if images_compression is None:
            raise InvalidFileExtension(src)

    ds = hub.dataset(dest, creds=dest_creds, **dataset_kwargs)

    # TODO: support more than just image classification (and update docstring)
    unstructured = ImageClassification(source=src)

    # TODO: auto detect compression
    unstructured.structure(
        ds,  # type: ignore
        use_progress_bar=progress_bar,
        generate_summary=summary,
        image_tensor_args={"sample_compression": images_compression},
    )

    return ds  # type: ignore
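For example (both paths are hypothetical), ingesting a local directory whose sub-directories are class names, with the image compression inferred automatically:

>>> ds = hub.ingest("./data/animals", "./animals_hub", images_compression="auto")
>>> ds.labels.info.class_names  # class names inferred from the sub-directory names

The exact tensor layout (images/labels vs. train/... and test/...) depends on which of the source directory structures described above is used.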
def ingest_kaggle(tag, src, dest, exist_ok=False, images_compression='auto', dest_creds=None, kaggle_credentials=None, progress_bar=True, summary=True, **dataset_kwargs)

Downloads and ingests a Kaggle dataset and stores it as a structured dataset at the destination

Note

Currently only local source paths and image classification datasets are supported for automatic ingestion.

Args

tag : str
Kaggle dataset tag. Example: "coloradokb/dandelionimages" points to https://www.kaggle.com/coloradokb/dandelionimages
src : str
Local path to which the raw Kaggle dataset will be downloaded.
dest : str
Destination path where the structured dataset will be stored. Can be:
  • a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from the command line).
  • an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required either in the environment or passed to the creds argument.
  • a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
  • a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
exist_ok : bool
If the kaggle dataset was already downloaded and exist_ok is True, ingestion will proceed without error.
images_compression : str
For image classification datasets, this compression will be used for the images tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
dest_creds : dict
A dictionary containing credentials used to access the destination path of the dataset.
kaggle_credentials : dict
A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If None, environment variables/the kaggle.json file will be used if available.
progress_bar : bool
Enables or disables the ingestion progress bar. Defaults to True.
summary : bool
Generates an ingestion summary. Defaults to True.
**dataset_kwargs
Any arguments passed here will be forwarded to the dataset creator function.

Returns

Dataset
New dataset object with structured dataset.

Raises

SamePathException
If the source and destination path are same.
Expand source code
@staticmethod
def ingest_kaggle(
    tag: str,
    src: str,
    dest: str,
    exist_ok: bool = False,
    images_compression: str = "auto",
    dest_creds: dict = None,
    kaggle_credentials: dict = None,
    progress_bar: bool = True,
    summary: bool = True,
    **dataset_kwargs,
) -> Dataset:
    """Download and ingest a kaggle dataset and store it as a structured dataset to destination

    Note:
        Currently only local source paths and image classification datasets are supported for automatic ingestion.

    Args:
        tag (str): Kaggle dataset tag. Example: `"coloradokb/dandelionimages"` points to https://www.kaggle.com/coloradokb/dandelionimages
        src (str): Local path to which the raw Kaggle dataset will be downloaded.
        dest (str): Destination path where the structured dataset will be stored. Can be:
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        exist_ok (bool): If the kaggle dataset was already downloaded and `exist_ok` is True, ingestion will proceed without error.
        images_compression (str): For image classification datasets, this compression will be used for the `images` tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
        dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
        kaggle_credentials (dict): A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If None, environment variables/the kaggle.json file will be used if available.
        progress_bar (bool): Enables or disables ingestion progress bar. Set to true by default.
        summary (bool): Generates ingestion summary. Set to true by default.
        **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.

    Returns:
        Dataset: New dataset object with structured dataset.

    Raises:
        SamePathException: If the source and destination path are same.
    """

    feature_report_path(
        dest,
        "ingest_kaggle",
        {
            "Images_Compression": images_compression,
            "Exist_Ok": exist_ok,
            "Progress_Bar": progress_bar,
            "Summary": summary,
        },
    )

    if os.path.isdir(src) and os.path.isdir(dest):
        if os.path.samefile(src, dest):
            raise SamePathException(src)

    download_kaggle_dataset(
        tag,
        local_path=src,
        kaggle_credentials=kaggle_credentials,
        exist_ok=exist_ok,
    )

    ds = hub.ingest(
        src=src,
        dest=dest,
        images_compression=images_compression,
        dest_creds=dest_creds,
        progress_bar=progress_bar,
        summary=summary,
        **dataset_kwargs,
    )

    return ds
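A sketch using the Kaggle tag from the docstring above; the src and dest paths are hypothetical, and Kaggle credentials are read from the environment/kaggle.json if not passed:

>>> ds = hub.ingest_kaggle(
...     "coloradokb/dandelionimages",
...     src="./kaggle_download",    # raw archive is downloaded and extracted here
...     dest="./dandelion_hub",     # structured dataset is created here
...     exist_ok=True,              # reuse an already-downloaded copy if present
... )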
def like(path, source, creds=None, overwrite=False)

Copies the source dataset's structure to a new location. No samples are copied, only the meta/info for the dataset and its tensors.

Args

path : str
Path where the new dataset will be created.
source : Union[str, Dataset]
Path or dataset object that will be used as the template for the new dataset.
creds : dict
Credentials that will be used to create the new dataset.
overwrite : bool
If True and a dataset exists at destination, it will be overwritten. Defaults to False.

Returns

Dataset
New dataset object.
Expand source code
@staticmethod
def like(
    path: str,
    source: Union[str, Dataset],
    creds: dict = None,
    overwrite: bool = False,
) -> Dataset:
    """Copies the `source` dataset's structure to a new location. No samples are copied, only the meta/info for the dataset and it's tensors.

    Args:
        path (str): Path where the new dataset will be created.
        source (Union[str, Dataset]): Path or dataset object that will be used as the template for the new dataset.
        creds (dict): Credentials that will be used to create the new dataset.
        overwrite (bool): If True and a dataset exists at `destination`, it will be overwritten. Defaults to False.

    Returns:
        Dataset: New dataset object.
    """

    feature_report_path(path, "like", {"Overwrite": overwrite})

    destination_ds = dataset.empty(path, creds=creds, overwrite=overwrite)
    source_ds = source
    if isinstance(source, str):
        source_ds = dataset.load(source)

    for tensor_name in source_ds.version_state["meta"].tensors:  # type: ignore
        destination_ds.create_tensor_like(tensor_name, source_ds[tensor_name])

    destination_ds.info.update(source_ds.info.__getstate__())  # type: ignore

    return destination_ds
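For example (paths are hypothetical), cloning a dataset's tensor structure without copying any samples:

>>> template = hub.load("./existing_dataset")
>>> new_ds = hub.like("./empty_clone", template)  # same tensors/meta, zero samples
>>> len(new_ds)
0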
def list(workspace='', token=None)

List all available hub cloud datasets.

Args

workspace : str
Specify user/organization name. If not given, returns a list of all datasets that can be accessed, regardless of what workspace they are in. Otherwise, lists all datasets in the given workspace.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

Returns

List of dataset names.

Expand source code
@staticmethod
@hub_reporter.record_call
def list(
    workspace: str = "",
    token: Optional[str] = None,
) -> List[str]:
    """List all available hub cloud datasets.

    Args:
        workspace (str): Specify user/organization name. If not given,
            returns a list of all datasets that can be accessed, regardless of what workspace they are in.
            Otherwise, lists all datasets in the given workspace.
        token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

    Returns:
        List of dataset names.
    """
    client = HubBackendClient(token=token)
    datasets = client.get_datasets(workspace=workspace)
    return datasets
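Typical usage; the workspace name below is hypothetical:

>>> hub.list()                # every dataset your account can access
>>> hub.list("my_workspace")  # only datasets in the given workspace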
def load(path, read_only=False, overwrite=False, public=True, memory_cache_size=256, local_cache_size=0, creds=None, token=None)

Loads an existing dataset

Important

Using overwrite will delete all of your data if it exists! Be very careful when setting this parameter.

Args

path : str
The full path to the dataset. Can be:
  • a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from the command line).
  • an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required either in the environment or passed to the creds argument.
  • a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
  • a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
read_only : bool
Opens dataset in read only mode if this is passed as True. Defaults to False. Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
overwrite : bool
WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
public : bool, optional
Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
memory_cache_size : int
The size of the memory cache to be used in MB.
local_cache_size : int
The size of the local filesystem cache to be used in MB.
creds : dict, optional
A dictionary containing credentials used to access the dataset at the path. This takes precedence over credentials present in the environment. Currently only works with s3 paths. It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

Returns

Dataset object created using the arguments provided.

Raises

DatasetHandlerError
If a Dataset does not exist at the given path.
Expand source code
@staticmethod
def load(
    path: str,
    read_only: bool = False,
    overwrite: bool = False,
    public: Optional[bool] = True,
    memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
    local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
    creds: Optional[dict] = None,
    token: Optional[str] = None,
) -> Dataset:
    """Loads an existing dataset

    Important:
        Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

    Args:
        path (str): The full path to the dataset. Can be:-
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        read_only (bool): Opens dataset in read only mode if this is passed as True. Defaults to False.
            Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
        overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
        public (bool, optional): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
        memory_cache_size (int): The size of the memory cache to be used in MB.
        local_cache_size (int): The size of the local filesystem cache to be used in MB.
        creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
            This takes precedence over credentials present in the environment. Currently only works with s3 paths.
            It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
        token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

    Returns:
        Dataset object created using the arguments provided.

    Raises:
        DatasetHandlerError: If a Dataset does not exist at the given path.
    """
    if creds is None:
        creds = {}

    feature_report_path(path, "load", {"Overwrite": overwrite})

    storage, cache_chain = get_storage_and_cache_chain(
        path=path,
        read_only=read_only,
        creds=creds,
        token=token,
        memory_cache_size=memory_cache_size,
        local_cache_size=local_cache_size,
    )

    if not dataset_exists(storage):
        raise DatasetHandlerError(
            f"A Hub dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use hub.empty()."
        )
    if overwrite:
        storage.clear()

    read_only = storage.read_only
    return get_dataset_instance(
        path, storage=cache_chain, read_only=read_only, public=public, token=token
    )
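A short sketch; the dataset path and tensor name are illustrative, not guaranteed to exist:

>>> ds = hub.load("hub://activeloop/mnist-train", read_only=True)
>>> len(ds)               # length of the smallest tensor
>>> ds.images[0].numpy()  # fetch the first sample of a tensor as a np.ndarray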
def read(path, verify=False, convert_grayscale=True)

Utility that reads raw data from a file into a np.ndarray in 1 line of code. Also provides access to all important metadata.

Note

No data is actually loaded until you try to get a property of the returned Sample. This is useful for passing along to tensor.append and tensor.extend.

Examples

>>> sample = hub.read("path/to/cat.jpeg")
>>> type(sample.array)
<class 'numpy.ndarray'>
>>> sample.compression
'jpeg'

Supported File Types: image: png, jpeg, and all others supported by PIL: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#fully-supported-formats

Args

path : str
Path to a supported file.
verify : bool
If True, contents of the file are verified.
convert_grayscale
If True, and if the rest of the dataset is in color (3D), then reshape a grayscale image by appending a 1 to its shape.

Returns

Sample
Sample object. Call sample.array to get the np.ndarray.
Expand source code
def read(path: str, verify: bool = False, convert_grayscale: bool = True) -> Sample:
    """Utility that reads raw data from a file into a `np.ndarray` in 1 line of code. Also provides access to all important metadata.

    Note:
        No data is actually loaded until you try to get a property of the returned `Sample`. This is useful for passing along to
            `tensor.append` and `tensor.extend`.

    Examples:
        >>> sample = hub.read("path/to/cat.jpeg")
        >>> type(sample.array)
        <class 'numpy.ndarray'>
        >>> sample.compression
        'jpeg'

    Supported File Types:
        image: png, jpeg, and all others supported by `PIL`: https://pillow.readthedocs.io/en/stable/handbook/image-file-formats.html#fully-supported-formats

    Args:
        path (str): Path to a supported file.
        verify (bool):  If True, contents of the file are verified.
        convert_grayscale: If True, and if the rest of the dataset is in color (3D), then
                           reshape a grayscale image by appending a 1 to its shape.

    Returns:
        Sample: Sample object. Call `sample.array` to get the `np.ndarray`.
    """

    sample = Sample(path, verify=verify)
    sample._convert_grayscale = convert_grayscale
    return sample
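Because decoding is deferred, hub.read results are typically passed straight to tensor.append or tensor.extend; the dataset, tensor and image_paths below are hypothetical:

>>> sample = hub.read("path/to/cat.jpeg")
>>> ds.images.append(sample)                               # appended without eager decoding
>>> ds.images.extend([hub.read(p) for p in image_paths])   # bulk append of lazily-read samples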

Classes

class dataset_cl (storage, index=None, group_index='', read_only=False, public=True, token=None, verbose=True, version_state=None)

Initializes a new or existing dataset.

Args

storage : LRUCache
The storage provider used to access the dataset.
index : Index, optional
The Index object restricting the view of this dataset's tensors.
group_index : str
Name of the group this dataset instance represents.
read_only : bool
Opens dataset in read only mode if this is passed as True. Defaults to False. Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
public : bool, optional
Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.
verbose : bool
If True, logs will be printed. Defaults to True.
version_state : Dict[str, Any], optional
The version state of the dataset, includes commit_id, commit_node, branch, branch_commit_map and commit_node_map.

Raises

ValueError
If an existing local path is given, it must be a directory.
ImproperDatasetInitialization
Exactly one argument out of 'path' and 'storage' needs to be specified. This is raised if none of them are specified or more than one are specified.
InvalidHubPathException
If a Hub cloud path (path starting with hub://) is specified and it isn't of the form hub://username/datasetname.
AuthorizationException
If a Hub cloud path (path starting with hub://) is specified and the user doesn't have access to the dataset.
PathNotEmptyException
If the path to the dataset doesn't contain a Hub dataset and is also not empty.
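In practice, Dataset objects are obtained through hub.empty(), hub.load() or hub.dataset() rather than by calling this constructor directly. A brief sketch (the path and tensor names are hypothetical):

>>> ds = hub.load("./my_dataset")
>>> view = ds[0:100]          # indexing returns a lightweight view, not a copy
>>> ds.images[0].numpy()      # tensors are accessed by name
>>> with ds:                  # context manager batches writes and flushes on exit
...     ds.labels.append(1)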
Expand source code
class Dataset:
    def __init__(
        self,
        storage: LRUCache,
        index: Optional[Index] = None,
        group_index: str = "",
        read_only: bool = False,
        public: Optional[bool] = True,
        token: Optional[str] = None,
        verbose: bool = True,
        version_state: Optional[Dict[str, Any]] = None,
    ):
        """Initializes a new or existing dataset.

        Args:
            storage (LRUCache): The storage provider used to access the dataset.
            index (Index, optional): The Index object restricting the view of this dataset's tensors.
            group_index (str): Name of the group this dataset instance represents.
            read_only (bool): Opens dataset in read only mode if this is passed as True. Defaults to False.
                Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
            public (bool, optional): Applied only if storage is Hub cloud storage and a new Dataset is being created. Defines if the dataset will have public access.
            token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.
            verbose (bool): If True, logs will be printed. Defaults to True.
            version_state (Dict[str, Any], optional): The version state of the dataset, includes commit_id, commit_node, branch, branch_commit_map and commit_node_map.


        Raises:
            ValueError: If an existing local path is given, it must be a directory.
            ImproperDatasetInitialization: Exactly one argument out of 'path' and 'storage' needs to be specified.
                This is raised if none of them are specified or more than one are specified.
            InvalidHubPathException: If a Hub cloud path (path starting with hub://) is specified and it isn't of the form hub://username/datasetname.
            AuthorizationException: If a Hub cloud path (path starting with hub://) is specified and the user doesn't have access to the dataset.
            PathNotEmptyException: If the path to the dataset doesn't contain a Hub dataset and is also not empty.
        """
        # uniquely identifies dataset
        self.path = get_path_from_storage(storage)
        self.storage = storage
        self._read_only = read_only
        base_storage = get_base_storage(storage)
        if (
            not read_only and index is None and isinstance(base_storage, S3Provider)
        ):  # Dataset locking only for S3 datasets
            try:
                lock(base_storage, callback=lambda: self._lock_lost_handler)
            except LockedException:
                self.read_only = True
                warnings.warn(
                    "Opening dataset in read only mode as another machine has locked it for writing."
                )

        self.index: Index = index or Index()
        self.group_index = group_index
        self._token = token
        self.public = public
        self.verbose = verbose
        self.version_state: Dict[str, Any] = version_state or {}
        self._set_derived_attributes()

    def _lock_lost_handler(self):
        """This is called when lock is acquired but lost later on due to slow update."""
        self.read_only = True
        warnings.warn(
            "Unable to update dataset lock as another machine has locked it for writing. Switching to read only mode."
        )

    def __enter__(self):
        self.storage.autoflush = False
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.storage.autoflush = True
        self.flush()

    @property
    def num_samples(self) -> int:
        """Returns the length of the smallest tensor.
        Ignores any applied indexing and returns the total length.
        """
        return min(map(len, self.version_state["full_tensors"].values()), default=0)

    @property
    def meta(self) -> DatasetMeta:
        """Returns the metadata of the dataset."""
        return self.version_state["meta"]

    def __len__(self):
        """Returns the length of the smallest tensor"""
        tensor_lengths = [len(tensor) for tensor in self.tensors.values()]
        return min(tensor_lengths, default=0)

    def __getstate__(self) -> Dict[str, Any]:
        """Returns a dict that can be pickled and used to restore this dataset.

        Note:
            Pickling a dataset does not copy the dataset, it only saves attributes that can be used to restore the dataset.
            If you pickle a local dataset and try to access it on a machine that does not have the data present, the dataset will not work.
        """
        if self.path.startswith("mem://"):
            raise MemoryDatasetCanNotBePickledError
        return {
            "path": self.path,
            "_read_only": self.read_only,
            "index": self.index,
            "group_index": self.group_index,
            "public": self.public,
            "storage": self.storage,
            "_token": self.token,
            "verbose": self.verbose,
            "version_state": self.version_state,
        }

    def __setstate__(self, state: Dict[str, Any]):
        """Restores dataset from a pickled state.

        Args:
            state (dict): The pickled state used to restore the dataset.
        """
        self.__dict__.update(state)
        self._set_derived_attributes()

    def __getitem__(
        self,
        item: Union[
            str, int, slice, List[int], Tuple[Union[int, slice, Tuple[int]]], Index
        ],
    ):
        if isinstance(item, str):
            fullpath = posixpath.join(self.group_index, item)
            tensor = self._get_tensor_from_root(fullpath)
            if tensor is not None:
                return tensor[self.index]
            elif self._has_group_in_root(fullpath):
                return self.__class__(
                    storage=self.storage,
                    index=self.index,
                    group_index=posixpath.join(self.group_index, item),
                    read_only=self.read_only,
                    token=self._token,
                    verbose=False,
                    version_state=self.version_state,
                )
            elif "/" in item:
                splt = posixpath.split(item)
                return self[splt[0]][splt[1]]
            else:
                raise TensorDoesNotExistError(item)
        elif isinstance(item, (int, slice, list, tuple, Index)):
            return self.__class__(
                storage=self.storage,
                index=self.index[item],
                group_index=self.group_index,
                read_only=self.read_only,
                token=self._token,
                verbose=False,
                version_state=self.version_state,
            )
        else:
            raise InvalidKeyTypeError(item)

    @hub_reporter.record_call
    def create_tensor(
        self,
        name: str,
        htype: str = DEFAULT_HTYPE,
        dtype: Union[str, np.dtype] = UNSPECIFIED,
        sample_compression: str = UNSPECIFIED,
        chunk_compression: str = UNSPECIFIED,
        **kwargs,
    ):
        """Creates a new tensor in the dataset.

        Args:
            name (str): The name of the tensor to be created.
            htype (str): The class of data for the tensor.
                The defaults for other parameters are determined in terms of this value.
                For example, `htype="image"` would have `dtype` default to `uint8`.
                These defaults can be overridden by explicitly passing any of the other parameters to this function.
                May also modify the defaults for other parameters.
            dtype (str): Optionally override this tensor's `dtype`. All subsequent samples are required to have this `dtype`.
            sample_compression (str): All samples will be compressed in the provided format. If `None`, samples are uncompressed.
            chunk_compression (str): All chunks will be compressed in the provided format. If `None`, chunks are uncompressed.
            **kwargs: `htype` defaults can be overridden by passing any of the compatible parameters.
                To see all `htype`s and their correspondent arguments, check out `hub/htypes.py`.

        Returns:
            The new tensor, which can also be accessed by `self[name]`.

        Raises:
            TensorAlreadyExistsError: Duplicate tensors are not allowed.
            TensorGroupAlreadyExistsError: Duplicate tensor groups are not allowed.
            InvalidTensorNameError: If `name` is in dataset attributes.
            NotImplementedError: If trying to override `chunk_compression`.
        """
        # if not the head node, checkout to an auto branch that is newly created
        auto_checkout(self.version_state, self.storage)
        name = name.strip("/")

        while "//" in name:
            name = name.replace("//", "/")

        full_path = posixpath.join(self.group_index, name)

        if tensor_exists(full_path, self.storage, self.version_state["commit_id"]):
            raise TensorAlreadyExistsError(name)

        if full_path in self._groups:
            raise TensorGroupAlreadyExistsError(name)

        if not name or name in dir(self):
            raise InvalidTensorNameError(name)

        if not self._is_root():
            return self.root.create_tensor(
                full_path, htype, dtype, sample_compression, chunk_compression, **kwargs
            )

        if "/" in name:
            self._create_group(posixpath.split(name)[0])

        # Separate meta and info

        htype_config = HTYPE_CONFIGURATIONS[htype].copy()
        info_keys = htype_config.pop("_info", [])
        info_kwargs = {}
        meta_kwargs = {}
        for k, v in kwargs.items():
            if k in info_keys:
                info_kwargs[k] = v
            else:
                meta_kwargs[k] = v

        # Set defaults
        for k in info_keys:
            if k not in info_kwargs:
                info_kwargs[k] = htype_config[k]

        create_tensor(
            name,
            self.storage,
            htype=htype,
            dtype=dtype,
            sample_compression=sample_compression,
            chunk_compression=chunk_compression,
            version_state=self.version_state,
            **meta_kwargs,
        )
        self.version_state["meta"].tensors.append(name)
        ffw_dataset_meta(self.version_state["meta"])
        self.storage.maybe_flush()
        tensor = Tensor(name, self.storage, self.version_state)  # type: ignore

        self.version_state["full_tensors"][name] = tensor
        tensor.info.update(info_kwargs)
        return tensor

    @hub_reporter.record_call
    def create_tensor_like(self, name: str, source: "Tensor") -> "Tensor":
        """Copies the `source` tensor's meta information and creates a new tensor with it. No samples are copied, only the meta/info for the tensor is.

        Args:
            name (str): Name for the new tensor.
            source (Tensor): Tensor whose meta/info will be copied. May or may not be contained in the same dataset.

        Returns:
            Tensor: New Tensor object.
        """

        info = source.info.__getstate__().copy()
        meta = source.meta.__getstate__().copy()
        del meta["min_shape"]
        del meta["max_shape"]
        del meta["length"]
        del meta["version"]

        destination_tensor = self.create_tensor(
            name,
            **meta,
        )
        destination_tensor.info.update(info)

        return destination_tensor

    __getattr__ = __getitem__

    def __setattr__(self, name: str, value):
        if isinstance(value, (np.ndarray, np.generic)):
            raise TypeError(
                "Setting tensor attributes directly is not supported. To add a tensor, use the `create_tensor` method."
                + "To add data to a tensor, use the `append` and `extend` methods."
            )
        else:
            return super().__setattr__(name, value)

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def _load_version_info(self):
        """Loads data from version_control_file otherwise assume it doesn't exist and load all empty"""
        branch = "main"
        version_state = {"branch": branch}
        try:
            version_info = pickle.loads(self.storage[get_version_control_info_key()])
            version_state["branch_commit_map"] = version_info["branch_commit_map"]
            version_state["commit_node_map"] = version_info["commit_node_map"]
            commit_id = version_state["branch_commit_map"][branch]
            version_state["commit_id"] = commit_id
            version_state["commit_node"] = version_state["commit_node_map"][commit_id]
        except Exception:
            version_state["branch_commit_map"] = {}
            version_state["commit_node_map"] = {}
            # used to identify that this is the first commit so its data will not be in similar directory structure to the rest
            commit_id = FIRST_COMMIT_ID
            commit_node = CommitNode(branch, commit_id)
            version_state["commit_id"] = commit_id
            version_state["commit_node"] = commit_node
            version_state["branch_commit_map"][branch] = commit_id
            version_state["commit_node_map"][commit_id] = commit_node
        version_state["full_tensors"] = {}  # keeps track of the full unindexed tensors
        self.version_state = version_state

    def commit(self, message: Optional[str] = None) -> str:
        """Stores a snapshot of the current state of the dataset.
        Note: Committing from a non-head node in any branch will lead to an auto checkout to a new branch.
        This same behaviour will happen if new samples are added or existing samples are updated from a non-head node.

        Args:
            message (str, optional): Used to describe the commit.

        Returns:
            str: the commit id of the stored commit that can be used to access the snapshot.
        """
        commit_id = self.version_state["commit_id"]
        commit(self.version_state, self.storage, message)

        # do not store commit message
        hub_reporter.feature_report(
            feature_name="commit",
            parameters={},
        )

        return commit_id

    def checkout(self, address: str, create: bool = False) -> str:
        """Checks out to a specific commit_id or branch. If create = True, creates a new branch with name as address.
        Note: Checkout from a head node in any branch that contains uncommitted data will lead to an auto commit before the checkout.

        Args:
            address (str): The commit_id or branch to checkout to.
            create (bool): If True, creates a new branch with name as address.

        Returns:
            str: The commit_id of the dataset after checkout.
        """
        checkout(self.version_state, self.storage, address, create)

        # do not store address
        hub_reporter.feature_report(
            feature_name="checkout",
            parameters={"Create": str(create)},
        )

        return self.version_state["commit_id"]

    def log(self):
        """Displays the details of all the past commits."""
        # TODO: use logger.info instead of prints
        commit_node = self.version_state["commit_node"]
        logger.info("---------------\nHub Version Log\n---------------\n")
        logger.info(f"Current Branch: {self.version_state['branch']}\n")
        while commit_node:
            if commit_node.commit_time is not None:
                logger.info(f"{commit_node}\n")
            commit_node = commit_node.parent

    def _populate_meta(self):
        """Populates the meta information for the dataset."""

        if dataset_exists(self.storage):
            if self.verbose:
                logger.info(f"{self.path} loaded successfully.")
            load_meta(self.storage, self.version_state)

        elif not self.storage.empty():
            # dataset does not exist, but the path was not empty
            raise PathNotEmptyException

        else:
            if self.read_only:
                # cannot create a new dataset when in read_only mode.
                raise CouldNotCreateNewDatasetException(self.path)
            meta_key = get_dataset_meta_key(self.version_state["commit_id"])
            self.version_state["meta"] = DatasetMeta()
            self.storage[meta_key] = self.version_state["meta"]
            self.flush()

    @property
    def read_only(self):
        return self._read_only

    @read_only.setter
    def read_only(self, value: bool):
        if value:
            self.storage.enable_readonly()
        else:
            self.storage.disable_readonly()
        self._read_only = value

    @hub_reporter.record_call
    def pytorch(
        self,
        transform: Optional[Callable] = None,
        tensors: Optional[Sequence[str]] = None,
        num_workers: int = 1,
        batch_size: int = 1,
        drop_last: bool = False,
        collate_fn: Optional[Callable] = None,
        pin_memory: bool = False,
        shuffle: bool = False,
        buffer_size: int = 10 * 1000,
        use_local_cache: bool = False,
        use_progress_bar: bool = False,
    ):
        """Converts the dataset into a pytorch Dataloader.

        Note:
            Pytorch does not support uint16, uint32, uint64 dtypes. These are implicitly cast to int32, int64 and int64 respectively.
            This spins up its own workers to fetch data.

        Args:
            transform (Callable, optional) : Transformation function to be applied to each sample.
            tensors (List, optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset that has "image" and "label" tensors, if `tensors=["image", "label"]`, your training script should expect each batch will be provided as a tuple of (image, label).
            num_workers (int): The number of workers to use for fetching data in parallel.
            batch_size (int): Number of samples per batch to load. Default value is 1.
            drop_last (bool): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size.
                If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Default value is False.
                Read torch.utils.data.DataLoader docs for more details.
            collate_fn (Callable, optional): merges a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
                Read torch.utils.data.DataLoader docs for more details.
            pin_memory (bool): If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Default value is False.
                Read torch.utils.data.DataLoader docs for more details.
            shuffle (bool): If True, the data loader will shuffle the data indices. Default value is False.
            buffer_size (int): The size of the buffer used to prefetch/shuffle in MB. The buffer uses shared memory under the hood. Default value is 10 GB. Increasing the buffer_size will increase the extent of shuffling.
            use_local_cache (bool): If True, the data loader will use a local cache to store data. This is useful when the dataset can fit on the machine and we don't want to fetch the data multiple times for each iteration. Default value is False.
            use_progress_bar (bool): If True, tqdm will be wrapped around the returned dataloader. Default value is False.

        Returns:
            A torch.utils.data.DataLoader object.
        """
        from hub.integrations import dataset_to_pytorch

        dataloader = dataset_to_pytorch(
            self,
            transform,
            tensors,
            num_workers=num_workers,
            batch_size=batch_size,
            drop_last=drop_last,
            collate_fn=collate_fn,
            pin_memory=pin_memory,
            shuffle=shuffle,
            buffer_size=buffer_size,
            use_local_cache=use_local_cache,
        )

        if use_progress_bar:
            dataloader = tqdm(dataloader, desc=self.path, total=len(self) // batch_size)

        return dataloader

    def _get_total_meta(self):
        """Returns tensor metas all together"""
        return {
            tensor_key: tensor_value.meta
            for tensor_key, tensor_value in self.version_state["full_tensors"].items()
        }

    def _set_derived_attributes(self):
        """Sets derived attributes during init and unpickling."""

        if self.index.is_trivial() and self._is_root():
            self.storage.autoflush = True

        if not self.version_state:
            self._load_version_info()

        self._populate_meta()  # TODO: use the same scheme as `load_info`
        self.info = load_info(get_dataset_info_key(self.version_state["commit_id"]), self.storage, self.version_state)  # type: ignore
        self.index.validate(self.num_samples)

    @hub_reporter.record_call
    def tensorflow(self):
        """Converts the dataset into a tensorflow compatible format.

        See:
            https://www.tensorflow.org/api_docs/python/tf/data/Dataset

        Returns:
            tf.data.Dataset object that can be used for tensorflow training.
        """
        return dataset_to_tensorflow(self)

    def flush(self):
        """Necessary operation after writes if caches are being used.
        Writes all the dirty data from the cache layers (if any) to the underlying storage.
        Here dirty data corresponds to data that has been changed/assigned but hasn't yet been sent to the
        underlying storage.
        """
        self.storage.flush()

    def clear_cache(self):
        """Flushes (see Dataset.flush documentation) the contents of the cache layers (if any) and then deletes contents
         of all the layers of it.
        This doesn't delete data from the actual storage.
        This is useful if you have multiple datasets with memory caches open, taking up too much RAM.
        Also useful when local cache is no longer needed for certain datasets and is taking up storage space.
        """
        if hasattr(self.storage, "clear_cache"):
            self.storage.clear_cache()

    def size_approx(self):
        """Estimates the size in bytes of the dataset.
        Includes only content, so will generally return an under-estimate.
        """
        tensors = self.version_state["full_tensors"].values()
        chunk_engines = [tensor.chunk_engine for tensor in tensors]
        size = sum(c.num_chunks * c.min_chunk_size for c in chunk_engines)
        return size

    @hub_reporter.record_call
    def delete(self, large_ok=False):
        """Deletes the entire dataset from the cache layers (if any) and the underlying storage.
        This is an IRREVERSIBLE operation. Data once deleted can not be recovered.

        Args:
            large_ok (bool): Delete datasets larger than 1GB. Disabled by default.
        """

        if not large_ok:
            size = self.size_approx()
            if size > hub.constants.DELETE_SAFETY_SIZE:
                logger.info(
                    f"Hub Dataset {self.path} was too large to delete. Try again with large_ok=True."
                )
                return

        unlock(self.storage)
        self.storage.clear()

    def __str__(self):
        path_str = ""
        if self.path:
            path_str = f"path='{self.path}', "

        mode_str = ""
        if self.read_only:
            mode_str = f"read_only=True, "

        index_str = f"index={self.index}, "
        if self.index.is_trivial():
            index_str = ""

        group_index_str = (
            f"group_index='{self.group_index}', " if self.group_index else ""
        )

        return f"Dataset({path_str}{mode_str}{index_str}{group_index_str}tensors={self.version_state['meta'].tensors})"

    __repr__ = __str__

    def _get_tensor_from_root(self, name: str) -> Optional[Tensor]:
        """Gets a tensor from the root dataset.
        Accesses storage only for the first call.
        """
        ret = self.version_state["full_tensors"].get(name)
        if ret is None:
            load_meta(self.storage, self.version_state)
            ret = self.version_state["full_tensors"].get(name)
        return ret

    def _has_group_in_root(self, name: str) -> bool:
        """Checks if a group exists in the root dataset.
        This is faster than checking `if group in self._groups:`
        """
        if name in self.version_state["meta"].groups:
            return True
        load_meta(self.storage, self.version_state)
        return name in self.version_state["meta"].groups

    @property
    def token(self):
        """Get attached token of the dataset"""

        return self._token

    @property
    def _ungrouped_tensors(self) -> Dict[str, Tensor]:
        """Top level tensors in this group that do not belong to any sub groups"""
        return {
            posixpath.basename(k): v
            for k, v in self.version_state["full_tensors"].items()
            if posixpath.dirname(k) == self.group_index
        }

    @property
    def _all_tensors_filtered(self) -> List[str]:
        """Names of all tensors belonging to this group, including those within sub groups"""
        load_meta(self.storage, self.version_state)
        return [
            posixpath.relpath(t, self.group_index)
            for t in self.version_state["full_tensors"]
            if not self.group_index or t.startswith(self.group_index + "/")
        ]

    @property
    def tensors(self) -> Dict[str, Tensor]:
        """All tensors belonging to this group, including those within sub groups. Always returns the sliced tensors."""
        return {
            t: self.version_state["full_tensors"][posixpath.join(self.group_index, t)][
                self.index
            ]
            for t in self._all_tensors_filtered
        }

    @property
    def _groups(self) -> List[str]:
        """Names of all groups in the root dataset"""
        meta_key = get_dataset_meta_key(self.version_state["commit_id"])
        return self.storage.get_cachable(meta_key, DatasetMeta).groups  # type: ignore

    @property
    def _groups_filtered(self) -> List[str]:
        """Names of all sub groups in this group"""
        groups_filtered = []
        for g in self._groups:
            dirname, basename = posixpath.split(g)
            if dirname == self.group_index:
                groups_filtered.append(basename)
        return groups_filtered

    @property
    def groups(self) -> Dict[str, "Dataset"]:
        """All sub groups in this group"""
        return {g: self[g] for g in self._groups_filtered}

    @property
    def commit_id(self) -> str:
        """The current commit_id of the dataset."""
        return self.version_state["commit_id"]

    @property
    def branch(self) -> str:
        """The current branch of the dataset"""
        return self.version_state["branch"]

    def _is_root(self) -> bool:
        return not self.group_index

    @property
    def parent(self):
        """Returns the parent of this group. Returns None if this is the root dataset"""
        if self._is_root():
            return None
        autoflush = self.storage.autoflush
        ds = self.__class__(
            self.storage,
            self.index,
            posixpath.dirname(self.group_index),
            self.read_only,
            self.public,
            self._token,
            self.verbose,
        )
        self.storage.autoflush = autoflush
        return ds

    @property
    def root(self):
        if self._is_root():
            return self
        autoflush = self.storage.autoflush
        ds = self.__class__(
            self.storage,
            self.index,
            "",
            self.read_only,
            self.public,
            self._token,
            self.verbose,
        )
        self.storage.autoflush = autoflush
        return ds

    def _create_group(self, name: str) -> "Dataset":
        """Internal method used by `create_group` and `create_tensor`."""
        meta_key = get_dataset_meta_key(self.version_state["commit_id"])
        meta = self.storage.get_cachable(meta_key, DatasetMeta)
        groups = meta.groups
        if not name or name in dir(self):
            raise InvalidTensorGroupNameError(name)
        fullname = name
        while name:
            if name in self.version_state["full_tensors"]:
                raise TensorAlreadyExistsError(name)
            groups.append(name)
            name, _ = posixpath.split(name)
        meta.groups = list(set(groups))
        self.storage[meta_key] = meta
        self.storage.maybe_flush()
        return self[fullname]

    def create_group(self, name: str) -> "Dataset":
        """Creates a tensor group. Intermediate groups in the path are also created."""
        if not self._is_root():
            return self.root.create_group(posixpath.join(self.group_index, name))
        name = name.strip("/")
        while "//" in name:
            name = name.replace("//", "/")
        if name in self._groups:
            raise TensorGroupAlreadyExistsError(name)
        return self._create_group(name)

    # the below methods are used by cloudpickle dumps
    def __origin__(self):
        return None

    def __values__(self):
        return None

    def __type__(self):
        return None

    def __union_params__(self):
        return None

    def __tuple_params__(self):
        return None

    def __result__(self):
        return None

    def __args__(self):
        return None
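
A hedged usage sketch for the cache-related helpers above (flush, clear_cache, size_approx), assuming a writable dataset already opened as ds and containing a tensor named "my_tensor" (both names are placeholders):

>>> import numpy as np
>>> ds["my_tensor"].append(np.zeros((4, 4)))   # writes go through the cache layers
>>> ds.flush()                                 # push dirty cached data to the underlying storage
>>> ds.size_approx()                           # rough content size in bytes (an under-estimate)
>>> ds.clear_cache()                           # flush, then drop cached contents to free RAM/disk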

Subclasses

  • hub.core.dataset.hub_cloud_dataset.HubCloudDataset

Instance variables

var branch

The current branch of the dataset

Expand source code
@property
def branch(self) -> str:
    """The current branch of the dataset"""
    return self.version_state["branch"]
var commit_id

The current commit_id of the dataset.

Expand source code
@property
def commit_id(self) -> str:
    """The current commit_id of the dataset."""
    return self.version_state["commit_id"]
var groups

All sub groups in this group

Expand source code
@property
def groups(self) -> Dict[str, "Dataset"]:
    """All sub groups in this group"""
    return {g: self[g] for g in self._groups_filtered}
var meta

Returns the metadata of the dataset.

Expand source code
@property
def meta(self) -> DatasetMeta:
    """Returns the metadata of the dataset."""
    return self.version_state["meta"]
var parent

Returns the parent of this group. Returns None if this is the root dataset

Expand source code
@property
def parent(self):
    """Returns the parent of this group. Returns None if this is the root dataset"""
    if self._is_root():
        return None
    autoflush = self.storage.autoflush
    ds = self.__class__(
        self.storage,
        self.index,
        posixpath.dirname(self.group_index),
        self.read_only,
        self.public,
        self._token,
        self.verbose,
    )
    self.storage.autoflush = autoflush
    return ds
var root
Expand source code
@property
def root(self):
    if self._is_root():
        return self
    autoflush = self.storage.autoflush
    ds = self.__class__(
        self.storage,
        self.index,
        "",
        self.read_only,
        self.public,
        self._token,
        self.verbose,
    )
    self.storage.autoflush = autoflush
    return ds
var tensors

All tensors belonging to this group, including those within sub groups. Always returns the sliced tensors.

Expand source code
@property
def tensors(self) -> Dict[str, Tensor]:
    """All tensors belonging to this group, including those within sub groups. Always returns the sliced tensors."""
    return {
        t: self.version_state["full_tensors"][posixpath.join(self.group_index, t)][
            self.index
        ]
        for t in self._all_tensors_filtered
    }

Methods

def checkout(self, address, create=False)

Checks out to a specific commit_id or branch. If create = True, creates a new branch with name as address. Note: Checkout from a head node in any branch that contains uncommitted data will lead to an auto commit before the checkout.

Args

address : str
The commit_id or branch to checkout to.
create : bool
If True, creates a new branch with name as address.

Returns

str
The commit_id of the dataset after checkout.
Expand source code
def checkout(self, address: str, create: bool = False) -> str:
    """Checks out to a specific commit_id or branch. If create = True, creates a new branch with name as address.
    Note: Checkout from a head node in any branch that contains uncommitted data will lead to an auto commit before the checkout.

    Args:
        address (str): The commit_id or branch to checkout to.
        create (bool): If True, creates a new branch with name as address.

    Returns:
        str: The commit_id of the dataset after checkout.
    """
    checkout(self.version_state, self.storage, address, create)

    # do not store address
    hub_reporter.feature_report(
        feature_name="checkout",
        parameters={"Create": str(create)},
    )

    return self.version_state["commit_id"]
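
A minimal, hedged sketch of switching branches with checkout, assuming an existing dataset object ds (branch names are placeholders):

>>> head = ds.checkout("alt_branch", create=True)   # create a new branch and switch to it
>>> ds.branch
'alt_branch'
>>> head = ds.checkout("main")                      # switch back by branch name or commit_id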
def commit(self, message=None)

Stores a snapshot of the current state of the dataset. Note: Committing from a non-head node in any branch will lead to an automatic checkout to a new branch. The same behaviour occurs if new samples are added or existing samples are updated from a non-head node.

Args

message : str, optional
Used to describe the commit.

Returns

str
the commit id of the stored commit that can be used to access the snapshot.
Expand source code
def commit(self, message: Optional[str] = None) -> str:
    """Stores a snapshot of the current state of the dataset.
    Note: Committing from a non-head node in any branch will lead to an automatic checkout to a new branch.
    The same behaviour occurs if new samples are added or existing samples are updated from a non-head node.

    Args:
        message (str, optional): Used to describe the commit.

    Returns:
        str: the commit id of the stored commit that can be used to access the snapshot.
    """
    commit_id = self.version_state["commit_id"]
    commit(self.version_state, self.storage, message)

    # do not store commit message
    hub_reporter.feature_report(
        feature_name="commit",
        parameters={},
    )

    return commit_id
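
A short, hedged sketch of taking a snapshot with commit, assuming a writable dataset ds with a "labels" tensor (a placeholder name):

>>> ds["labels"].append(1)
>>> snapshot_id = ds.commit("added one label")   # id of the stored snapshot
>>> ds.log()                                     # the new commit shows up in the version log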
def create_group(self, name)

Creates a tensor group. Intermediate groups in the path are also created.

Expand source code
def create_group(self, name: str) -> "Dataset":
    """Creates a tensor group. Intermediate groups in the path are also created."""
    if not self._is_root():
        return self.root.create_group(posixpath.join(self.group_index, name))
    name = name.strip("/")
    while "//" in name:
        name = name.replace("//", "/")
    if name in self._groups:
        raise TensorGroupAlreadyExistsError(name)
    return self._create_group(name)
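
A hedged example of building a group hierarchy, assuming a writable dataset ds (all names are placeholders):

>>> ds.create_group("images/train")           # the intermediate group "images" is created too
>>> "train" in ds["images"].groups
True
>>> ds.create_tensor("images/train/pixels")   # tensors can also be created inside groups by path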
def create_tensor(self, name, htype='generic', dtype='unspecified', sample_compression='unspecified', chunk_compression='unspecified', **kwargs)

Creates a new tensor in the dataset.

Args

name : str
The name of the tensor to be created.
htype : str
The class of data for the tensor. The defaults for other parameters are determined in terms of this value. For example, htype="image" would have dtype default to uint8. These defaults can be overridden by explicitly passing any of the other parameters to this function. May also modify the defaults for other parameters.
dtype : str
Optionally override this tensor's dtype. All subsequent samples are required to have this dtype.
sample_compression : str
All samples will be compressed in the provided format. If None, samples are uncompressed.
chunk_compression : str
All chunks will be compressed in the provided format. If None, chunks are uncompressed.
**kwargs
hub.htype defaults can be overridden by passing any of the compatible parameters. To see all hub.htypes and their corresponding arguments, check out hub/htypes.py.

Returns

The new tensor, which can also be accessed by self[name].

Raises

TensorAlreadyExistsError
Duplicate tensors are not allowed.
TensorGroupAlreadyExistsError
Duplicate tensor groups are not allowed.
InvalidTensorNameError
If name is in dataset attributes.
NotImplementedError
If trying to override chunk_compression.
Expand source code
@hub_reporter.record_call
def create_tensor(
    self,
    name: str,
    htype: str = DEFAULT_HTYPE,
    dtype: Union[str, np.dtype] = UNSPECIFIED,
    sample_compression: str = UNSPECIFIED,
    chunk_compression: str = UNSPECIFIED,
    **kwargs,
):
    """Creates a new tensor in the dataset.

    Args:
        name (str): The name of the tensor to be created.
        htype (str): The class of data for the tensor.
            The defaults for other parameters are determined in terms of this value.
            For example, `htype="image"` would have `dtype` default to `uint8`.
            These defaults can be overridden by explicitly passing any of the other parameters to this function.
            May also modify the defaults for other parameters.
        dtype (str): Optionally override this tensor's `dtype`. All subsequent samples are required to have this `dtype`.
        sample_compression (str): All samples will be compressed in the provided format. If `None`, samples are uncompressed.
        chunk_compression (str): All chunks will be compressed in the provided format. If `None`, chunks are uncompressed.
        **kwargs: `htype` defaults can be overridden by passing any of the compatible parameters.
            To see all `htype`s and their corresponding arguments, check out `hub/htypes.py`.

    Returns:
        The new tensor, which can also be accessed by `self[name]`.

    Raises:
        TensorAlreadyExistsError: Duplicate tensors are not allowed.
        TensorGroupAlreadyExistsError: Duplicate tensor groups are not allowed.
        InvalidTensorNameError: If `name` is in dataset attributes.
        NotImplementedError: If trying to override `chunk_compression`.
    """
    # if not the head node, checkout to an auto branch that is newly created
    auto_checkout(self.version_state, self.storage)
    name = name.strip("/")

    while "//" in name:
        name = name.replace("//", "/")

    full_path = posixpath.join(self.group_index, name)

    if tensor_exists(full_path, self.storage, self.version_state["commit_id"]):
        raise TensorAlreadyExistsError(name)

    if full_path in self._groups:
        raise TensorGroupAlreadyExistsError(name)

    if not name or name in dir(self):
        raise InvalidTensorNameError(name)

    if not self._is_root():
        return self.root.create_tensor(
            full_path, htype, dtype, sample_compression, chunk_compression, **kwargs
        )

    if "/" in name:
        self._create_group(posixpath.split(name)[0])

    # Separate meta and info

    htype_config = HTYPE_CONFIGURATIONS[htype].copy()
    info_keys = htype_config.pop("_info", [])
    info_kwargs = {}
    meta_kwargs = {}
    for k, v in kwargs.items():
        if k in info_keys:
            info_kwargs[k] = v
        else:
            meta_kwargs[k] = v

    # Set defaults
    for k in info_keys:
        if k not in info_kwargs:
            info_kwargs[k] = htype_config[k]

    create_tensor(
        name,
        self.storage,
        htype=htype,
        dtype=dtype,
        sample_compression=sample_compression,
        chunk_compression=chunk_compression,
        version_state=self.version_state,
        **meta_kwargs,
    )
    self.version_state["meta"].tensors.append(name)
    ffw_dataset_meta(self.version_state["meta"])
    self.storage.maybe_flush()
    tensor = Tensor(name, self.storage, self.version_state)  # type: ignore

    self.version_state["full_tensors"][name] = tensor
    tensor.info.update(info_kwargs)
    return tensor
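
A hedged sketch of creating and filling tensors, assuming a writable dataset ds and a local image file (the path is a placeholder):

>>> ds.create_tensor("labels", htype="class_label")
>>> ds.create_tensor("images", htype="image", sample_compression="jpeg")
>>> ds["labels"].append(0)
>>> ds["images"].append(hub.read("path/to/image.jpg"))   # hub.read wraps the file for ingestion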
def create_tensor_like(self, name, source)

Copies the source tensor's meta information and creates a new tensor with it. No samples are copied, only the meta/info for the tensor is.

Args

name : str
Name for the new tensor.
source : Tensor
Tensor whose meta/info will be copied. May or may not be contained in the same dataset.

Returns

Tensor
New Tensor object.
Expand source code
@hub_reporter.record_call
def create_tensor_like(self, name: str, source: "Tensor") -> "Tensor":
    """Copies the `source` tensor's meta information and creates a new tensor with it. No samples are copied, only the meta/info for the tensor is.

    Args:
        name (str): Name for the new tensor.
        source (Tensor): Tensor whose meta/info will be copied. May or may not be contained in the same dataset.

    Returns:
        Tensor: New Tensor object.
    """

    info = source.info.__getstate__().copy()
    meta = source.meta.__getstate__().copy()
    del meta["min_shape"]
    del meta["max_shape"]
    del meta["length"]
    del meta["version"]

    destination_tensor = self.create_tensor(
        name,
        **meta,
    )
    destination_tensor.info.update(info)

    return destination_tensor
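
A hedged example, assuming a writable dataset ds that already has an "images" tensor (a placeholder name):

>>> ds.create_tensor_like("more_images", ds["images"])   # same htype/dtype/compression, no samples
>>> len(ds["more_images"])
0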
def log(self)

Displays the details of all the past commits.

Expand source code
def log(self):
    """Displays the details of all the past commits."""
    # TODO: use logger.info instead of prints
    commit_node = self.version_state["commit_node"]
    logger.info("---------------\nHub Version Log\n---------------\n")
    logger.info(f"Current Branch: {self.version_state['branch']}\n")
    while commit_node:
        if commit_node.commit_time is not None:
            logger.info(f"{commit_node}\n")
        commit_node = commit_node.parent
def pytorch(self, transform=None, tensors=None, num_workers=1, batch_size=1, drop_last=False, collate_fn=None, pin_memory=False, shuffle=False, buffer_size=10000, use_local_cache=False, use_progress_bar=False)

Converts the dataset into a PyTorch DataLoader.

Note

PyTorch does not support the uint16, uint32 and uint64 dtypes. These are implicitly cast to int32, int64 and int64 respectively. This method spins up its own workers to fetch data.

Args

transform : Callable, optional
Transformation function to be applied to each sample.
tensors : List, optional
Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset with "image" and "label" tensors and pass tensors=["image", "label"], your training script should expect each batch to be provided as a tuple of (image, label).
num_workers : int
The number of workers to use for fetching data in parallel.
batch_size : int
Number of samples per batch to load. Default value is 1.
drop_last : bool
Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Default value is False. Read torch.utils.data.DataLoader docs for more details.
collate_fn : Callable, optional
merges a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a map-style dataset. Read torch.utils.data.DataLoader docs for more details.
pin_memory : bool
If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Default value is False. Read torch.utils.data.DataLoader docs for more details.
shuffle : bool
If True, the data loader will shuffle the data indices. Default value is False.
buffer_size : int
The size of the buffer used to prefetch/shuffle, in MB. The buffer uses shared memory under the hood. Default value is 10,000 MB (10 GB). Increasing the buffer_size will increase the extent of shuffling.
use_local_cache : bool
If True, the data loader will use a local cache to store data. This is useful when the dataset can fit on the machine and we don't want to fetch the data multiple times for each iteration. Default value is False.
use_progress_bar : bool
If True, tqdm will be wrapped around the returned dataloader. Default value is False.

Returns

A torch.utils.data.DataLoader object.

Expand source code
@hub_reporter.record_call
def pytorch(
    self,
    transform: Optional[Callable] = None,
    tensors: Optional[Sequence[str]] = None,
    num_workers: int = 1,
    batch_size: int = 1,
    drop_last: bool = False,
    collate_fn: Optional[Callable] = None,
    pin_memory: bool = False,
    shuffle: bool = False,
    buffer_size: int = 10 * 1000,
    use_local_cache: bool = False,
    use_progress_bar: bool = False,
):
    """Converts the dataset into a pytorch Dataloader.

    Note:
        Pytorch does not support uint16, uint32, uint64 dtypes. These are implicitly type casted to int32, int64 and int64 respectively.
        This spins up it's own workers to fetch data.

    Args:
        transform (Callable, optional): Transformation function to be applied to each sample.
        tensors (List, optional): Optionally provide a list of tensor names in the ordering that your training script expects. For example, if you have a dataset with "image" and "label" tensors and pass `tensors=["image", "label"]`, your training script should expect each batch to be provided as a tuple of (image, label).
        num_workers (int): The number of workers to use for fetching data in parallel.
        batch_size (int): Number of samples per batch to load. Default value is 1.
        drop_last (bool): Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size.
            If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller. Default value is False.
            Read torch.utils.data.DataLoader docs for more details.
        collate_fn (Callable, optional): merges a list of samples to form a mini-batch of Tensor(s). Used when using batched loading from a map-style dataset.
            Read torch.utils.data.DataLoader docs for more details.
        pin_memory (bool): If True, the data loader will copy Tensors into CUDA pinned memory before returning them. Default value is False.
            Read torch.utils.data.DataLoader docs for more details.
        shuffle (bool): If True, the data loader will shuffle the data indices. Default value is False.
        buffer_size (int): The size of the buffer used to prefetch/shuffle, in MB. The buffer uses shared memory under the hood. Default value is 10,000 MB (10 GB). Increasing the buffer_size will increase the extent of shuffling.
        use_local_cache (bool): If True, the data loader will use a local cache to store data. This is useful when the dataset can fit on the machine and we don't want to fetch the data multiple times for each iteration. Default value is False.
        use_progress_bar (bool): If True, tqdm will be wrapped around the returned dataloader. Default value is False.

    Returns:
        A torch.utils.data.DataLoader object.
    """
    from hub.integrations import dataset_to_pytorch

    dataloader = dataset_to_pytorch(
        self,
        transform,
        tensors,
        num_workers=num_workers,
        batch_size=batch_size,
        drop_last=drop_last,
        collate_fn=collate_fn,
        pin_memory=pin_memory,
        shuffle=shuffle,
        buffer_size=buffer_size,
        use_local_cache=use_local_cache,
    )

    if use_progress_bar:
        dataloader = tqdm(dataloader, desc=self.path, total=len(self) // batch_size)

    return dataloader
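
A minimal consumption sketch, assuming a dataset ds with "images" and "labels" tensors (placeholder names) and relying on the tuple batching described for the tensors argument above:

>>> loader = ds.pytorch(tensors=["images", "labels"], batch_size=16, shuffle=True, num_workers=2)
>>> for images, labels in loader:
...     pass   # replace with your training step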
def tensorflow(self)

Converts the dataset into a tensorflow compatible format.

See

https://www.tensorflow.org/api_docs/python/tf/data/Dataset

Returns

tf.data.Dataset object that can be used for tensorflow training.

Expand source code
@hub_reporter.record_call
def tensorflow(self):
    """Converts the dataset into a tensorflow compatible format.

    See:
        https://www.tensorflow.org/api_docs/python/tf/data/Dataset

    Returns:
        tf.data.Dataset object that can be used for tensorflow training.
    """
    return dataset_to_tensorflow(self)
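
A hedged sketch of consuming the returned tf.data.Dataset, assuming a dataset ds with an "images" tensor (a placeholder name) and assuming elements are keyed by tensor name:

>>> tf_ds = ds.tensorflow()
>>> for sample in tf_ds.batch(8).take(1):   # standard tf.data transformations apply
...     print(sample["images"].shape)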
class tensor (key, storage, version_state, index=None)

Initializes a new tensor.

Note

This operation does not create a new tensor in the storage provider, and should normally only be performed by Hub internals.

Args

key : str
The internal identifier for this tensor.
storage : LRUCache
The storage provider for the parent dataset.
version_state : Dict[str, Any]
The version state of the dataset, includes commit_id, commit_node, branch, branch_commit_map and commit_node_map.
index
The Index object restricting the view of this tensor. Can be an int, slice, or (used internally) an Index object.

Raises

TensorDoesNotExistError
If no tensor with key exists.
Expand source code
class Tensor:
    def __init__(
        self,
        key: str,
        storage: LRUCache,
        version_state: Dict[str, Any],
        index: Optional[Index] = None,
    ):
        """Initializes a new tensor.

        Note:
            This operation does not create a new tensor in the storage provider,
            and should normally only be performed by Hub internals.

        Args:
            key (str): The internal identifier for this tensor.
            storage (LRUCache): The storage provider for the parent dataset.
            version_state (Dict[str, Any]): The version state of the dataset, includes commit_id, commit_node, branch, branch_commit_map and commit_node_map.
            index: The Index object restricting the view of this tensor.
                Can be an int, slice, or (used internally) an Index object.

        Raises:
            TensorDoesNotExistError: If no tensor with `key` exists.
        """

        self.key = key
        self.storage = storage
        self.index = index or Index()
        self.version_state = version_state

        if not tensor_exists(self.key, self.storage, version_state["commit_id"]):
            raise TensorDoesNotExistError(self.key)

        self.chunk_engine = ChunkEngine(self.key, self.storage, self.version_state)
        self.index.validate(self.num_samples)
        self.info = load_info(
            get_tensor_info_key(self.key, version_state["commit_id"]),
            self.storage,
            version_state,
        )

        # An optimization to skip multiple .numpy() calls when performing inplace ops on slices:
        self._skip_next_setitem = False

    def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]):
        """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array,
        or a sequence of `hub.read` outputs, which can be used to load files. See examples down below.

        Example:
            numpy input:
                >>> len(tensor)
                0
                >>> tensor.extend(np.zeros((100, 28, 28, 1)))
                >>> len(tensor)
                100

            file input:
                >>> len(tensor)
                0
                >>> tensor.extend([
                        hub.read("path/to/image1"),
                        hub.read("path/to/image2"),
                    ])
                >>> len(tensor)
                2


        Args:
            samples (np.ndarray, Sequence, Sequence[Sample]): The data to add to the tensor.
                The length should be equal to the number of samples to add.

        Raises:
            TensorDtypeMismatchError: Dtype for array must be equal to or castable to this tensor's dtype.
        """

        self.chunk_engine.extend(samples)

    def append(
        self,
        sample: Union[np.ndarray, float, int, Sample],
    ):
        """Appends a single sample to the end of the tensor. Can be an array, scalar value, or the return value from `hub.read`,
        which can be used to load files. See examples down below.

        Examples:
            numpy input:
                >>> len(tensor)
                0
                >>> tensor.append(np.zeros((28, 28, 1)))
                >>> len(tensor)
                1

            file input:
                >>> len(tensor)
                0
                >>> tensor.append(hub.read("path/to/file"))
                >>> len(tensor)
                1

        Args:
            sample (np.ndarray, float, int, Sample): The data to append to the tensor. `Sample` is generated by `hub.read`. See the above examples.
        """
        self.extend([sample])

    @property
    def meta(self):
        return self.chunk_engine.tensor_meta

    @property
    def shape(self) -> Tuple[Optional[int], ...]:
        """Get the shape of this tensor. Length is included.

        Note:
            If you don't want `None` in the output shape or want the lower/upper bound shapes,
            use `tensor.shape_interval` instead.

        Example:
            >>> tensor.append(np.zeros((10, 10)))
            >>> tensor.append(np.zeros((10, 15)))
            >>> tensor.shape
            (2, 10, None)

        Returns:
            tuple: Tuple where each value is either `None` (if that axis is dynamic) or
                an `int` (if that axis is fixed).
        """
        shape = self.shape_interval.astuple()
        if self.index.values[0].subscriptable():
            return shape
        return shape[1:]

    @property
    def ndim(self) -> int:
        return len(self.shape)

    @property
    def dtype(self) -> np.dtype:
        if self.htype in ("json", "list"):
            return np.dtype(str)
        if self.meta.dtype:
            return np.dtype(self.meta.dtype)
        return None

    @property
    def htype(self):
        return self.meta.htype

    @property
    def shape_interval(self) -> ShapeInterval:
        """Returns a `ShapeInterval` object that describes this tensor's shape more accurately. Length is included.

        Note:
            If you are expecting a `tuple`, use `tensor.shape` instead.

        Example:
            >>> tensor.append(np.zeros((10, 10)))
            >>> tensor.append(np.zeros((10, 15)))
            >>> tensor.shape_interval
            ShapeInterval(lower=(2, 10, 10), upper=(2, 10, 15))
            >>> str(tensor.shape_interval)
            (2, 10, 10:15)

        Returns:
            ShapeInterval: Object containing `lower` and `upper` properties.
        """

        length = [len(self)]

        min_shape = length + list(self.meta.min_shape)
        max_shape = length + list(self.meta.max_shape)

        return ShapeInterval(min_shape, max_shape)

    @property
    def is_dynamic(self) -> bool:
        """Will return True if samples in this tensor have shapes that are unequal."""
        return self.shape_interval.is_dynamic

    @property
    def num_samples(self) -> int:
        """Returns the length of the primary axis of the tensor.
        Ignores any applied indexing and returns the total length.
        """
        return self.chunk_engine.num_samples

    def __len__(self):
        """Returns the length of the primary axis of the tensor.
        Accounts for indexing into the tensor object.

        Examples:
            >>> len(tensor)
            0
            >>> tensor.extend(np.zeros((100, 10, 10)))
            >>> len(tensor)
            100
            >>> len(tensor[5:10])
            5

        Returns:
            int: The current length of this tensor.
        """

        # catch corrupted datasets / user tampering ASAP
        self.chunk_engine.validate_num_samples_is_synchronized()

        return self.index.length(self.meta.length)

    def __getitem__(
        self,
        item: Union[int, slice, List[int], Tuple[Union[int, slice, Tuple[int]]], Index],
    ):
        if not isinstance(item, (int, slice, list, tuple, Index)):
            raise InvalidKeyTypeError(item)
        return Tensor(
            self.key,
            self.storage,
            self.version_state,
            index=self.index[item],
        )

    def _get_bigger_dtype(self, d1, d2):
        if np.can_cast(d1, d2):
            if np.can_cast(d2, d1):
                return d1
            else:
                return d2
        else:
            if np.can_cast(d2, d1):
                return d2
            else:
                return np.object_

    def _infer_np_dtype(self, val: Any) -> np.dtype:
        # TODO refac
        if hasattr(val, "dtype"):
            return val.dtype
        elif isinstance(val, int):
            return np.array(0).dtype
        elif isinstance(val, float):
            return np.array(0.0).dtype
        elif isinstance(val, str):
            return np.array("").dtype
        elif isinstance(val, bool):
            return np.bool_
        elif isinstance(val, Sequence):
            return reduce(self._get_bigger_dtype, map(self._infer_np_dtype, val))
        else:
            raise TypeError(f"Cannot infer numpy dtype for {val}")

    def __setitem__(self, item: Union[int, slice], value: Any):
        """Update samples with new values.

        Example:
            >>> tensor.append(np.zeros((10, 10)))
            >>> tensor.shape
            (1, 10, 10)
            >>> tensor[0] = np.zeros((3, 3))
            >>> tensor.shape
            (1, 3, 3)
        """
        if isinstance(value, Tensor):
            if value._skip_next_setitem:
                value._skip_next_setitem = False
                return
            value = value.numpy(aslist=True)
        item_index = Index(item)
        self.chunk_engine.update(self.index[item_index], value)

    def __iter__(self):
        for i in range(len(self)):
            yield self[i]

    def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]:
        """Computes the contents of the tensor in numpy format.

        Args:
            aslist (bool): If True, a list of np.ndarrays will be returned. Helpful for dynamic tensors.
                If False, a single np.ndarray will be returned unless the samples are dynamically shaped, in which case
                an error is raised.

        Raises:
            DynamicTensorNumpyError: If reading a dynamically-shaped array slice without `aslist=True`.

        Returns:
            A numpy array containing the data represented by this tensor.
        """

        return self.chunk_engine.numpy(self.index, aslist=aslist)

    def __str__(self):
        index_str = f", index={self.index}"
        if self.index.is_trivial():
            index_str = ""
        return f"Tensor(key={repr(self.key)}{index_str})"

    __repr__ = __str__

    def __array__(self) -> np.ndarray:
        return self.numpy()

    @_inplace_op
    def __iadd__(self, other):
        pass

    @_inplace_op
    def __isub__(self, other):
        pass

    @_inplace_op
    def __imul__(self, other):
        pass

    @_inplace_op
    def __itruediv__(self, other):
        pass

    @_inplace_op
    def __ifloordiv__(self, other):
        pass

    @_inplace_op
    def __imod__(self, other):
        pass

    @_inplace_op
    def __ipow__(self, other):
        pass

    @_inplace_op
    def __ilshift__(self, other):
        pass

    @_inplace_op
    def __irshift__(self, other):
        pass

    @_inplace_op
    def __iand__(self, other):
        pass

    @_inplace_op
    def __ixor__(self, other):
        pass

    @_inplace_op
    def __ior__(self, other):
        pass

    def data(self) -> Any:
        htype = self.htype
        if htype in ("json", "text"):

            if self.ndim == 1:
                return self.numpy()[0]
            else:
                return [sample[0] for sample in self.numpy(aslist=True)]
        elif htype == "list":
            if self.ndim == 1:
                return list(self.numpy())
            else:
                return list(map(list, self.numpy(aslist=True)))
        else:
            return self.numpy()
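
Since constructing Tensor directly is an internal operation, the typical way to obtain one is by indexing a dataset. A hedged sketch, assuming a writable dataset ds with an "images" tensor (a placeholder name):

>>> t = ds["images"]
>>> t.append(np.zeros((28, 28, 1), dtype="uint8"))
>>> t.shape
(1, 28, 28, 1)
>>> t[0].numpy().shape
(28, 28, 1)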

Instance variables

var dtype
Expand source code
@property
def dtype(self) -> np.dtype:
    if self.htype in ("json", "list"):
        return np.dtype(str)
    if self.meta.dtype:
        return np.dtype(self.meta.dtype)
    return None
var htype
Expand source code
@property
def htype(self):
    return self.meta.htype
var is_dynamic

Will return True if samples in this tensor have shapes that are unequal.

Expand source code
@property
def is_dynamic(self) -> bool:
    """Will return True if samples in this tensor have shapes that are unequal."""
    return self.shape_interval.is_dynamic
var meta
Expand source code
@property
def meta(self):
    return self.chunk_engine.tensor_meta
var ndim
Expand source code
@property
def ndim(self) -> int:
    return len(self.shape)
var num_samples

Returns the length of the primary axis of the tensor. Ignores any applied indexing and returns the total length.

Expand source code
@property
def num_samples(self) -> int:
    """Returns the length of the primary axis of the tensor.
    Ignores any applied indexing and returns the total length.
    """
    return self.chunk_engine.num_samples
var shape

Get the shape of this tensor. Length is included.

Note

If you don't want None in the output shape or want the lower/upper bound shapes, use tensor.shape_interval instead.

Example

>>> tensor.append(np.zeros((10, 10)))
>>> tensor.append(np.zeros((10, 15)))
>>> tensor.shape
(2, 10, None)

Returns

tuple
Tuple where each value is either None (if that axis is dynamic) or an int (if that axis is fixed).
Expand source code
@property
def shape(self) -> Tuple[Optional[int], ...]:
    """Get the shape of this tensor. Length is included.

    Note:
        If you don't want `None` in the output shape or want the lower/upper bound shapes,
        use `tensor.shape_interval` instead.

    Example:
        >>> tensor.append(np.zeros((10, 10)))
        >>> tensor.append(np.zeros((10, 15)))
        >>> tensor.shape
        (2, 10, None)

    Returns:
        tuple: Tuple where each value is either `None` (if that axis is dynamic) or
            an `int` (if that axis is fixed).
    """
    shape = self.shape_interval.astuple()
    if self.index.values[0].subscriptable():
        return shape
    return shape[1:]
var shape_interval

Returns a ShapeInterval object that describes this tensor's shape more accurately. Length is included.

Note

If you are expecting a tuple, use tensor.shape instead.

Example

>>> tensor.append(np.zeros((10, 10)))
>>> tensor.append(np.zeros((10, 15)))
>>> tensor.shape_interval
ShapeInterval(lower=(2, 10, 10), upper=(2, 10, 15))
>>> str(tensor.shape_interval)
(2, 10, 10:15)

Returns

ShapeInterval
Object containing lower and upper properties.
Expand source code
@property
def shape_interval(self) -> ShapeInterval:
    """Returns a `ShapeInterval` object that describes this tensor's shape more accurately. Length is included.

    Note:
        If you are expecting a `tuple`, use `tensor.shape` instead.

    Example:
        >>> tensor.append(np.zeros((10, 10)))
        >>> tensor.append(np.zeros((10, 15)))
        >>> tensor.shape_interval
        ShapeInterval(lower=(2, 10, 10), upper=(2, 10, 15))
        >>> str(tensor.shape_interval)
        (2, 10, 10:15)

    Returns:
        ShapeInterval: Object containing `lower` and `upper` properties.
    """

    length = [len(self)]

    min_shape = length + list(self.meta.min_shape)
    max_shape = length + list(self.meta.max_shape)

    return ShapeInterval(min_shape, max_shape)

Methods

def append(self, sample)

Appends a single sample to the end of the tensor. Can be an array, scalar value, or the return value from read(), which can be used to load files. See examples down below.

Examples

numpy input:

>>> len(tensor)
0
>>> tensor.append(np.zeros((28, 28, 1)))
>>> len(tensor)
1

file input:

>>> len(tensor)
0
>>> tensor.append(hub.read("path/to/file"))
>>> len(tensor)
1

Args

sample : np.ndarray, float, int, Sample
The data to append to the tensor. Sample is generated by read(). See the above examples.
Expand source code
def append(
    self,
    sample: Union[np.ndarray, float, int, Sample],
):
    """Appends a single sample to the end of the tensor. Can be an array, scalar value, or the return value from `hub.read`,
    which can be used to load files. See examples down below.

    Examples:
        numpy input:
            >>> len(tensor)
            0
            >>> tensor.append(np.zeros((28, 28, 1)))
            >>> len(tensor)
            1

        file input:
            >>> len(tensor)
            0
            >>> tensor.append(hub.read("path/to/file"))
            >>> len(tensor)
            1

    Args:
        sample (np.ndarray, float, int, Sample): The data to append to the tensor. `Sample` is generated by `hub.read`. See the above examples.
    """
    self.extend([sample])
def data(self)
Expand source code
def data(self) -> Any:
    htype = self.htype
    if htype in ("json", "text"):

        if self.ndim == 1:
            return self.numpy()[0]
        else:
            return [sample[0] for sample in self.numpy(aslist=True)]
    elif htype == "list":
        if self.ndim == 1:
            return list(self.numpy())
        else:
            return list(map(list, self.numpy(aslist=True)))
    else:
        return self.numpy()
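
data() adapts the returned value to the tensor's htype: for "json" and "text" it unwraps the length-1 sample arrays, for "list" it converts samples to Python lists, and otherwise it behaves like numpy(). A hedged sketch, assuming a writable dataset ds and that the text htype accepts plain Python strings (the tensor name is a placeholder):

>>> ds.create_tensor("captions", htype="text")
>>> ds["captions"].append("a photo of a cat")
>>> ds["captions"][0].data()    # the stored string, e.g. 'a photo of a cat'
>>> ds["captions"].data()       # a list with one string per sample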
def extend(self, samples)

Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array, or a sequence of read() outputs, which can be used to load files. See examples down below.

Example

numpy input:

>>> len(tensor)
0
>>> tensor.extend(np.zeros((100, 28, 28, 1)))
>>> len(tensor)
100

file input:

>>> len(tensor)
0
>>> tensor.extend([
...     hub.read("path/to/image1"),
...     hub.read("path/to/image2"),
... ])
>>> len(tensor)
2

Args

samples : np.ndarray, Sequence, Sequence[Sample]
The data to add to the tensor. The length should be equal to the number of samples to add.

Raises

TensorDtypeMismatchError
Dtype for array must be equal to or castable to this tensor's dtype.
Expand source code
def extend(self, samples: Union[np.ndarray, Sequence[SampleValue]]):
    """Extends the end of the tensor by appending multiple elements from a sequence. Accepts a sequence, a single batched numpy array,
    or a sequence of `hub.read` outputs, which can be used to load files. See examples down below.

    Example:
        numpy input:
            >>> len(tensor)
            0
            >>> tensor.extend(np.zeros((100, 28, 28, 1)))
            >>> len(tensor)
            100

        file input:
            >>> len(tensor)
            0
            >>> tensor.extend([
                    hub.read("path/to/image1"),
                    hub.read("path/to/image2"),
                ])
            >>> len(tensor)
            2


    Args:
        samples (np.ndarray, Sequence, Sequence[Sample]): The data to add to the tensor.
            The length should be equal to the number of samples to add.

    Raises:
        TensorDtypeMismatchError: Dtype for array must be equal to or castable to this tensor's dtype.
    """

    self.chunk_engine.extend(samples)
def numpy(self, aslist=False)

Computes the contents of the tensor in numpy format.

Args

aslist : bool
If True, a list of np.ndarrays will be returned. Helpful for dynamic tensors. If False, a single np.ndarray will be returned unless the samples are dynamically shaped, in which case an error is raised.

Raises

DynamicTensorNumpyError
If reading a dynamically-shaped array slice without aslist=True.

Returns

A numpy array containing the data represented by this tensor.

Expand source code
def numpy(self, aslist=False) -> Union[np.ndarray, List[np.ndarray]]:
    """Computes the contents of the tensor in numpy format.

    Args:
        aslist (bool): If True, a list of np.ndarrays will be returned. Helpful for dynamic tensors.
            If False, a single np.ndarray will be returned unless the samples are dynamically shaped, in which case
            an error is raised.

    Raises:
        DynamicTensorNumpyError: If reading a dynamically-shaped array slice without `aslist=True`.

    Returns:
        A numpy array containing the data represented by this tensor.
    """

    return self.chunk_engine.numpy(self.index, aslist=aslist)
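
A hedged example with a dynamically shaped tensor, assuming a writable dataset ds (the tensor name is a placeholder):

>>> ds.create_tensor("boxes")
>>> ds["boxes"].append(np.zeros((3, 4)))
>>> ds["boxes"].append(np.zeros((5, 4)))
>>> arrays = ds["boxes"].numpy(aslist=True)   # two arrays, shapes (3, 4) and (5, 4)
>>> ds["boxes"][0].numpy().shape              # single-sample reads return one array
(3, 4)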
class dataset (path, read_only=False, overwrite=False, public=True, memory_cache_size=256, local_cache_size=0, creds=None, token=None)
Expand source code
class dataset:
    def __new__(
        cls,
        path: str,
        read_only: bool = False,
        overwrite: bool = False,
        public: bool = True,
        memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
        local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
        creds: Optional[dict] = None,
        token: Optional[str] = None,
    ):
        """Returns a Dataset object referencing either a new or existing dataset.

        Important:
            Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

        Args:
            path (str): The full path to the dataset. Can be:
                - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
                - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
                - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
            read_only (bool): Opens dataset in read only mode if this is passed as True. Defaults to False.
                Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
            overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
            public (bool): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
            memory_cache_size (int): The size of the memory cache to be used in MB.
            local_cache_size (int): The size of the local filesystem cache to be used in MB.
            creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
                This takes precedence over credentials present in the environment. Currently only works with s3 paths.
                It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
            token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

        Returns:
            Dataset object created using the arguments provided.
        """
        if creds is None:
            creds = {}

        feature_report_path(path, "dataset", {"Overwrite": overwrite})

        storage, cache_chain = get_storage_and_cache_chain(
            path=path,
            read_only=read_only,
            creds=creds,
            token=token,
            memory_cache_size=memory_cache_size,
            local_cache_size=local_cache_size,
        )
        if overwrite and dataset_exists(storage):
            storage.clear()

        read_only = storage.read_only
        return get_dataset_instance(
            path, storage=cache_chain, read_only=read_only, public=public, token=token
        )
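
    # A hedged usage sketch (paths and credentials below are placeholders):
    #
    #   >>> ds = hub.dataset("./my_dataset")               # local path: creates or loads in place
    #   >>> ds = hub.dataset("hub://username/my_dataset")  # Hub cloud: run `activeloop login` first
    #   >>> ds = hub.dataset(
    #   ...     "s3://bucketname/my_dataset",
    #   ...     creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    #   ... )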

    @staticmethod
    def empty(
        path: str,
        overwrite: bool = False,
        public: Optional[bool] = True,
        memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
        local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
        creds: Optional[dict] = None,
        token: Optional[str] = None,
    ) -> Dataset:
        """Creates an empty dataset

        Important:
            Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

        Args:
            path (str): The full path to the dataset. Can be:
                - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
                - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
                - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
            overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
            public (bool, optional): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
            memory_cache_size (int): The size of the memory cache to be used in MB.
            local_cache_size (int): The size of the local filesystem cache to be used in MB.
            creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
                This takes precedence over credentials present in the environment. Currently only works with s3 paths.
                It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
            token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

        Returns:
            Dataset object created using the arguments provided.

        Raises:
            DatasetHandlerError: If a Dataset already exists at the given path and overwrite is False.
        """
        if creds is None:
            creds = {}

        feature_report_path(path, "empty", {"Overwrite": overwrite})

        storage, cache_chain = get_storage_and_cache_chain(
            path=path,
            read_only=False,
            creds=creds,
            token=token,
            memory_cache_size=memory_cache_size,
            local_cache_size=local_cache_size,
        )

        if overwrite and dataset_exists(storage):
            storage.clear()
        elif dataset_exists(storage):
            raise DatasetHandlerError(
                f"A dataset already exists at the given path ({path}). If you want to create a new empty dataset, either specify another path or use overwrite=True. If you want to load the dataset that exists at this path, use hub.load() instead."
            )

        read_only = storage.read_only
        return get_dataset_instance(
            path, storage=cache_chain, read_only=read_only, public=public, token=token
        )

    @staticmethod
    def load(
        path: str,
        read_only: bool = False,
        overwrite: bool = False,
        public: Optional[bool] = True,
        memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
        local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
        creds: Optional[dict] = None,
        token: Optional[str] = None,
    ) -> Dataset:
        """Loads an existing dataset

        Important:
            Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

        Args:
            path (str): The full path to the dataset. Can be:
                - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
                - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
                - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
            read_only (bool): Opens dataset in read only mode if this is passed as True. Defaults to False.
                Datasets stored on Hub cloud that your account does not have write access to will automatically open in read mode.
            overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
            public (bool, optional): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
            memory_cache_size (int): The size of the memory cache to be used in MB.
            local_cache_size (int): The size of the local filesystem cache to be used in MB.
            creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
                This takes precedence over credentials present in the environment. Currently only works with s3 paths.
                It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
            token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

        Returns:
            Dataset object created using the arguments provided.

        Raises:
            DatasetHandlerError: If a Dataset does not exist at the given path.
        """
        if creds is None:
            creds = {}

        feature_report_path(path, "load", {"Overwrite": overwrite})

        storage, cache_chain = get_storage_and_cache_chain(
            path=path,
            read_only=read_only,
            creds=creds,
            token=token,
            memory_cache_size=memory_cache_size,
            local_cache_size=local_cache_size,
        )

        if not dataset_exists(storage):
            raise DatasetHandlerError(
                f"A Hub dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use hub.empty()."
            )
        if overwrite:
            storage.clear()

        read_only = storage.read_only
        return get_dataset_instance(
            path, storage=cache_chain, read_only=read_only, public=public, token=token
        )

    @staticmethod
    def delete(path: str, force: bool = False, large_ok: bool = False) -> None:
        """Deletes a dataset at a given path.
        This is an IRREVERSIBLE operation. Data once deleted can not be recovered.

        Args:
            path (str): The path to the dataset to be deleted.
            force (bool): Delete data regardless of whether
                it looks like a hub dataset. All data at the path will be removed.
            large_ok (bool): Delete datasets larger than 1GB. Disabled by default.
        """

        feature_report_path(path, "delete", {"Force": force, "Large_OK": large_ok})

        try:
            ds = hub.load(path)
            ds.delete(large_ok=large_ok)
        except:
            if force:
                base_storage = storage_provider_from_path(
                    path, creds={}, read_only=False, token=None
                )
                base_storage.clear()
            else:
                raise

    @staticmethod
    def like(
        path: str,
        source: Union[str, Dataset],
        creds: dict = None,
        overwrite: bool = False,
    ) -> Dataset:
        """Copies the `source` dataset's structure to a new location. No samples are copied, only the meta/info for the dataset and it's tensors.

        Args:
            path (str): Path where the new dataset will be created.
            source (Union[str, Dataset]): Path or dataset object that will be used as the template for the new dataset.
            creds (dict): Credentials that will be used to create the new dataset.
            overwrite (bool): If True and a dataset exists at `destination`, it will be overwritten. Defaults to False.

        Returns:
            Dataset: New dataset object.
        """

        feature_report_path(path, "like", {"Overwrite": overwrite})

        destination_ds = dataset.empty(path, creds=creds, overwrite=overwrite)
        source_ds = source
        if isinstance(source, str):
            source_ds = dataset.load(source)

        for tensor_name in source_ds.version_state["meta"].tensors:  # type: ignore
            destination_ds.create_tensor_like(tensor_name, source_ds[tensor_name])

        destination_ds.info.update(source_ds.info.__getstate__())  # type: ignore

        return destination_ds

    @staticmethod
    def ingest(
        src: str,
        dest: str,
        images_compression: str = "auto",
        dest_creds: dict = None,
        progress_bar: bool = True,
        summary: bool = True,
        **dataset_kwargs,
    ) -> Dataset:
        """Ingests a dataset from a source and stores it as a structured dataset to destination

        Note:
            - Currently only local source paths and image classification datasets are supported for automatic ingestion.
            - Supported filetypes: png/jpeg/jpg.
            - All files and sub-directories with unsupported filetypes are ignored.
            - Valid source directory structures look like:

            ```
                data/
                    img0.jpg
                    img1.jpg
                    ...

            ```
            or
            ```
                data/
                    class0/
                        cat0.jpg
                        ...
                    class1/
                        dog0.jpg
                        ...
                    ...

            ```
            or
            ```
                data/
                    train/
                        class0/
                            img0.jpg
                            ...
                        ...
                    val/
                        class0/
                            img0.jpg
                            ...
                        ...
                    ...
            ```

            - Classes defined as sub-directories can be accessed at `ds["test/labels"].info.class_names`.
            - Train and test sub-directories are supported and ingested as ds["train/images"], ds["train/labels"] and ds["test/images"], ds["test/labels"].
            - Mapping filenames to classes from an external file is currently not supported.

        Args:
            src (str): Local path to where the unstructured dataset is stored.
            dest (str): Destination path where the structured dataset will be stored. Can be:-
                - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
                - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
                - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
            images_compression (str): For image classification datasets, this compression will be used for the `images` tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
            dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
            progress_bar (bool): Enables or disables ingestion progress bar. Defaults to True.
            summary (bool): If True, a summary of skipped files will be printed after completion. Defaults to True.
            **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.

        Returns:
            Dataset: New dataset object with structured dataset.

        Raises:
            InvalidPathException: If the source directory does not exist.
            SamePathException: If the source and destination path are the same.
            AutoCompressionError: If the source directory is empty or does not contain a valid extension.
            InvalidFileExtension: If the most frequent file extension is found to be 'None' during auto-compression.
        """

        feature_report_path(
            dest,
            "ingest",
            {
                "Images_Compression": images_compression,
                "Progress_Bar": progress_bar,
                "Summary": summary,
            },
        )
        if not os.path.isdir(src):
            raise InvalidPathException(src)

        if os.path.isdir(dest) and os.path.samefile(src, dest):
            raise SamePathException(src)

        if images_compression == "auto":
            images_compression = get_most_common_extension(src)
            if images_compression is None:
                raise InvalidFileExtension(src)

        ds = hub.dataset(dest, creds=dest_creds, **dataset_kwargs)

        # TODO: support more than just image classification (and update docstring)
        unstructured = ImageClassification(source=src)

        # TODO: auto detect compression
        unstructured.structure(
            ds,  # type: ignore
            use_progress_bar=progress_bar,
            generate_summary=summary,
            image_tensor_args={"sample_compression": images_compression},
        )

        return ds  # type: ignore

    @staticmethod
    def ingest_kaggle(
        tag: str,
        src: str,
        dest: str,
        exist_ok: bool = False,
        images_compression: str = "auto",
        dest_creds: dict = None,
        kaggle_credentials: dict = None,
        progress_bar: bool = True,
        summary: bool = True,
        **dataset_kwargs,
    ) -> Dataset:
        """Download and ingest a kaggle dataset and store it as a structured dataset to destination

        Note:
            Currently only local source paths and image classification datasets are supported for automatic ingestion.

        Args:
            tag (str): Kaggle dataset tag. Example: `"coloradokb/dandelionimages"` points to https://www.kaggle.com/coloradokb/dandelionimages
            src (str): Local path where the raw kaggle dataset will be downloaded.
            dest (str): Destination path where the structured dataset will be stored. Can be:
                - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
                - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
                - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
                - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
            exist_ok (bool): If the kaggle dataset was already downloaded and `exist_ok` is True, ingestion will proceed without error.
            images_compression (str): For image classification datasets, this compression will be used for the `images` tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
            dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
            kaggle_credentials (dict): A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If None, environment variables/the kaggle.json file will be used if available.
            progress_bar (bool): Enables or disables ingestion progress bar. Set to true by default.
            summary (bool): Generates ingestion summary. Set to true by default.
            **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.

        Returns:
            Dataset: New dataset object with structured dataset.

        Raises:
            SamePathException: If the source and destination path are the same.
        """

        feature_report_path(
            dest,
            "ingest_kaggle",
            {
                "Images_Compression": images_compression,
                "Exist_Ok": exist_ok,
                "Progress_Bar": progress_bar,
                "Summary": summary,
            },
        )

        if os.path.isdir(src) and os.path.isdir(dest):
            if os.path.samefile(src, dest):
                raise SamePathException(src)

        download_kaggle_dataset(
            tag,
            local_path=src,
            kaggle_credentials=kaggle_credentials,
            exist_ok=exist_ok,
        )

        ds = hub.ingest(
            src=src,
            dest=dest,
            images_compression=images_compression,
            dest_creds=dest_creds,
            progress_bar=progress_bar,
            summary=summary,
            **dataset_kwargs,
        )

        return ds

    @staticmethod
    @hub_reporter.record_call
    def list(
        workspace: str = "",
        token: Optional[str] = None,
    ) -> None:
        """List all available hub cloud datasets.

        Args:
            workspace (str): Specify user/organization name. If not given,
                returns a list of all datasets that can be accessed, regardless of what workspace they are in.
                Otherwise, lists all datasets in the given workspace.
            token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

        Returns:
            List of dataset names.
        """
        client = HubBackendClient(token=token)
        datasets = client.get_datasets(workspace=workspace)
        return datasets

Static methods

def delete(path, force=False, large_ok=False)

Deletes a dataset at a given path. This is an IRREVERSIBLE operation. Data once deleted can not be recovered.

Args

path : str
The path to the dataset to be deleted.
force : bool
Delete data regardless of whether it looks like a hub dataset. All data at the path will be removed.
large_ok : bool
Delete datasets larger than 1GB. Disabled by default.
Expand source code
@staticmethod
def delete(path: str, force: bool = False, large_ok: bool = False) -> None:
    """Deletes a dataset at a given path.
    This is an IRREVERSIBLE operation. Data once deleted can not be recovered.

    Args:
        path (str): The path to the dataset to be deleted.
        force (bool): Delete data regardless of whether
            it looks like a hub dataset. All data at the path will be removed.
        large_ok (bool): Delete datasets larger than 1GB. Disabled by default.
    """

    feature_report_path(path, "delete", {"Force": force, "Large_OK": large_ok})

    try:
        ds = hub.load(path)
        ds.delete(large_ok=large_ok)
    except:
        if force:
            base_storage = storage_provider_from_path(
                path, creds={}, read_only=False, token=None
            )
            base_storage.clear()
        else:
            raise
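
A minimal usage sketch, assuming `hub.dataset` refers to the class documented on this page; the path is purely illustrative:

    import hub

    # Permanently remove a local dataset. force=True clears the path even if it
    # does not look like a hub dataset; large_ok=True is needed for datasets > 1GB.
    hub.dataset.delete("./path/to/old_dataset", force=False, large_ok=False)
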
def empty(path, overwrite=False, public=True, memory_cache_size=256, local_cache_size=0, creds=None, token=None)

Creates an empty dataset

Important

Using overwrite will delete all of your data if it exists! Be very careful when setting this parameter.

Args

path : str
The full path to the dataset. Can be:
- a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
- an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
- a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
overwrite : bool
WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
public : bool, optional
Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
memory_cache_size : int
The size of the memory cache to be used in MB.
local_cache_size : int
The size of the local filesystem cache to be used in MB.
creds : dict, optional
A dictionary containing credentials used to access the dataset at the path. This takes precedence over credentials present in the environment. Currently only works with s3 paths. It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

Returns

Dataset object created using the arguments provided.

Raises

DatasetHandlerError
If a Dataset already exists at the given path and overwrite is False.
Expand source code
@staticmethod
def empty(
    path: str,
    overwrite: bool = False,
    public: Optional[bool] = True,
    memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
    local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
    creds: Optional[dict] = None,
    token: Optional[str] = None,
) -> Dataset:
    """Creates an empty dataset

    Important:
        Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

    Args:
        path (str): The full path to the dataset. Can be:-
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
        public (bool, optional): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
        memory_cache_size (int): The size of the memory cache to be used in MB.
        local_cache_size (int): The size of the local filesystem cache to be used in MB.
        creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
            This takes precedence over credentials present in the environment. Currently only works with s3 paths.
            It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
        token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

    Returns:
        Dataset object created using the arguments provided.

    Raises:
        DatasetHandlerError: If a Dataset already exists at the given path and overwrite is False.
    """
    if creds is None:
        creds = {}

    feature_report_path(path, "empty", {"Overwrite": overwrite})

    storage, cache_chain = get_storage_and_cache_chain(
        path=path,
        read_only=False,
        creds=creds,
        token=token,
        memory_cache_size=memory_cache_size,
        local_cache_size=local_cache_size,
    )

    if overwrite and dataset_exists(storage):
        storage.clear()
    elif dataset_exists(storage):
        raise DatasetHandlerError(
            f"A dataset already exists at the given path ({path}). If you want to create a new empty dataset, either specify another path or use overwrite=True. If you want to load the dataset that exists at this path, use hub.load() instead."
        )

    read_only = storage.read_only
    return get_dataset_instance(
        path, storage=cache_chain, read_only=read_only, public=public, token=token
    )
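
A minimal sketch of creating an empty dataset; the local path is illustrative:

    import hub

    # Create a new, empty dataset on the local filesystem.
    ds = hub.empty("./my_new_dataset")

    # Re-running with overwrite=True would wipe any data already stored at the path:
    # ds = hub.empty("./my_new_dataset", overwrite=True)
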
def ingest(src, dest, images_compression='auto', dest_creds=None, progress_bar=True, summary=True, **dataset_kwargs)

Ingests a dataset from a source and stores it as a structured dataset at the destination

Note

  • Currently only local source paths and image classification datasets are supported for automatic ingestion.
  • Supported filetypes: png/jpeg/jpg.
  • All files and sub-directories with unsupported filetypes are ignored.
  • Valid source directory structures look like:
    data/
        img0.jpg
        img1.jpg
        ...

or

    data/
        class0/
            cat0.jpg
            ...
        class1/
            dog0.jpg
            ...
        ...

or

    data/
        train/
            class0/
                img0.jpg
                ...
            ...
        val/
            class0/
                img0.jpg
                ...
            ...
        ...
  • Classes defined as sub-directories can be accessed at ds["test/labels"].info.class_names.
  • Train and test sub-directories are supported and ingested as ds["train/images"], ds["train/labels"] and ds["test/images"], ds["test/labels"].
  • Mapping filenames to classes from an external file is currently not supported.

Args

src : str
Local path to where the unstructured dataset is stored.
dest : str
Destination path where the structured dataset will be stored. Can be:
- a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
- an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
- a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
images_compression : str
For image classification datasets, this compression will be used for the images tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
dest_creds : dict
A dictionary containing credentials used to access the destination path of the dataset.
progress_bar : bool
Enables or disables ingestion progress bar. Defaults to True.
summary : bool
If True, a summary of skipped files will be printed after completion. Defaults to True.
**dataset_kwargs
Any arguments passed here will be forwarded to the dataset creator function.

Returns

Dataset
New dataset object with structured dataset.

Raises

InvalidPathException
If the source directory does not exist.
SamePathException
If the source and destination path are the same.
AutoCompressionError
If the source directory is empty or does not contain a valid extension.
InvalidFileExtension
If the most frequent file extension is found to be 'None' during auto-compression.
Expand source code
@staticmethod
def ingest(
    src: str,
    dest: str,
    images_compression: str = "auto",
    dest_creds: dict = None,
    progress_bar: bool = True,
    summary: bool = True,
    **dataset_kwargs,
) -> Dataset:
    """Ingests a dataset from a source and stores it as a structured dataset to destination

    Note:
        - Currently only local source paths and image classification datasets are supported for automatic ingestion.
        - Supported filetypes: png/jpeg/jpg.
        - All files and sub-directories with unsupported filetypes are ignored.
        - Valid source directory structures look like:

        ```
            data/
                img0.jpg
                img1.jpg
                ...

        ```
        or
        ```
            data/
                class0/
                    cat0.jpg
                    ...
                class1/
                    dog0.jpg
                    ...
                ...

        ```
        or
        ```
            data/
                train/
                    class0/
                        img0.jpg
                        ...
                    ...
                val/
                    class0/
                        img0.jpg
                        ...
                    ...
                ...
        ```

        - Classes defined as sub-directories can be accessed at `ds["test/labels"].info.class_names`.
        - Train and test sub-directories are supported and ingested as ds["train/images"], ds["train/labels"] and ds["test/images"], ds["test/labels"].
        - Mapping filenames to classes from an external file is currently not supported.

    Args:
        src (str): Local path to where the unstructured dataset is stored.
        dest (str): Destination path where the structured dataset will be stored. Can be:-
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        images_compression (str): For image classification datasets, this compression will be used for the `images` tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
        dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
        progress_bar (bool): Enables or disables ingestion progress bar. Defaults to True.
        summary (bool): If True, a summary of skipped files will be printed after completion. Defaults to True.
        **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.

    Returns:
        Dataset: New dataset object with structured dataset.

    Raises:
        InvalidPathException: If the source directory does not exist.
        SamePathException: If the source and destination path are the same.
        AutoCompressionError: If the source directory is empty or does not contain a valid extension.
        InvalidFileExtension: If the most frequent file extension is found to be 'None' during auto-compression.
    """

    feature_report_path(
        dest,
        "ingest",
        {
            "Images_Compression": images_compression,
            "Progress_Bar": progress_bar,
            "Summary": summary,
        },
    )
    if not os.path.isdir(src):
        raise InvalidPathException(src)

    if os.path.isdir(dest) and os.path.samefile(src, dest):
        raise SamePathException(src)

    if images_compression == "auto":
        images_compression = get_most_common_extension(src)
        if images_compression is None:
            raise InvalidFileExtension(src)

    ds = hub.dataset(dest, creds=dest_creds, **dataset_kwargs)

    # TODO: support more than just image classification (and update docstring)
    unstructured = ImageClassification(source=src)

    # TODO: auto detect compression
    unstructured.structure(
        ds,  # type: ignore
        use_progress_bar=progress_bar,
        generate_summary=summary,
        image_tensor_args={"sample_compression": images_compression},
    )

    return ds  # type: ignore
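
A minimal sketch of ingesting a local image-classification directory; the paths are illustrative, and `src` is assumed to follow one of the layouts shown above:

    import hub

    ds = hub.ingest(
        src="./animals",            # e.g. animals/cats/*.jpg, animals/dogs/*.jpg
        dest="./animals_hub",       # any supported destination path
        images_compression="auto",  # inferred from the most common file extension
    )
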
def ingest_kaggle(tag, src, dest, exist_ok=False, images_compression='auto', dest_creds=None, kaggle_credentials=None, progress_bar=True, summary=True, **dataset_kwargs)

Download and ingest a kaggle dataset and store it as a structured dataset at the destination

Note

Currently only local source paths and image classification datasets are supported for automatic ingestion.

Args

tag : str
Kaggle dataset tag. Example: "coloradokb/dandelionimages" points to https://www.kaggle.com/coloradokb/dandelionimages
src : str
Local path where the raw kaggle dataset will be downloaded.
dest : str
Destination path where the structured dataset will be stored. Can be:
- a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
- an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
- a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
exist_ok : bool
If the kaggle dataset was already downloaded and exist_ok is True, ingestion will proceed without error.
images_compression : str
For image classification datasets, this compression will be used for the images tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
dest_creds : dict
A dictionary containing credentials used to access the destination path of the dataset.
kaggle_credentials : dict
A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If None, environment variables/the kaggle.json file will be used if available.
progress_bar : bool
Enables or disables ingestion progress bar. Set to true by default.
summary : bool
Generates ingestion summary. Set to true by default.
**dataset_kwargs
Any arguments passed here will be forwarded to the dataset creator function.

Returns

Dataset
New dataset object with structured dataset.

Raises

SamePathException
If the source and destination path are the same.
Expand source code
@staticmethod
def ingest_kaggle(
    tag: str,
    src: str,
    dest: str,
    exist_ok: bool = False,
    images_compression: str = "auto",
    dest_creds: dict = None,
    kaggle_credentials: dict = None,
    progress_bar: bool = True,
    summary: bool = True,
    **dataset_kwargs,
) -> Dataset:
    """Download and ingest a kaggle dataset and store it as a structured dataset to destination

    Note:
        Currently only local source paths and image classification datasets are supported for automatic ingestion.

    Args:
        tag (str): Kaggle dataset tag. Example: `"coloradokb/dandelionimages"` points to https://www.kaggle.com/coloradokb/dandelionimages
        src (str): Local path where the raw kaggle dataset will be downloaded.
        dest (str): Destination path where the structured dataset will be stored. Can be:
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        exist_ok (bool): If the kaggle dataset was already downloaded and `exist_ok` is True, ingestion will proceed without error.
        images_compression (str): For image classification datasets, this compression will be used for the `images` tensor. If images_compression is "auto", compression will be automatically determined by the most common extension in the directory.
        dest_creds (dict): A dictionary containing credentials used to access the destination path of the dataset.
        kaggle_credentials (dict): A dictionary containing kaggle credentials {"username":"YOUR_USERNAME", "key": "YOUR_KEY"}. If None, environment variables/the kaggle.json file will be used if available.
        progress_bar (bool): Enables or disables ingestion progress bar. Set to true by default.
        summary (bool): Generates ingestion summary. Set to true by default.
        **dataset_kwargs: Any arguments passed here will be forwarded to the dataset creator function.

    Returns:
        Dataset: New dataset object with structured dataset.

    Raises:
        SamePathException: If the source and destination path are the same.
    """

    feature_report_path(
        dest,
        "ingest_kaggle",
        {
            "Images_Compression": images_compression,
            "Exist_Ok": exist_ok,
            "Progress_Bar": progress_bar,
            "Summary": summary,
        },
    )

    if os.path.isdir(src) and os.path.isdir(dest):
        if os.path.samefile(src, dest):
            raise SamePathException(src)

    download_kaggle_dataset(
        tag,
        local_path=src,
        kaggle_credentials=kaggle_credentials,
        exist_ok=exist_ok,
    )

    ds = hub.ingest(
        src=src,
        dest=dest,
        images_compression=images_compression,
        dest_creds=dest_creds,
        progress_bar=progress_bar,
        summary=summary,
        **dataset_kwargs,
    )

    return ds
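
A minimal sketch using the Kaggle tag from the docstring; the local paths are illustrative, and Kaggle credentials are assumed to be available via kaggle.json or environment variables:

    import hub

    ds = hub.ingest_kaggle(
        tag="coloradokb/dandelionimages",
        src="./kaggle_downloads/dandelionimages",  # where the raw Kaggle files land
        dest="./dandelion_hub",                    # where the structured dataset is written
        exist_ok=True,                             # reuse an existing download if present
    )
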
def like(path, source, creds=None, overwrite=False)

Copies the source dataset's structure to a new location. No samples are copied, only the meta/info for the dataset and its tensors.

Args

path : str
Path where the new dataset will be created.
source : Union[str, Dataset]
Path or dataset object that will be used as the template for the new dataset.
creds : dict
Credentials that will be used to create the new dataset.
overwrite : bool
If True and a dataset exists at destination, it will be overwritten. Defaults to False.

Returns

Dataset
New dataset object.
Expand source code
@staticmethod
def like(
    path: str,
    source: Union[str, Dataset],
    creds: dict = None,
    overwrite: bool = False,
) -> Dataset:
    """Copies the `source` dataset's structure to a new location. No samples are copied, only the meta/info for the dataset and it's tensors.

    Args:
        path (str): Path where the new dataset will be created.
        source (Union[str, Dataset]): Path or dataset object that will be used as the template for the new dataset.
        creds (dict): Credentials that will be used to create the new dataset.
        overwrite (bool): If True and a dataset exists at `destination`, it will be overwritten. Defaults to False.

    Returns:
        Dataset: New dataset object.
    """

    feature_report_path(path, "like", {"Overwrite": overwrite})

    destination_ds = dataset.empty(path, creds=creds, overwrite=overwrite)
    source_ds = source
    if isinstance(source, str):
        source_ds = dataset.load(source)

    for tensor_name in source_ds.version_state["meta"].tensors:  # type: ignore
        destination_ds.create_tensor_like(tensor_name, source_ds[tensor_name])

    destination_ds.info.update(source_ds.info.__getstate__())  # type: ignore

    return destination_ds
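
A minimal sketch; the paths are illustrative. Only the tensor structure and meta/info are copied, never the samples:

    import hub

    # Create an empty dataset at "./structure_copy" mirroring the source's tensors.
    new_ds = hub.like("./structure_copy", source="./existing_dataset")
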
def list(workspace='', token=None)

List all available hub cloud datasets.

Args

workspace : str
Specify user/organization name. If not given, returns a list of all datasets that can be accessed, regardless of what workspace they are in. Otherwise, lists all datasets in the given workspace.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

Returns

List of dataset names.

Expand source code
@staticmethod
@hub_reporter.record_call
def list(
    workspace: str = "",
    token: Optional[str] = None,
) -> None:
    """List all available hub cloud datasets.

    Args:
        workspace (str): Specify user/organization name. If not given,
            returns a list of all datasets that can be accessed, regardless of what workspace they are in.
            Otherwise, lists all datasets in the given workspace.
        token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

    Returns:
        List of dataset names.
    """
    client = HubBackendClient(token=token)
    datasets = client.get_datasets(workspace=workspace)
    return datasets
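
A minimal sketch; the workspace name is illustrative:

    import hub

    # All Hub cloud datasets the current credentials can access.
    all_names = hub.list()

    # Only the datasets under a specific user/organization.
    workspace_names = hub.list("some_workspace")
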
def load(path, read_only=False, overwrite=False, public=True, memory_cache_size=256, local_cache_size=0, creds=None, token=None)

Loads an existing dataset

Important

Using overwrite will delete all of your data if it exists! Be very careful when setting this parameter.

Args

path : str
The full path to the dataset. Can be:
- a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
- an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
- a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
- a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
read_only : bool
Opens dataset in read-only mode if this is passed as True. Defaults to False. Datasets stored on Hub cloud that your account does not have write access to will automatically open in read-only mode.
overwrite : bool
WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
public : bool, optional
Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
memory_cache_size : int
The size of the memory cache to be used in MB.
local_cache_size : int
The size of the local filesystem cache to be used in MB.
creds : dict, optional
A dictionary containing credentials used to access the dataset at the path. This takes precedence over credentials present in the environment. Currently only works with s3 paths. It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
token : str, optional
Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

Returns

Dataset object created using the arguments provided.

Raises

DatasetHandlerError
If a Dataset does not exist at the given path.
Expand source code
@staticmethod
def load(
    path: str,
    read_only: bool = False,
    overwrite: bool = False,
    public: Optional[bool] = True,
    memory_cache_size: int = DEFAULT_MEMORY_CACHE_SIZE,
    local_cache_size: int = DEFAULT_LOCAL_CACHE_SIZE,
    creds: Optional[dict] = None,
    token: Optional[str] = None,
) -> Dataset:
    """Loads an existing dataset

    Important:
        Using `overwrite` will delete all of your data if it exists! Be very careful when setting this parameter.

    Args:
        path (str): The full path to the dataset. Can be:-
            - a Hub cloud path of the form hub://username/datasetname. To write to Hub cloud datasets, ensure that you are logged in to Hub (use 'activeloop login' from command line)
            - an s3 path of the form s3://bucketname/path/to/dataset. Credentials are required in either the environment or passed to the creds argument.
            - a local file system path of the form ./path/to/dataset or ~/path/to/dataset or path/to/dataset.
            - a memory path of the form mem://path/to/dataset which doesn't save the dataset but keeps it in memory instead. Should be used only for testing as it does not persist.
        read_only (bool): Opens dataset in read-only mode if this is passed as True. Defaults to False.
            Datasets stored on Hub cloud that your account does not have write access to will automatically open in read-only mode.
        overwrite (bool): WARNING: If set to True this overwrites the dataset if it already exists. This can NOT be undone! Defaults to False.
        public (bool, optional): Defines if the dataset will have public access. Applicable only if Hub cloud storage is used and a new Dataset is being created. Defaults to True.
        memory_cache_size (int): The size of the memory cache to be used in MB.
        local_cache_size (int): The size of the local filesystem cache to be used in MB.
        creds (dict, optional): A dictionary containing credentials used to access the dataset at the path.
            This takes precedence over credentials present in the environment. Currently only works with s3 paths.
            It supports 'aws_access_key_id', 'aws_secret_access_key', 'aws_session_token', 'endpoint_url' and 'region' as keys.
        token (str, optional): Activeloop token, used for fetching credentials for Hub datasets. This is optional, tokens are normally autogenerated.

    Returns:
        Dataset object created using the arguments provided.

    Raises:
        DatasetHandlerError: If a Dataset does not exist at the given path.
    """
    if creds is None:
        creds = {}

    feature_report_path(path, "load", {"Overwrite": overwrite})

    storage, cache_chain = get_storage_and_cache_chain(
        path=path,
        read_only=read_only,
        creds=creds,
        token=token,
        memory_cache_size=memory_cache_size,
        local_cache_size=local_cache_size,
    )

    if not dataset_exists(storage):
        raise DatasetHandlerError(
            f"A Hub dataset does not exist at the given path ({path}). Check the path provided or in case you want to create a new dataset, use hub.empty()."
        )
    if overwrite:
        storage.clear()

    read_only = storage.read_only
    return get_dataset_instance(
        path, storage=cache_chain, read_only=read_only, public=public, token=token
    )
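
A minimal sketch of loading existing datasets; the paths and credential values are illustrative:

    import hub

    # Open a local dataset without allowing writes.
    ds = hub.load("./path/to/dataset", read_only=True)

    # An s3 dataset with explicit credentials (keys as listed under the creds argument):
    # ds = hub.load(
    #     "s3://bucketname/path/to/dataset",
    #     creds={"aws_access_key_id": "...", "aws_secret_access_key": "..."},
    # )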