Module hub.core.dataset.view_entry
Expand source code
from typing import Dict, Optional, Any
from hub.client.log import logger
from hub.util.tag import process_hub_path
from hub.util.path import get_org_id_and_ds_name, is_hub_cloud_path
from hub.util.logging import log_visualizer_link
from hub.constants import HUB_CLOUD_DEV_USERNAME
class ViewEntry:
"""Represents a view saved inside a dataset."""
def __init__(self, info: Dict, dataset, external: bool = False):
self.info = info
self._ds = dataset
self._external = external
def __getitem__(self, key: str):
return self.info[key]
def get(self, key: str, default: Optional[Any] = None):
return self.info.get(key, default)
@property
def id(self) -> str:
"""Returns id of the view."""
return self.info["id"].split("]")[-1]
@property
def query(self) -> Optional[str]:
return self.info.get("query")
@property
def message(self) -> str:
"""Returns the message with which the view was saved."""
return self.info.get("message", "")
def __str__(self):
return f"View(id='{self.id}', message='{self.message}', virtual={self.virtual})"
__repr__ = __str__
@property
def virtual(self) -> bool:
return self.info["virtual-datasource"]
def load(self, verbose=True):
"Loads the view and returns the `hub.Dataset`."
ds = self._ds._sub_ds(
".queries/" + (self.info.get("path") or self.info["id"]),
lock=False,
verbose=False,
)
sub_ds_path = ds.path
if self.virtual:
ds = ds._get_view(inherit_creds=not self._external)
ds._view_entry = self
if verbose:
log_visualizer_link(sub_ds_path, source_ds_url=self.info["source-dataset"])
return ds
def optimize(
self, unlink=True, num_workers=0, scheduler="threaded", progressbar=True
):
"""Optimizes the dataset view by copying and rechunking the required data. This is necessary to achieve fast streaming
speeds when training models using the dataset view. The optimization process will take some time, depending on
the size of the data.
Args:
unlink (bool): - If True, this unlinks linked tensors (if any) by copying data from the links to the view.
- This does not apply to linked videos. Set `hub.\0constants._UNLINK_VIDEOS` to `True` to change this behavior.
num_workers (int): Number of workers to be used for the optimization process. Defaults to 0.
scheduler (str): The scheduler to be used for optimization. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
Only applicable if `optimize=True`. Defaults to 'threaded'.
progressbar (bool): Whether to display a progressbar.
Returns:
`hub.core.dataset.view_entry.ViewEntry`
Examples:
```
# save view
ds[:10].save_view(view_id="first_10")
# optimize view
ds.get_view("first_10").optimize()
# load optimized view
ds.load_view("first_10")
```
"""
self.info = self._ds._optimize_saved_view(
self.info["id"],
external=self._external,
unlink=unlink,
num_workers=num_workers,
scheduler=scheduler,
progressbar=progressbar,
)
return self
def delete(self):
"""Deletes the view."""
self._ds.delete_view(id=self.info["id"])
Classes
class ViewEntry (info, dataset, external=False)
-
Represents a view saved inside a dataset.
Expand source code
class ViewEntry: """Represents a view saved inside a dataset.""" def __init__(self, info: Dict, dataset, external: bool = False): self.info = info self._ds = dataset self._external = external def __getitem__(self, key: str): return self.info[key] def get(self, key: str, default: Optional[Any] = None): return self.info.get(key, default) @property def id(self) -> str: """Returns id of the view.""" return self.info["id"].split("]")[-1] @property def query(self) -> Optional[str]: return self.info.get("query") @property def message(self) -> str: """Returns the message with which the view was saved.""" return self.info.get("message", "") def __str__(self): return f"View(id='{self.id}', message='{self.message}', virtual={self.virtual})" __repr__ = __str__ @property def virtual(self) -> bool: return self.info["virtual-datasource"] def load(self, verbose=True): "Loads the view and returns the `hub.Dataset`." ds = self._ds._sub_ds( ".queries/" + (self.info.get("path") or self.info["id"]), lock=False, verbose=False, ) sub_ds_path = ds.path if self.virtual: ds = ds._get_view(inherit_creds=not self._external) ds._view_entry = self if verbose: log_visualizer_link(sub_ds_path, source_ds_url=self.info["source-dataset"]) return ds def optimize( self, unlink=True, num_workers=0, scheduler="threaded", progressbar=True ): """Optimizes the dataset view by copying and rechunking the required data. This is necessary to achieve fast streaming speeds when training models using the dataset view. The optimization process will take some time, depending on the size of the data. Args: unlink (bool): - If True, this unlinks linked tensors (if any) by copying data from the links to the view. - This does not apply to linked videos. Set `hub.\0constants._UNLINK_VIDEOS` to `True` to change this behavior. num_workers (int): Number of workers to be used for the optimization process. Defaults to 0. scheduler (str): The scheduler to be used for optimization. Supported values include: 'serial', 'threaded', 'processed' and 'ray'. Only applicable if `optimize=True`. Defaults to 'threaded'. progressbar (bool): Whether to display a progressbar. Returns: `hub.core.dataset.view_entry.ViewEntry` Examples: ``` # save view ds[:10].save_view(view_id="first_10") # optimize view ds.get_view("first_10").optimize() # load optimized view ds.load_view("first_10") ``` """ self.info = self._ds._optimize_saved_view( self.info["id"], external=self._external, unlink=unlink, num_workers=num_workers, scheduler=scheduler, progressbar=progressbar, ) return self def delete(self): """Deletes the view.""" self._ds.delete_view(id=self.info["id"])
Instance variables
var id
-
Returns id of the view.
Expand source code
@property def id(self) -> str: """Returns id of the view.""" return self.info["id"].split("]")[-1]
var message
-
Returns the message with which the view was saved.
Expand source code
@property def message(self) -> str: """Returns the message with which the view was saved.""" return self.info.get("message", "")
var query
-
Expand source code
@property def query(self) -> Optional[str]: return self.info.get("query")
var virtual
-
Expand source code
@property def virtual(self) -> bool: return self.info["virtual-datasource"]
Methods
def delete(self)
-
Deletes the view.
Expand source code
def delete(self): """Deletes the view.""" self._ds.delete_view(id=self.info["id"])
def get(self, key, default=None)
-
Expand source code
def get(self, key: str, default: Optional[Any] = None): return self.info.get(key, default)
def load(self, verbose=True)
-
Loads the view and returns the
Dataset
.Expand source code
def load(self, verbose=True): "Loads the view and returns the `hub.Dataset`." ds = self._ds._sub_ds( ".queries/" + (self.info.get("path") or self.info["id"]), lock=False, verbose=False, ) sub_ds_path = ds.path if self.virtual: ds = ds._get_view(inherit_creds=not self._external) ds._view_entry = self if verbose: log_visualizer_link(sub_ds_path, source_ds_url=self.info["source-dataset"]) return ds
def optimize(self, unlink=True, num_workers=0, scheduler='threaded', progressbar=True)
-
Optimizes the dataset view by copying and rechunking the required data. This is necessary to achieve fast streaming speeds when training models using the dataset view. The optimization process will take some time, depending on the size of the data.
Args
unlink
:bool
-
- If True, this unlinks linked tensors (if any) by copying data from the links to the view.
- This does not apply to linked videos. Set
hub. constants._UNLINK_VIDEOS
toTrue
to change this behavior.
- This does not apply to linked videos. Set
- If True, this unlinks linked tensors (if any) by copying data from the links to the view.
num_workers
:int
- Number of workers to be used for the optimization process. Defaults to 0.
scheduler
:str
- The scheduler to be used for optimization. Supported values include: 'serial', 'threaded', 'processed' and 'ray'.
Only applicable if
optimize=True
. Defaults to 'threaded'. progressbar
:bool
- Whether to display a progressbar.
Returns
Examples
# save view ds[:10].save_view(view_id="first_10") # optimize view ds.get_view("first_10").optimize() # load optimized view ds.load_view("first_10")
Expand source code
def optimize( self, unlink=True, num_workers=0, scheduler="threaded", progressbar=True ): """Optimizes the dataset view by copying and rechunking the required data. This is necessary to achieve fast streaming speeds when training models using the dataset view. The optimization process will take some time, depending on the size of the data. Args: unlink (bool): - If True, this unlinks linked tensors (if any) by copying data from the links to the view. - This does not apply to linked videos. Set `hub.\0constants._UNLINK_VIDEOS` to `True` to change this behavior. num_workers (int): Number of workers to be used for the optimization process. Defaults to 0. scheduler (str): The scheduler to be used for optimization. Supported values include: 'serial', 'threaded', 'processed' and 'ray'. Only applicable if `optimize=True`. Defaults to 'threaded'. progressbar (bool): Whether to display a progressbar. Returns: `hub.core.dataset.view_entry.ViewEntry` Examples: ``` # save view ds[:10].save_view(view_id="first_10") # optimize view ds.get_view("first_10").optimize() # load optimized view ds.load_view("first_10") ``` """ self.info = self._ds._optimize_saved_view( self.info["id"], external=self._external, unlink=unlink, num_workers=num_workers, scheduler=scheduler, progressbar=progressbar, ) return self