import os
from abc import ABC, abstractmethod
from typing import List
from pathlib import Path
from functools import lru_cache
import numpy as np
import h5py
from wcmatch.glob import globfilter
class DPPath(ABC):
    """The path class to data system (DeepmdData).

    Instantiating ``DPPath`` itself dispatches to a concrete subclass:
    an existing directory yields a ``DPOSPath``; otherwise, if the part of
    ``path`` before the first ``#`` is an existing file, a ``DPH5Path`` is
    returned.  Instantiating a subclass directly skips this dispatch.

    Parameters
    ----------
    path : str
        path

    Raises
    ------
    FileNotFoundError
        if ``DPPath`` is instantiated directly and ``path`` is neither an
        existing directory nor (before the first ``#``) an existing file
    """

    def __new__(cls, path: str):
        # Subclasses are constructed as-is; only the abstract base dispatches.
        if cls is not DPPath:
            return super().__new__(cls)
        if os.path.isdir(path):
            return super().__new__(DPOSPath)
        # os.fspath lets path-like objects (e.g. pathlib.Path) reach the
        # existence check instead of failing on a missing ``split`` method.
        if os.path.isfile(os.fspath(path).split("#")[0]):
            # assume h5 if it is not dir
            # TODO: check if it is a real h5? or just check suffix?
            return super().__new__(DPH5Path)
        # 1-tuple keeps the formatting safe whatever type ``path`` has
        raise FileNotFoundError("%s not found" % (path,))

    @abstractmethod
    def load_numpy(self) -> np.ndarray:
        """Load NumPy array.

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """

    @abstractmethod
    def load_txt(self, **kwargs) -> np.ndarray:
        """Load NumPy array from text.

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """

    @abstractmethod
    def glob(self, pattern: str) -> List["DPPath"]:
        """Search path using the glob pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """

    @abstractmethod
    def rglob(self, pattern: str) -> List["DPPath"]:
        """This is like calling :meth:`DPPath.glob()` with `**/` added in front
        of the given relative pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """

    @abstractmethod
    def is_file(self) -> bool:
        """Check if self is file."""

    @abstractmethod
    def is_dir(self) -> bool:
        """Check if self is directory."""

    @abstractmethod
    def __truediv__(self, key: str) -> "DPPath":
        """Used for / operator."""

    @abstractmethod
    def __lt__(self, other: "DPPath") -> bool:
        """whether this DPPath is less than other for sorting"""

    @abstractmethod
    def __str__(self) -> str:
        """Represent string"""

    def __repr__(self) -> str:
        """Return the concrete type followed by the string form in parentheses."""
        return "%s (%s)" % (type(self), str(self))

    def __eq__(self, other) -> bool:
        """Two paths are equal when their string representations are equal."""
        return str(self) == str(other)

    def __hash__(self):
        """Hash of the string representation, consistent with ``__eq__``."""
        return hash(str(self))
class DPOSPath(DPPath):
    """The OS path class to data system (DeepmdData) for real directories.

    The wrapped location is kept as a ``pathlib.Path`` in ``self.path``.

    Parameters
    ----------
    path : str
        path; a ``pathlib.Path`` instance is stored as given, anything else
        is passed to ``pathlib.Path``
    """

    def __init__(self, path: str) -> None:
        super().__init__()
        self.path = path if isinstance(path, Path) else Path(path)

    def load_numpy(self) -> np.ndarray:
        """Load NumPy array.

        Returns
        -------
        np.ndarray
            array read with ``np.load`` from this path
        """
        filename = str(self.path)
        return np.load(filename)

    def load_txt(self, **kwargs) -> np.ndarray:
        """Load NumPy array from text.

        Parameters
        ----------
        **kwargs
            forwarded unchanged to ``np.loadtxt``

        Returns
        -------
        np.ndarray
            array read with ``np.loadtxt`` from this path
        """
        filename = str(self.path)
        return np.loadtxt(filename, **kwargs)

    def glob(self, pattern: str) -> List["DPPath"]:
        """Search path using the glob pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        # currently DPOSPath will only derivative DPOSPath
        # TODO: discuss if we want to mix DPOSPath and DPH5Path?
        wrap = type(self)
        return [wrap(match) for match in self.path.glob(pattern)]

    def rglob(self, pattern: str) -> List["DPPath"]:
        """This is like calling :meth:`DPPath.glob()` with `**/` added in front
        of the given relative pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        wrap = type(self)
        return [wrap(match) for match in self.path.rglob(pattern)]

    def is_file(self) -> bool:
        """Check if self is file."""
        return self.path.is_file()

    def is_dir(self) -> bool:
        """Check if self is directory."""
        return self.path.is_dir()

    def __truediv__(self, key: str) -> "DPPath":
        """Used for / operator."""
        child = self.path / key
        return type(self)(child)

    def __lt__(self, other: "DPOSPath") -> bool:
        """whether this DPPath is less than other for sorting"""
        return self.path < other.path

    def __str__(self) -> str:
        """Represent string"""
        return str(self.path)
class DPH5Path(DPPath):
    """The path class to data system (DeepmdData) for HDF5 files.

    A path has the form ``<hdf5 file>#<name inside the file>``; when the
    ``#`` part is absent the root group ``/`` is used.

    Notes
    -----
    OS - HDF5 relationship:
        directory - Group
        file - Dataset

    Parameters
    ----------
    path : str
        path
    """

    def __init__(self, path: str) -> None:
        super().__init__()
        # we use "#" to split path
        # so we do not support file names containing #...
        # os.fspath lets path-like objects (e.g. pathlib.Path) be split too
        parts = os.fspath(path).split("#")
        self.root_path = parts[0]
        self.root = self._load_h5py(parts[0])
        # h5 path: default is the root path
        self.name = parts[1] if len(parts) > 1 else "/"

    @classmethod
    @lru_cache(None)
    def _load_h5py(cls, path: str) -> h5py.File:
        """Load hdf5 file.

        Parameters
        ----------
        path : str
            path to hdf5 file

        Returns
        -------
        h5py.File
            file handle opened read-only
        """
        # this method has cache to avoid duplicated
        # loading from different DPH5Path
        # However the file will be never closed?
        return h5py.File(path, 'r')

    def load_numpy(self) -> np.ndarray:
        """Load NumPy array.

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """
        return self.root[self.name][:]

    def load_txt(self, dtype: np.dtype = None, **kwargs) -> np.ndarray:
        """Load NumPy array from text.

        The data is read with :meth:`load_numpy`; no text parsing happens.

        Parameters
        ----------
        dtype : np.dtype, optional
            if given, the loaded array is converted with ``astype(dtype)``
        **kwargs
            accepted for signature compatibility and not used

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """
        arr = self.load_numpy()
        # Explicit None check: do not rely on the truthiness of dtype
        # objects to decide whether a conversion was requested.
        if dtype is not None:
            arr = arr.astype(dtype)
        return arr

    def glob(self, pattern: str) -> List["DPPath"]:
        """Search path using the glob pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        # got paths starts with current path first, which is faster
        prefix = self.name
        candidates = [key for key in self._keys if key.startswith(prefix)]
        full_pattern = self._connect_path(pattern)
        wrap = type(self)
        return [
            wrap("%s#%s" % (self.root_path, match))
            for match in globfilter(candidates, full_pattern)
        ]

    def rglob(self, pattern: str) -> List["DPPath"]:
        """This is like calling :meth:`DPPath.glob()` with `**` added in front
        of the given relative pattern.

        NOTE(review): the prefix is ``**`` without a trailing ``/``; confirm
        this gives the intended recursive match with ``globfilter``.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        return self.glob("**" + pattern)

    @property
    def _keys(self) -> List[str]:
        """Walk all groups and dataset"""
        return self._file_keys(self.root)

    @classmethod
    @lru_cache(None)
    def _file_keys(cls, file: h5py.File) -> List[str]:
        """Walk all groups and dataset

        Parameters
        ----------
        file : h5py.File
            opened hdf5 file

        Returns
        -------
        List[str]
            every name visited in the file, each prefixed with ``/``; the
            result is cached per file object
        """
        names = []
        # list.append returns None, so the visit never stops early
        file.visit(lambda name: names.append("/" + name))
        return names

    def is_file(self) -> bool:
        """Check if self is file."""
        if self.name not in self._keys:
            return False
        return isinstance(self.root[self.name], h5py.Dataset)

    def is_dir(self) -> bool:
        """Check if self is directory."""
        # The root group is never listed in _keys (visit only reports
        # members), but it is the top-level "directory" of the file.
        if self.name == "/":
            return True
        if self.name not in self._keys:
            return False
        return isinstance(self.root[self.name], h5py.Group)

    def __truediv__(self, key: str) -> "DPPath":
        """Used for / operator."""
        return type(self)("%s#%s" % (self.root_path, self._connect_path(key)))

    def _connect_path(self, path: str) -> str:
        """Connect self with path"""
        # avoid a doubled separator when the current name already ends in "/"
        separator = "" if self.name.endswith("/") else "/"
        return "%s%s%s" % (self.name, separator, path)

    def __lt__(self, other: "DPH5Path") -> bool:
        """whether this DPPath is less than other for sorting"""
        # order by file first, then by name inside the file
        if self.root_path != other.root_path:
            return self.root_path < other.root_path
        return self.name < other.name

    def __str__(self) -> str:
        """returns path of self"""
        return "%s#%s" % (self.root_path, self.name)