Source code for deepmd.utils.path

# SPDX-License-Identifier: LGPL-3.0-or-later
import os
from abc import (
    ABC,
    abstractmethod,
)
from functools import (
    lru_cache,
)
from pathlib import (
    Path,
)
from typing import (
    ClassVar,
    Dict,
    List,
    Optional,
)

import h5py
import numpy as np
from wcmatch.glob import (
    globfilter,
)



[docs]
class DPPath(ABC):
    """Abstract path into a data system (DeepmdData).

    Instantiating ``DPPath`` itself dispatches to a concrete backend:
    an existing directory yields a ``DPOSPath``; otherwise, when the part
    of ``path`` before the first ``#`` is an existing file, a ``DPH5Path``
    is produced. Instantiating a subclass directly skips the dispatch.

    Parameters
    ----------
    path : str
        path
    mode : str, optional
        mode, by default "r"

    Raises
    ------
    FileNotFoundError
        If ``DPPath`` is instantiated directly and ``path`` is neither an
        existing directory nor (before ``#``) an existing file.
    """

    def __new__(cls, path: str, mode: str = "r"):
        # Subclasses are constructed normally; only the abstract base dispatches.
        if cls is not DPPath:
            return super().__new__(cls)
        if os.path.isdir(path):
            return super().__new__(DPOSPath)
        # os.fspath() lets pathlib.Path input (which DPOSPath accepts) reach
        # the "#" split without raising AttributeError.
        if os.path.isfile(os.fspath(path).split("#")[0]):
            # assume h5 if it is not dir
            return super().__new__(DPH5Path)
        raise FileNotFoundError(f"{path} not found")

    @abstractmethod
    def load_numpy(self) -> np.ndarray:
        """Load NumPy array.

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """

    @abstractmethod
    def load_txt(self, **kwargs) -> np.ndarray:
        """Load NumPy array from text.

        Parameters
        ----------
        **kwargs
            backend-specific loading options

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """

    @abstractmethod
    def save_numpy(self, arr: np.ndarray) -> None:
        """Save NumPy array.

        Parameters
        ----------
        arr : np.ndarray
            NumPy array
        """

    @abstractmethod
    def glob(self, pattern: str) -> List["DPPath"]:
        """Search path using the glob pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """

    @abstractmethod
    def rglob(self, pattern: str) -> List["DPPath"]:
        """This is like calling :meth:`DPPath.glob()` with `**/` added in front
        of the given relative pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """

    @abstractmethod
    def is_file(self) -> bool:
        """Check if self is file."""

    @abstractmethod
    def is_dir(self) -> bool:
        """Check if self is directory."""

    @abstractmethod
    def __truediv__(self, key: str) -> "DPPath":
        """Used for / operator."""

    @abstractmethod
    def __lt__(self, other: "DPPath") -> bool:
        """Whether this DPPath is less than other for sorting."""

    @abstractmethod
    def __str__(self) -> str:
        """Represent string."""

    def __repr__(self) -> str:
        """Return the concrete type followed by the string form in parentheses."""
        return f"{type(self)} ({self!s})"

    def __eq__(self, other) -> bool:
        """Compare by string form; ``other`` may be any object."""
        return str(self) == str(other)

    def __hash__(self):
        """Hash the string form, keeping it consistent with ``__eq__``."""
        return hash(str(self))

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the path."""

    @abstractmethod
    def mkdir(self, parents: bool = False, exist_ok: bool = False) -> None:
        """Make directory.

        Parameters
        ----------
        parents : bool, optional
            If true, any missing parents of this directory are created as well.
        exist_ok : bool, optional
            If true, no error will be raised if the target directory already exists.
        """





[docs]
class DPOSPath(DPPath):
    """Data-system (DeepmdData) path backed by a real filesystem location.

    Parameters
    ----------
    path : str
        path
    mode : str, optional
        mode, by default "r"
    """

    def __init__(self, path: str, mode: str = "r") -> None:
        super().__init__()
        self.mode = mode
        # An existing Path object is stored as-is; anything else is wrapped.
        self.path = path if isinstance(path, Path) else Path(path)

    def load_numpy(self) -> np.ndarray:
        """Read the file at this path with ``np.load``.

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """
        filename = str(self.path)
        return np.load(filename)

    def load_txt(self, **kwargs) -> np.ndarray:
        """Read the file at this path with ``np.loadtxt``.

        Parameters
        ----------
        **kwargs
            forwarded unchanged to ``np.loadtxt``

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """
        filename = str(self.path)
        return np.loadtxt(filename, **kwargs)

    def save_numpy(self, arr: np.ndarray) -> None:
        """Write ``arr`` to this path with ``np.save``.

        Parameters
        ----------
        arr : np.ndarray
            NumPy array

        Raises
        ------
        ValueError
            if the path was created with mode "r"
        """
        if self.mode == "r":
            raise ValueError("Cannot save to read-only path")
        with self.path.open("wb") as stream:
            np.save(stream, arr)

    def glob(self, pattern: str) -> List["DPPath"]:
        """Search below this path using the glob pattern.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        # currently a DPOSPath only ever derives further DPOSPath objects
        found = []
        for hit in self.path.glob(pattern):
            found.append(type(self)(hit, mode=self.mode))
        return found

    def rglob(self, pattern: str) -> List["DPPath"]:
        """Search recursively below this path using the glob pattern.

        Delegates to ``pathlib.Path.rglob`` and wraps every hit in the
        same class and mode as ``self``.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        found = []
        for hit in self.path.rglob(pattern):
            found.append(type(self)(hit, mode=self.mode))
        return found

    def is_file(self) -> bool:
        """Check if self is file."""
        return self.path.is_file()

    def is_dir(self) -> bool:
        """Check if self is directory."""
        return self.path.is_dir()

    def __truediv__(self, key: str) -> "DPPath":
        """Used for / operator; the child keeps this path's mode."""
        child = self.path / key
        return type(self)(child, mode=self.mode)

    def __lt__(self, other: "DPOSPath") -> bool:
        """Whether this DPPath is less than other for sorting."""
        return self.path < other.path

    def __str__(self) -> str:
        """Represent string."""
        return str(self.path)

    @property
    def name(self) -> str:
        """Name of the path."""
        return self.path.name

    def mkdir(self, parents: bool = False, exist_ok: bool = False) -> None:
        """Make directory.

        Parameters
        ----------
        parents : bool, optional
            If true, any missing parents of this directory are created as well.
        exist_ok : bool, optional
            If true, no error will be raised if the target directory already exists.

        Raises
        ------
        ValueError
            if the path was created with mode "r"
        """
        if self.mode == "r":
            raise ValueError("Cannot mkdir to read-only path")
        self.path.mkdir(parents=parents, exist_ok=exist_ok)





[docs]
class DPH5Path(DPPath):
    """The path class to data system (DeepmdData) for HDF5 files.

    A path is written ``<hdf5 file>#<name inside the file>``; without a
    ``#`` part the name defaults to the root ``/``.

    Notes
    -----
    OS - HDF5 relationship:
        directory - Group
        file - Dataset

    The keys of each opened file are walked once and cached. Names created
    afterwards through :meth:`save_numpy` or :meth:`mkdir` are recorded
    separately and consulted alongside the cached keys.

    Parameters
    ----------
    path : str
        path
    mode : str, optional
        mode, by default "r"
    """

    def __init__(self, path: str, mode: str = "r") -> None:
        super().__init__()
        self.mode = mode
        # we use "#" to split path
        # so we do not support file names containing #...
        # os.fspath() additionally accepts os.PathLike input.
        s = os.fspath(path).split("#")
        self.root_path = s[0]
        self.root = self._load_h5py(s[0], mode)
        # h5 path: default is the root path
        self._name = s[1] if len(s) > 1 else "/"

    @classmethod
    @lru_cache(None)
    def _load_h5py(cls, path: str, mode: str = "r") -> h5py.File:
        """Load hdf5 file.

        Parameters
        ----------
        path : str
            path to hdf5 file
        mode : str, optional
            mode, by default 'r'

        Returns
        -------
        h5py.File
            the opened file; cached per (path, mode)
        """
        # this method has cache to avoid duplicated
        # loading from different DPH5Path
        # However the file will be never closed?
        return h5py.File(path, mode)

    def load_numpy(self) -> np.ndarray:
        """Load NumPy array.

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """
        return self.root[self._name][:]

    def load_txt(self, dtype: Optional[np.dtype] = None, **kwargs) -> np.ndarray:
        """Load the stored array, optionally cast to ``dtype``.

        Parameters
        ----------
        dtype : np.dtype, optional
            if given, the loaded array is converted with ``astype``
        **kwargs
            accepted for signature compatibility and ignored

        Returns
        -------
        np.ndarray
            loaded NumPy array
        """
        arr = self.load_numpy()
        # Compare with None instead of relying on truthiness: an explicitly
        # passed np.dtype instance may evaluate falsy (NOTE(review): seen with
        # plain, field-less dtypes on some NumPy versions), which would
        # silently skip the requested conversion.
        if dtype is not None:
            arr = arr.astype(dtype)
        return arr

    def save_numpy(self, arr: np.ndarray) -> None:
        """Save NumPy array, replacing an existing dataset of the same name.

        Parameters
        ----------
        arr : np.ndarray
            NumPy array

        Raises
        ------
        ValueError
            if the path was created with mode "r"
        """
        if self.mode == "r":
            raise ValueError("Cannot save to read-only path")
        already_tracked = self._name in self._new_keys
        # The cached keys do not include names created after the cache was
        # filled, so those must be checked too before creating the dataset.
        if self._name in self._keys or already_tracked:
            del self.root[self._name]
        self.root.create_dataset(self._name, data=arr)
        self.root.flush()
        if not already_tracked:
            self._new_keys.append(self._name)

    def glob(self, pattern: str) -> List["DPPath"]:
        """Search path using the glob pattern.

        Both the cached keys and the names created since the cache was
        filled are searched.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        # got paths starts with current path first, which is faster
        subpaths = [ii for ii in self._all_keys() if ii.startswith(self._name)]
        full_pattern = self._connect_path(pattern)
        return [
            type(self)(f"{self.root_path}#{pp}", mode=self.mode)
            for pp in globfilter(subpaths, full_pattern)
        ]

    def rglob(self, pattern: str) -> List["DPPath"]:
        """Call :meth:`glob` with ``**`` prepended to the given pattern.

        NOTE(review): no ``/`` is inserted between ``**`` and ``pattern``;
        whether this matches nested groups depends on how ``globfilter``
        treats ``**`` - confirm.

        Parameters
        ----------
        pattern : str
            glob pattern

        Returns
        -------
        List[DPPath]
            list of paths
        """
        return self.glob("**" + pattern)

    @property
    def _keys(self) -> List[str]:
        """Walk all groups and dataset."""
        return self._file_keys(self.root)

    # Per-file list of names created after that file's keys were cached.
    __file_new_keys: ClassVar[Dict[h5py.File, List[str]]] = {}

    @property
    def _new_keys(self):
        """New keys that haven't been cached."""
        return self.__file_new_keys.setdefault(self.root, [])

    def _all_keys(self) -> List[str]:
        """Cached keys followed by newly created ones, without duplicates."""
        return list(dict.fromkeys([*self._keys, *self._new_keys]))

    @classmethod
    @lru_cache(None)
    def _file_keys(cls, file: h5py.File) -> List[str]:
        """Walk all groups and dataset.

        The result is cached per file, so names added later do not appear.
        """
        l = []
        file.visit(lambda x: l.append("/" + x))
        return l

    def is_file(self) -> bool:
        """Check if self is file."""
        if self._name not in self._keys and self._name not in self._new_keys:
            return False
        return isinstance(self.root[self._name], h5py.Dataset)

    def is_dir(self) -> bool:
        """Check if self is directory."""
        if self._name == "/":
            return True
        if self._name not in self._keys and self._name not in self._new_keys:
            return False
        return isinstance(self.root[self._name], h5py.Group)

    def __truediv__(self, key: str) -> "DPPath":
        """Used for / operator."""
        return type(self)(f"{self.root_path}#{self._connect_path(key)}", mode=self.mode)

    def _connect_path(self, path: str) -> str:
        """Connect self with path."""
        if self._name.endswith("/"):
            return f"{self._name}{path}"
        return f"{self._name}/{path}"

    def __lt__(self, other: "DPH5Path") -> bool:
        """Whether this DPPath is less than other for sorting."""
        if self.root_path == other.root_path:
            return self._name < other._name
        return self.root_path < other.root_path

    def __str__(self) -> str:
        """Returns path of self."""
        return f"{self.root_path}#{self._name}"

    @property
    def name(self) -> str:
        """Name of the path."""
        return self._name.split("/")[-1]

    def mkdir(self, parents: bool = False, exist_ok: bool = False) -> None:
        """Make directory.

        Parameters
        ----------
        parents : bool, optional
            If true, any missing parents of this directory are created as well.
        exist_ok : bool, optional
            If true, no error will be raised if the target directory already exists.

        Raises
        ------
        FileExistsError
            if the name is already known and ``exist_ok`` is false
        """
        # Names created after the key cache was filled count as existing too.
        if self._name in self._keys or self._name in self._new_keys:
            if not exist_ok:
                raise FileExistsError(f"{self} already exists")
            return
        if parents:
            self.root.require_group(self._name)
        else:
            self.root.create_group(self._name)
        self._new_keys.append(self._name)