# SPDX-License-Identifier: LGPL-3.0-or-later
"""Module to get resources on a SLURM cluster.
References
----------
https://github.com/deepsense-ai/tensorflow_on_slurm
"""
import os
from typing import (
List,
Optional,
Tuple,
)
import hostlist
from deepmd.cluster import (
local,
)
__all__ = ["get_resource"]
def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
    """Get SLURM resources: nodename, nodelist, and gpus.

    The node list is expanded from ``SLURM_JOB_NODELIST``, the current node
    name is read from ``SLURMD_NODENAME``, and the expected node count from
    ``SLURM_JOB_NUM_NODES``. The GPU list is whatever
    ``local.get_gpus()`` returns.

    Returns
    -------
    Tuple[str, List[str], Optional[List[int]]]
        nodename, nodelist, and gpus

    Raises
    ------
    KeyError
        if ``SLURM_JOB_NODELIST`` or ``SLURMD_NODENAME`` is not set
    RuntimeError
        if number of nodes could not be retrieved
    ValueError
        if ``SLURM_JOB_NUM_NODES`` is not an integer
    ValueError
        if list of nodes is not of the same length as number of nodes
    ValueError
        if current nodename is not found in node list
    """
    # Both variables are mandatory: subscripting os.environ raises KeyError
    # when they are missing.
    raw_nodelist = os.environ["SLURM_JOB_NODELIST"]
    nodelist = hostlist.expand_hostlist(raw_nodelist)
    nodename = os.environ["SLURMD_NODENAME"]

    # An unset or empty node count is reported as RuntimeError.
    num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
    if not num_nodes_env:
        raise RuntimeError("Could not get SLURM number of nodes")
    num_nodes = int(num_nodes_env)

    # Sanity checks: the expanded list must agree with the declared node
    # count and must contain the node this process runs on.
    if len(nodelist) != num_nodes:
        raise ValueError(
            f"Number of slurm nodes {len(nodelist)} not equal to {num_nodes}"
        )
    if nodename not in nodelist:
        raise ValueError(
            f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
        )

    gpus = local.get_gpus()
    return nodename, nodelist, gpus