"""Module to get resources on a SLURM cluster.

References
----------
https://github.com/deepsense-ai/tensorflow_on_slurm
"""
import hostlist
import os
from deepmd.cluster import local
from typing import List, Tuple, Optional
# Names exported by ``from <this module> import *``.
__all__ = ["get_resource"]
def get_resource() -> Tuple[str, List[str], Optional[List[int]]]:
    """Get SLURM resources: nodename, nodelist, and gpus.

    The node list is expanded from ``SLURM_JOB_NODELIST``, the current node
    name is read from ``SLURMD_NODENAME``, and both are cross-checked against
    ``SLURM_JOB_NUM_NODES`` before the GPU list is queried through
    ``local.get_gpus()``.

    Returns
    -------
    Tuple[str, List[str], Optional[List[int]]]
        nodename, nodelist, and gpus

    Raises
    ------
    KeyError
        if ``SLURM_JOB_NODELIST`` or ``SLURMD_NODENAME`` is not set
    RuntimeError
        if number of nodes could not be retrieved
    ValueError
        list of nodes is not of the same length as number of nodes
    ValueError
        if current nodename is not found in node list
    """
    # Mandatory variables: a plain mapping lookup lets a missing one surface
    # as KeyError straight away.
    nodelist = hostlist.expand_hostlist(os.environ["SLURM_JOB_NODELIST"])
    nodename = os.environ["SLURMD_NODENAME"]

    # An unset *or empty* value both count as "could not be retrieved".
    num_nodes_env = os.getenv("SLURM_JOB_NUM_NODES")
    if not num_nodes_env:
        raise RuntimeError("Could not get SLURM number of nodes")
    num_nodes = int(num_nodes_env)

    # Sanity checks: the expanded list must agree with the advertised count,
    # and the node we are running on must be part of it.
    if len(nodelist) != num_nodes:
        raise ValueError(
            f"Number of slurm nodes {len(nodelist)} not equal to {num_nodes}"
        )
    if nodename not in nodelist:
        raise ValueError(
            f"Nodename({nodename}) not in nodelist({nodelist}). This should not happen!"
        )

    gpus = local.get_gpus()
    return nodename, nodelist, gpus