+59
-2
lines changedFilter options
+59
-2
lines changed Original file line number Diff line number Diff line change
@@ -4,6 +4,7 @@
4
4
PDSH_MAX_FAN_OUT = 1024
5
5
6
6
OPENMPI_LAUNCHER = 'openmpi'
7
+
JSRUN_LAUNCHER = 'jsrun'
7
8
MPICH_LAUNCHER = 'mpich'
8
9
SLURM_LAUNCHER = 'slurm'
9
10
MVAPICH_LAUNCHER = 'mvapich'
Original file line number Diff line number Diff line change
@@ -169,6 +169,60 @@ def get_cmd(self, environment, active_resources):
169
169
return mpirun_cmd + export_cmd + python_exec + [self.user_script
170
170
] + self.user_arguments
171
171
172
+
class JSRunner(MultiNodeRunner):
    """Multi-node runner that assembles a ``jsrun`` launch command.

    The command is built from the resource pool, the exported environment
    variables and the user's script and arguments.
    """

    def __init__(self, args, world_info_base64, resource_pool):
        """Initialise the base runner and remember the resource pool.

        Args:
            args: Parsed launcher arguments; forwarded to the base class.
            world_info_base64: Encoded world info; forwarded to the base class.
            resource_pool: Mapping whose values are summed in ``get_cmd`` to
                obtain the total process count.
        """
        super().__init__(args, world_info_base64)
        self.resource_pool = resource_pool
        # The device list is hard coded for Summit.
        self.add_export('CUDA_VISIBLE_DEVICES', '0,1,2,3,4,5')

    def backend_exists(self):
        """Return the path of ``ompi_info`` on PATH, or ``None`` if absent."""
        # TODO: if IB is available we should suggest mvapich
        # The ompi check still works for jsrun because spectrum-mpi is
        # based on ompi.
        return shutil.which('ompi_info')

    @property
    def name(self):
        """Name of this backend as used in messages."""
        return "jsrun"

    def validate_args(self):
        """Reject launcher options that this backend does not support.

        Raises:
            ValueError: If an include/exclude filter is set, or if the number
                of nodes or GPUs has been limited.
        """
        super().validate_args()
        launcher_args = self.args
        # TODO: Allow for include/exclude at node-level but not gpu-level
        if not (launcher_args.include == "" and launcher_args.exclude == ""):
            raise ValueError(
                f"{self.name} backend does not support worker include/exclusion")
        if not (launcher_args.num_nodes == -1 and launcher_args.num_gpus == -1):
            raise ValueError(
                f"{self.name} backend does not support limiting num nodes/gpus")

    def get_cmd(self, environment, active_resources):
        """Build the full ``jsrun`` command line as a list of strings.

        Args:
            environment: Not used by this implementation.
            active_resources: Not used by this implementation.

        Returns:
            The ``jsrun`` invocation, followed by one ``-E KEY=VALUE`` pair
            per export, the optional Python interpreter prefix, the user
            script and the user arguments.
        """
        process_total = sum(self.resource_pool.values())

        # Fixed layout: -c 7, -g 1, -a 1 (presumably CPUs, GPUs and tasks per
        # resource set -- confirm against the jsrun reference).
        launch_prefix = [
            'jsrun',
            '-n', str(process_total),
            '-c', '7',
            '-g', '1',
            '-a', '1',
        ] + split(self.args.launcher_args)

        # Each exported variable becomes its own "-E KEY=VALUE" pair.
        env_flags = [
            token
            for key, value in self.exports.items()
            for token in ('-E', f"{key}={value}")
        ]

        if self.args.no_python:
            interpreter = []
        else:
            interpreter = [sys.executable, "-u"]
            if self.args.module:
                interpreter.append("-m")

        script_part = [self.user_script]
        return launch_prefix + env_flags + interpreter + script_part + self.user_arguments
172
226
173
227
class MPICHRunner(MultiNodeRunner):
174
228
def __init__(self, args, world_info_base64, resource_pool):
Original file line number Diff line number Diff line change
@@ -18,8 +18,8 @@
18
18
import signal
19
19
import time
20
20
21
-
from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner
22
-
from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER
21
+
from .multinode_runner import PDSHRunner, OpenMPIRunner, MVAPICHRunner, SlurmRunner, MPICHRunner, JSRunner
22
+
from .constants import PDSH_LAUNCHER, OPENMPI_LAUNCHER, MVAPICH_LAUNCHER, SLURM_LAUNCHER, MPICH_LAUNCHER, JSRUN_LAUNCHER
23
23
from ..constants import TORCH_DISTRIBUTED_DEFAULT_PORT
24
24
from ..nebula.constants import NEBULA_EXPORT_ENVS
25
25
from ..utils import logger
@@ -511,6 +511,8 @@ def main(args=None):
511
511
runner = PDSHRunner(args, world_info_base64)
512
512
elif args.launcher == OPENMPI_LAUNCHER:
513
513
runner = OpenMPIRunner(args, world_info_base64, resource_pool)
514
+
elif args.launcher == JSRUN_LAUNCHER:
515
+
runner = JSRunner(args, world_info_base64, resource_pool)
514
516
elif args.launcher == MPICH_LAUNCHER:
515
517
runner = MPICHRunner(args, world_info_base64, resource_pool)
516
518
elif args.launcher == MVAPICH_LAUNCHER:
You can’t perform that action at this time.
RetroSearch is an open source project built by @garambo | Open a GitHub Issue
Search and Browse the WWW like it's 1997 | Search results from DuckDuckGo
HTML:
3.2
| Encoding:
UTF-8
| Version:
0.7.4