Changes between Version 1 and Version 2 of Workshops/cypress/ManyTaskComputing


Ignore:
Timestamp:
12/12/22 14:03:53 (2 years ago)
Author:
fuji
Comment:

Legend:

Unmodified
Added
Removed
Modified
  • Workshops/cypress/ManyTaskComputing

    v1 v2  
    2828date
    2929hostname
     30# $i = 1,2,3,..$SLURM_NTASKS_PER_NODE
    3031for i in $(seq $SLURM_NTASKS_PER_NODE)
    3132do
     33        # $TASK_ID=1,2,...100
    3234        TASK_ID=$((SLURM_ARRAY_TASK_ID + i))
    3335        cust_func $TASK_ID > log${TASK_ID}.out & # Put a function in the background
     
    3638## Put all cust_func in the background and bash
    3739## would wait until those are completed
    38 ## before displaying done message
     40## before displaying 'done' message
    3941wait
    4042echo "done"
     
    4345
    4446== Many MPI jobs in a single job ==
    45 If you have some MPI jobs that must run concurrently, Many-task computing may be the way to go.
     47If you have some MPI jobs that must run concurrently, Many-task computing may be the way to go.
     48
     49The example below requests 6 nodes and puts 3 MPI jobs into a single job.
     50
     51{{{
     52#!/bin/bash
     53#SBATCH --partition=defq        # Partition
     54#SBATCH --qos=normal            # Quality of Service
     55#SBATCH --job-name=PilotJob   # Job Name
     56#SBATCH --time=00:10:00         # WallTime
     57#SBATCH --nodes=6               # Number of Nodes
     58#SBATCH --ntasks-per-node=20    # Number of tasks (MPI processes)
     59#SBATCH --cpus-per-task=1       # Number of processors per task (OpenMP threads)
     60#SBATCH --gres=mic:0            # Number of Co-Processors
     61
     62module load intel-psxe
     63
     64# NUMBER OF SUB-JOBS
     65NUM_OF_SUBJOBS=3
     66
     67# Make hostlist
     68HOSTLIST=${SLURM_JOB_ID}_HOSTS
     69mpirun hostname -s | sort > ${SLURM_JOB_ID}_HOSTS
     70#
     71python jobLauncher.py $NUM_OF_SUBJOBS $HOSTLIST
     72}}}
     73
     74'jobLauncher.py' manages sub-jobs.
     75
     76{{{
     77# -*- coding: utf-8 -*-
     78"""
     79Created on Fri Jan 22 16:33:57 2016
     80
     81@author: fuji
     82"""
     83
     84import sys
     85import os
     86import time
     87import subprocess
     88
     89dirname = os.path.dirname(os.path.abspath(sys.argv[0]))
     90numOfSubJobs = int(sys.argv[1])
     91nodeFile = sys.argv[2]
     92#
     93# Get nodes
     94with open(nodeFile, 'r') as f:
     95    nodes = f.readlines()
     96#
     97numOfNodes = len(nodes)
     98#print "N", numOfNodes, nodes
     99if (numOfNodes < numOfSubJobs) or (numOfSubJobs < 1):
     100    os.abort()
     101#
     102# Divide processors into numOfSubJobs groups
     103numOfNodesSubJob = [0] *  numOfSubJobs
     104for id in range(numOfSubJobs):
     105    numOfNodesSubJob[id] = numOfNodes / numOfSubJobs
     106    if (numOfNodes % numOfSubJobs != 0):
     107        if (id < numOfNodes % numOfSubJobs):
     108            numOfNodesSubJob[id] += 1
     109    #print "n", numOfNodesSubJob[id]
     110#
     111# Allocate Nodes
     112idx = 0
     113nodesSubJob = [None] * numOfSubJobs
     114for id in range(numOfSubJobs):
     115    nodesSubJob[id] = [None] * numOfNodesSubJob[id]
     116    for n in range(numOfNodesSubJob[id]):
     117        nodesSubJob[id][n] = nodes[idx]
     118        idx += 1
     119    #print nodesSubJob[id]
     120#
     121# Create Nodes Files
     122nodeFileName = []
     123for id in range(numOfSubJobs):
     124    nodeFileName.append("%s_%04d.nod" % (nodeFile,id))
     125    with open(nodeFileName[id],'wt') as outp:
     126        for node in nodesSubJob[id]:
     127            outp.write(node)
     128#
     129# Launch SubJobs
     130proc = []
     131for id in range(numOfSubJobs):
     132    commandToLaunchSubJobs = []
     133    commandToLaunchSubJobs.append(dirname + "/launch_subjob.sh")
     134    commandToLaunchSubJobs.append(nodeFileName[id])
     135    #
     136    #print commandToLaunchSubJobs
     137    p = subprocess.Popen(commandToLaunchSubJobs,
     138                         shell=False,
     139                         stdout=subprocess.PIPE,
     140                         stderr=subprocess.PIPE)
     141    proc.append(p)
     142
     143# Wait Until All subjobs done
     144while(True):
     145    runningtasks = 0
     146    for id in range(numOfSubJobs):
     147        if (proc[id].poll() == None):
     148            runningtasks += 1
     149    if runningtasks == 0:
     150        break
     151    time.sleep(5) # Checks every 5 seconds
     152#
     153# Show outputs
     154for id in range(numOfSubJobs):
     155    comm = proc[id].communicate()
     156    sout = comm[0]
     157    serr = comm[1]
     158    #
     159    print "SubJob", id
     160    print sout
     161    print serr
     162}}}
    46163
    47164
     165
     166The 'launch_subjob.sh' script launches a sub-job.
     167
     168
     169{{{
     170#!/bin/bash
     171export HOST_LIST=$1
     172
     173# Run Sub-job
     174mpirun -hostfile $HOST_LIST hostname -s
     175}}}
     176
     177
     178