If you have some MPI jobs that must run concurrently, many-task computing may be the way to go.

The example below requests 6 nodes and puts 3 MPI jobs into a single job.

{{{
#!/bin/bash
#SBATCH --partition=defq          # Partition
#SBATCH --qos=normal              # Quality of Service
#SBATCH --job-name=PilotJob       # Job Name
#SBATCH --time=00:10:00           # WallTime
#SBATCH --nodes=6                 # Number of Nodes
#SBATCH --ntasks-per-node=20      # Number of tasks (MPI processes)
#SBATCH --cpus-per-task=1         # Number of processors per task (OpenMP threads)
#SBATCH --gres=mic:0              # Number of Co-Processors

module load intel-psxe

# NUMBER OF SUB-JOBS
NUM_OF_SUBJOBS=3

# Make the host list; mpirun prints one line per MPI task
HOSTLIST=${SLURM_JOB_ID}_HOSTS
mpirun hostname -s | sort > $HOSTLIST
#
python jobLauncher.py $NUM_OF_SUBJOBS $HOSTLIST
}}}
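
Note that 'mpirun hostname -s' starts one copy of 'hostname' per MPI task, so the host list holds one line per slot, not one per node; with the settings above, each node name appears 20 times. For illustration (the job ID and node names below are made up):

{{{
$ sort -u 123456_HOSTS        # the six unique node names
node001
node002
node003
node004
node005
node006
$ grep -c node001 123456_HOSTS
20
}}}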

'jobLauncher.py' manages the sub-jobs: it splits the host list into one machine file per sub-job, starts each sub-job through 'launch_subjob.sh', and waits until all of them have finished.

{{{
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 22 16:33:57 2016

@author: fuji
"""
from __future__ import print_function  # works under Python 2 and 3

import sys
import os
import time
import subprocess

dirname = os.path.dirname(os.path.abspath(sys.argv[0]))
numOfSubJobs = int(sys.argv[1])
nodeFile = sys.argv[2]
#
# Get the host list (one line per MPI slot, so node names repeat)
with open(nodeFile, 'r') as f:
    nodes = f.readlines()
#
numOfNodes = len(nodes)
if (numOfNodes < numOfSubJobs) or (numOfSubJobs < 1):
    os.abort()
#
# Divide the slots among the sub-jobs as evenly as possible;
# the first (numOfNodes % numOfSubJobs) sub-jobs get one extra slot
numOfNodesSubJob = [0] * numOfSubJobs
for i in range(numOfSubJobs):
    numOfNodesSubJob[i] = numOfNodes // numOfSubJobs
    if i < numOfNodes % numOfSubJobs:
        numOfNodesSubJob[i] += 1
#
# Allocate slots to each sub-job, in host-list order
idx = 0
nodesSubJob = [None] * numOfSubJobs
for i in range(numOfSubJobs):
    nodesSubJob[i] = nodes[idx:idx + numOfNodesSubJob[i]]
    idx += numOfNodesSubJob[i]
#
# Create one machine file per sub-job
nodeFileName = []
for i in range(numOfSubJobs):
    nodeFileName.append("%s_%04d.nod" % (nodeFile, i))
    with open(nodeFileName[i], 'wt') as outp:
        for node in nodesSubJob[i]:
            outp.write(node)
#
# Launch the sub-jobs: each runs launch_subjob.sh with its machine file
proc = []
for i in range(numOfSubJobs):
    commandToLaunchSubJobs = [dirname + "/launch_subjob.sh", nodeFileName[i]]
    p = subprocess.Popen(commandToLaunchSubJobs,
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE)
    proc.append(p)

# Wait until all sub-jobs are done
while True:
    runningtasks = 0
    for i in range(numOfSubJobs):
        if proc[i].poll() is None:
            runningtasks += 1
    if runningtasks == 0:
        break
    time.sleep(5)  # check every 5 seconds
#
# Show the output of each sub-job
for i in range(numOfSubJobs):
    sout, serr = proc[i].communicate()
    print("SubJob", i)
    print(sout.decode())
    print(serr.decode())
}}}
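
'launch_subjob.sh' is not shown on this page. A minimal sketch of what it might look like is below: it runs one MPI executable over the slots listed in the machine file it receives. The executable name './subjob_app' is a placeholder, not part of the original example.

{{{
#!/bin/bash
# Minimal sketch of launch_subjob.sh (assumed, not the original script).
# $1 is the machine file written by jobLauncher.py; it holds one line per
# MPI slot, so its line count is the number of ranks to start.
NODEFILE=$1
NP=$(wc -l < $NODEFILE)
# './subjob_app' is a placeholder for your MPI executable
mpirun -np $NP -machinefile $NODEFILE ./subjob_app
}}}

The script must be executable ('chmod +x launch_subjob.sh'), since 'jobLauncher.py' runs it directly with shell=False. Submit the whole pilot job with 'sbatch' as usual; the three sub-jobs then run concurrently inside the single allocation.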