If you have several MPI jobs that must run concurrently, many-task computing may be the way to go.

The example below requests 6 nodes and packs 3 MPI sub-jobs into a single Slurm job.

{{{
#!/bin/bash
#SBATCH --partition=defq            # Partition
#SBATCH --qos=normal                # Quality of Service
#SBATCH --job-name=PilotJob         # Job Name
#SBATCH --time=00:10:00             # WallTime
#SBATCH --nodes=6                   # Number of Nodes
#SBATCH --ntasks-per-node=20        # Number of tasks (MPI processes)
#SBATCH --cpus-per-task=1           # Number of processors per task (OpenMP threads)
#SBATCH --gres=mic:0                # Number of Co-Processors

module load intel-psxe

# NUMBER OF SUB-JOBS
NUM_OF_SUBJOBS=3

# Make host list (one line per MPI task)
HOSTLIST=${SLURM_JOB_ID}_HOSTS
mpirun hostname -s | sort > ${HOSTLIST}
#
python jobLauncher.py $NUM_OF_SUBJOBS $HOSTLIST
}}}
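
Save the batch script to a file (the name 'pilot_job.sh' below is just an example) and submit it as a single Slurm job. Slurm allocates all 6 nodes at once, and the three MPI sub-jobs share that allocation:

{{{
sbatch pilot_job.sh
squeue -u $USER     # the whole pilot job shows up as a single Slurm job
}}}

Because 'mpirun hostname -s' starts one process per task, the ${SLURM_JOB_ID}_HOSTS file ends up with one line per MPI task (6 nodes x 20 tasks = 120 lines in this example), sorted so that entries for the same node are adjacent.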

'jobLauncher.py' manages the sub-jobs: it splits the host list into one machinefile per sub-job, launches each sub-job through 'launch_subjob.sh', and waits until all of them have finished.

{{{
# -*- coding: utf-8 -*-
"""
Created on Fri Jan 22 16:33:57 2016

@author: fuji
"""
from __future__ import print_function

import sys
import os
import time
import subprocess

# Directory containing this script (and launch_subjob.sh)
dirname = os.path.dirname(os.path.abspath(sys.argv[0]))
numOfSubJobs = int(sys.argv[1])    # number of sub-jobs to run
nodeFile = sys.argv[2]             # host list written by the batch script
#
# Get hosts (one line per MPI task)
with open(nodeFile, 'r') as f:
    nodes = f.readlines()
#
numOfNodes = len(nodes)
#print("N", numOfNodes, nodes)
if (numOfNodes < numOfSubJobs) or (numOfSubJobs < 1):
    sys.exit("Invalid number of sub-jobs: " + str(numOfSubJobs))
# Divide the hosts among the sub-jobs (remainder goes to the first sub-jobs)
numOfNodesSubJob = [0] * numOfSubJobs
for id in range(numOfSubJobs):
    numOfNodesSubJob[id] = numOfNodes // numOfSubJobs
    if (numOfNodes % numOfSubJobs != 0):
        if (id < numOfNodes % numOfSubJobs):
            numOfNodesSubJob[id] += 1
    #print("n", numOfNodesSubJob[id])
#
# Allocate nodes
idx = 0
nodesSubJob = [None] * numOfSubJobs
for id in range(numOfSubJobs):
    nodesSubJob[id] = [None] * numOfNodesSubJob[id]
    for n in range(numOfNodesSubJob[id]):
        nodesSubJob[id][n] = nodes[idx]
        idx += 1
    #print(nodesSubJob[id])
#
# Create node files (one machinefile per sub-job)
nodeFileName = []
for id in range(numOfSubJobs):
    nodeFileName.append("%s_%04d.nod" % (nodeFile, id))
    with open(nodeFileName[id], 'wt') as outp:
        for node in nodesSubJob[id]:
            outp.write(node)
#
# Launch sub-jobs: each one runs launch_subjob.sh with its own machinefile
proc = []
for id in range(numOfSubJobs):
    commandToLaunchSubJobs = []
    commandToLaunchSubJobs.append(dirname + "/launch_subjob.sh")
    commandToLaunchSubJobs.append(nodeFileName[id])
    #
    #print(commandToLaunchSubJobs)
    p = subprocess.Popen(commandToLaunchSubJobs,
                         shell=False,
                         stdout=subprocess.PIPE,
                         stderr=subprocess.PIPE,
                         universal_newlines=True)
    proc.append(p)

# Wait until all sub-jobs are done
while True:
    runningtasks = 0
    for id in range(numOfSubJobs):
        if proc[id].poll() is None:    # still running
            runningtasks += 1
    if runningtasks == 0:
        break
    time.sleep(5)   # check every 5 seconds
#
# Show the output of each sub-job
for id in range(numOfSubJobs):
    sout, serr = proc[id].communicate()
    #
    print("SubJob", id)
    print(sout)
    print(serr)
}}}
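
'launch_subjob.sh' is not shown above; it is expected to start one MPI run on the hosts listed in the machinefile it receives as its first argument. Below is a minimal sketch, assuming Intel MPI from the intel-psxe module; './my_mpi_app' is a placeholder for your actual MPI executable:

{{{
#!/bin/bash
# launch_subjob.sh (sketch) - run one MPI sub-job on the hosts listed in
# the machinefile passed as the first argument.
MACHINEFILE=$1
NPROCS=$(wc -l < ${MACHINEFILE})   # one line per MPI rank

# './my_mpi_app' is a placeholder; substitute your own MPI program.
mpirun -machinefile ${MACHINEFILE} -np ${NPROCS} ./my_mpi_app
}}}

The script's stdout and stderr are captured by jobLauncher.py and printed once all sub-jobs have finished; redirect them to a file inside the script instead if the sub-jobs produce a lot of output.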