= Example Pipeline for Data Transfer Using Globus and Computation in Batch Jobs =
Here we consider a pipeline that performs the following steps:
1. Transfer data files from Box to the Cypress Lustre directory.
2. Perform computation using the data.
3. Transfer the results to Box.
4. Delete files in Cypress Lustre.

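Both transfer steps assume a one-time Globus setup on Cypress: Globus Connect Personal is configured as a personal endpoint, a conda environment named globus-cli with the Globus CLI installed exists, and you are logged in to Globus. A rough sketch of that one-time setup (the module and environment names follow the scripts below; the exact commands may differ on your system):
{{{
module load globusconnectpersonal/3.2.5
globusconnect -setup          # one time: register this machine as a personal endpoint
source activate globus-cli    # conda environment that provides the globus CLI
globus login                  # one time: authorize the CLI with your Globus account
}}}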
== Scripts ==
=== Job Submission Script ===
'''submitJob.sh'''
{{{
#!/bin/bash
#
# Pipeline for Data Transfer Using Globus and Computation
#
# Job name
JOB_NAME="COMPUTING1"

# Set path
export BOX_DATA_DIR="/Test/"
export CYPRESS_WORK_DIR="/lustre/project/group/userid/test/"
export BOX_RESULT_DIR="/Test_result/"
#
# Submit a job to transfer data from Box to Cypress
JOB1=`sbatch --job-name=${JOB_NAME}_DL ./transferData.sh DOWNLOAD KEEP | awk '{print $4}'`;
echo $JOB1 "Submitted"

# Submit a job to process data on Cypress
JOB2=`sbatch --job-name=${JOB_NAME} --dependency=afterok:$JOB1 ./computing.sh | awk '{print $4}'`;
echo $JOB2 "Submitted"

# Submit a job to transfer data from Cypress to Box
JOB3=`sbatch --job-name=${JOB_NAME}_UL --dependency=afterok:$JOB2 ./transferData.sh UPLOAD DELETE | awk '{print $4}'`;
echo $JOB3 "Submitted"
}}}
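The second and third sbatch calls use '''--dependency=afterok:<jobid>''', so the computation starts only after the download job finishes successfully, and the upload starts only after the computation finishes successfully. If a job fails, its dependents never run and squeue reports them with the reason DependencyNeverSatisfied. For example, you can watch the remaining dependencies of your queued jobs with:
{{{
# Show job ID, name, state, and remaining dependency for your queued jobs
squeue -u $USER -o "%.10i %.20j %.10T %.20E"
}}}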
'''JOB_NAME''' is the base name of the submitted jobs; the transfer jobs are given the suffixes _DL (download) and _UL (upload).

'''BOX_DATA_DIR''' is the directory in Box where the source data is stored.

'''CYPRESS_WORK_DIR''' is the directory on Cypress Lustre where the downloaded data is stored and where the computation runs.

'''BOX_RESULT_DIR''' is the directory in Box to which the results are uploaded.
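'''TULANE_BOX''' is used by transferData.sh as the UUID of the Box endpoint, but it is not set anywhere in these scripts; export it yourself (for example at the top of submitJob.sh) before submitting. A minimal sketch, with a placeholder UUID:
{{{
# The UUID below is a placeholder -- look up the real Box endpoint UUID,
# e.g. with "globus endpoint search Box", and export that value instead.
export TULANE_BOX="xxxxxxxx-xxxx-xxxx-xxxx-xxxxxxxxxxxx"
}}}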

=== Data Transfer Script ===
'''transferData.sh'''
{{{
#!/bin/bash
#SBATCH --partition=centos7
#SBATCH --qos=long
#SBATCH --time=7-00:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=1

# Check options
if [ $# -ne 2 ]; then
    echo 'Usage: transferData.sh [DOWNLOAD | UPLOAD] [KEEP | DELETE]'
    exit 1
fi

# Check path
if [[ -z "${BOX_DATA_DIR}" ]]; then
    echo "ERROR! BOX_DATA_DIR isn't set."
    exit 1
fi
if [[ -z "${CYPRESS_WORK_DIR}" ]]; then
    echo "ERROR! CYPRESS_WORK_DIR isn't set."
    exit 1
fi
if [[ -z "${BOX_RESULT_DIR}" ]]; then
    echo "ERROR! BOX_RESULT_DIR isn't set."
    exit 1
fi

# Start Globus Connect
module load globusconnectpersonal/3.2.5
globusconnect -start &

# Set up CLI environment
source activate globus-cli

# Obtain local UUID
MY_UUID=$(globus endpoint local-id)
uuid_code=$?
if [ $uuid_code -ne 0 ]; then
    echo "ERROR! Globus Connect isn't activated."
    globusconnect -stop
    exit 1
fi

# Set the source and destination paths (TULANE_BOX holds the UUID of the Box endpoint)
if [[ "$1" == "DOWNLOAD" ]]; then
    SOURCE_EP=$TULANE_BOX:$BOX_DATA_DIR
    DEST_EP=$MY_UUID:$CYPRESS_WORK_DIR
else
    SOURCE_EP=$MY_UUID:$CYPRESS_WORK_DIR
    DEST_EP=$TULANE_BOX:$BOX_RESULT_DIR
fi

# Check that we are logged in to Globus
globus whoami > /dev/null 2>&1
output_code=$?
if [ $output_code -ne 0 ]; then
    echo "ERROR! Not logged in to Globus."
    globusconnect -stop
    exit 1
fi

# Submit the transfer task and extract its task ID from the CLI output
# (--recursive is required because the source and destination are directories)
task_id=$(globus transfer "$SOURCE_EP" "$DEST_EP" --recursive --label "$SLURM_JOB_NAME" | tail -1 | awk '{print $3}')
output_code=$?
if [ $output_code -ne 0 ] || [ -z "$task_id" ]; then
    echo "ERROR! The data transfer could not be started."
    globusconnect -stop
    exit 1
fi

# Wait until the task is done
output=$(globus task wait $task_id)
output_code=$?
if [ $output_code -ne 0 ]; then
    echo "ERROR! The data transfer failed."
    globus task cancel $task_id
    globusconnect -stop
    exit 1
fi

# Check if the delete option is set
if [[ "$2" == "DELETE" ]]; then
    task_id=$(globus rm --recursive $SOURCE_EP |& awk '{print $6}' | sed -e "s/\"//g")
    globus task wait $task_id
fi

# done successfully
source deactivate globus-cli
globusconnect -stop
exit 0
}}}
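If a transfer job fails, the corresponding Globus task can be inspected afterwards from a login node. A minimal sketch, assuming the same globus-cli conda environment used above:
{{{
source activate globus-cli
globus task list             # recent transfer/delete tasks for your account
globus task show TASK_ID     # details for one task; replace TASK_ID with an ID from the list
}}}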

=== Computing Script ===
'''computing.sh'''
{{{
#!/bin/bash
#SBATCH --partition=defq
#SBATCH --qos=normal
#SBATCH --time=1-00:00:00
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=20

# cd to working directory
cd ${CYPRESS_WORK_DIR}
pwd

# module load ... computing something
touch RES
sleep 5

# done
exit 0
}}}
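The touch and sleep lines above are only a placeholder so the pipeline can be tested end to end; whatever is left in CYPRESS_WORK_DIR after this job, including the RES file, is uploaded to Box by the final transfer job. For a real run, replace the placeholder with your actual workload, for example (the module, script, and file names below are purely illustrative):
{{{
module load anaconda3                        # illustrative module name
python analysis.py input.dat > result.txt    # analysis.py and input.dat are placeholders
}}}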

== How to Submit a Job ==
{{{
sh ./submitJob.sh
}}}
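Because the scripts are referenced by relative paths (./transferData.sh, ./computing.sh), run submitJob.sh from the directory that contains all three scripts; it prints the ID of each job as it is submitted. For example (the path is a placeholder):
{{{
cd /path/to/your/scripts   # placeholder: the directory holding submitJob.sh, transferData.sh, computing.sh
sh ./submitJob.sh
squeue -u $USER            # the three chained jobs should now appear in the queue
}}}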