| 1 | = Example Pipeline for Data Transfer Using Globus and Computation in Batch Jobs = |
| 2 | Here we consider a pipeline that performs the following steps. |
| 3 | 1. Transfer data files from Box to the Cypress Lustre directory. |
| 4 | 2. Perform computation using the data. |
| 5 | 3. Transfer the results to Box. |
| 6 | 4. Delete files in Cypress Lustre. |
| 7 | |
| 8 | == Scripts == |
| 9 | === Job Submission Script === |
| 10 | '''submitJob.sh''' |
| 11 | {{{ |
| 12 | #!/bin/bash |
| 13 | # Pipeline for Data Transfer Using Globus and Computation |
| 14 | # Each job is submitted only if the previous submission returned a job ID. |
| 15 | # Job name |
| 16 | JOB_NAME="COMPUTING1" |
| 17 | # Set path |
| 18 | export BOX_DATA_DIR="/Test/" |
| 19 | export CYPRESS_WORK_DIR="/lustre/project/group/userid/test/" |
| 20 | export BOX_RESULT_DIR="/Test_result/" |
| 21 | |
| 22 | # Submit a job to transfer data from Box to Cypress |
| 23 | JOB1=$(sbatch --job-name="${JOB_NAME}_DL" ./transferData.sh DOWNLOAD KEEP | awk '{print $4}') |
| 24 | [ -n "$JOB1" ] || { echo "ERROR! The download job could not be submitted." >&2; exit 1; } |
| 25 | echo "$JOB1" "Submitted" |
| 26 | # Submit a job to process data on Cypress |
| 27 | JOB2=$(sbatch --job-name="${JOB_NAME}" --dependency=afterok:"$JOB1" ./computing.sh | awk '{print $4}') |
| 28 | [ -n "$JOB2" ] || { echo "ERROR! The computing job could not be submitted." >&2; exit 1; } |
| 29 | echo "$JOB2" "Submitted" |
| 30 | # Submit a job to transfer data from Cypress to Box |
| 31 | JOB3=$(sbatch --job-name="${JOB_NAME}_UL" --dependency=afterok:"$JOB2" ./transferData.sh UPLOAD DELETE | awk '{print $4}') |
| 32 | [ -n "$JOB3" ] || { echo "ERROR! The upload job could not be submitted." >&2; exit 1; } |
| 33 | echo "$JOB3" "Submitted" |
| 34 | }}} |
| 35 | '''JOB_NAME''' is the base name of the Slurm jobs; the download and upload jobs are named with the suffixes `_DL` and `_UL`. |
| 36 | |
| 37 | '''BOX_DATA_DIR''' is the directory in Box where the source data is stored. |
| 38 | |
| 39 | '''CYPRESS_WORK_DIR''' is the directory on Cypress where the downloaded data is stored and where the computation runs. |
| 40 | |
| 41 | '''BOX_RESULT_DIR''' is the directory in Box to which the results are uploaded. |
| 42 | |
| 43 | === Data Transfer Script === |
| 44 | '''transferData.sh''' |
| 45 | {{{ |
| 46 | #!/bin/bash |
| 47 | #SBATCH --partition=centos7 |
| 48 | #SBATCH --qos=long |
| 49 | #SBATCH --time=7-00:00:00 |
| 50 | #SBATCH --nodes=1 |
| 51 | #SBATCH --ntasks-per-node=1 |
| 52 | #SBATCH --cpus-per-task=1 |
| 53 | |
| 54 | # Transfer data between Box and Cypress with Globus. |
| 55 | # Usage: transferData.sh [DOWNLOAD | UPLOAD] [KEEP | DELETE] |
| 56 | #   DOWNLOAD copies BOX_DATA_DIR (Box) to CYPRESS_WORK_DIR (Cypress). |
| 57 | #   UPLOAD copies CYPRESS_WORK_DIR (Cypress) to BOX_RESULT_DIR (Box). |
| 58 | #   DELETE removes the transfer source afterwards; any other value keeps it. |
| 59 | # TULANE_BOX is expected to hold the Box endpoint ID; this script does not set it. |
| 60 | |
| 61 | # Report an error, stop Globus Connect, and abort the job. |
| 62 | fail() { |
| 63 |   echo "$1" |
| 64 |   globusconnect -stop |
| 65 |   exit 1 |
| 66 | } |
| 67 | |
| 68 | # Check options; an unknown direction is rejected instead of being run as UPLOAD |
| 69 | if [ $# -ne 2 ] || [[ "$1" != "DOWNLOAD" && "$1" != "UPLOAD" ]]; then |
| 70 |   echo 'Usage: transferData.sh [DOWNLOAD | UPLOAD] [KEEP | DELETE]' |
| 71 |   exit 1 |
| 72 | fi |
| 73 | |
| 74 | # Check path |
| 75 | for name in BOX_DATA_DIR CYPRESS_WORK_DIR BOX_RESULT_DIR; do |
| 76 |   if [[ -z "${!name}" ]]; then |
| 77 |     echo "ERROR! ${name} isn't set." |
| 78 |     exit 1 |
| 79 |   fi |
| 80 | done |
| 81 | |
| 82 | # Start Globus Connect in the background |
| 83 | module load globusconnectpersonal/3.2.5 |
| 84 | globusconnect -start & |
| 85 | |
| 86 | # Set up CLI environment |
| 87 | source activate globus-cli |
| 88 | |
| 89 | # Obtain local UUID |
| 90 | MY_UUID=$(globus endpoint local-id) || fail "ERROR! Globus Connect isn't activated." |
| 91 | |
| 92 | # TULANE_BOX is checked only here, after the environment setup above, |
| 93 | # in case that setup is what defines it |
| 94 | [[ -n "${TULANE_BOX}" ]] || fail "ERROR! TULANE_BOX isn't set." |
| 95 | |
| 96 | # Make the source and destination path |
| 97 | if [[ "$1" == "DOWNLOAD" ]]; then |
| 98 |   SOURCE_EP="$TULANE_BOX:$BOX_DATA_DIR" |
| 99 |   DEST_EP="$MY_UUID:$CYPRESS_WORK_DIR" |
| 100 | else |
| 101 |   SOURCE_EP="$MY_UUID:$CYPRESS_WORK_DIR" |
| 102 |   # Results go to BOX_RESULT_DIR, not back into the source data directory |
| 103 |   DEST_EP="$TULANE_BOX:$BOX_RESULT_DIR" |
| 104 | fi |
| 105 | |
| 106 | # Check logged in to Globus |
| 107 | globus whoami >/dev/null 2>&1 || fail "ERROR! Not logged in to Globus" |
| 108 | |
| 109 | # Start the transfer. The exit status is tested before the output goes through |
| 110 | # tail/awk, because a pipeline reports the status of its last command only. |
| 111 | if ! output=$(globus transfer "$SOURCE_EP" "$DEST_EP" --label "$SLURM_JOB_NAME"); then |
| 112 |   fail "ERROR! The transfer of data in could not be started." |
| 113 | fi |
| 114 | task_id=$(echo "$output" | tail -1 | awk '{print $3}') |
| 115 | [[ -n "$task_id" ]] || fail "ERROR! The transfer of data in could not be started." |
| 116 | |
| 117 | # wait until the task is done. |
| 118 | if ! output=$(globus task wait "$task_id"); then |
| 119 |   echo "ERROR! The transfer of data was failed." |
| 120 |   globus task cancel "$task_id" |
| 121 |   globusconnect -stop |
| 122 |   exit 1 |
| 123 | fi |
| 124 | |
| 125 | # Check if the delete option is set; note that DELETE removes the transfer |
| 126 | # source, i.e. the Box data for DOWNLOAD and the Cypress directory for UPLOAD |
| 127 | if [[ "$2" == "DELETE" ]]; then |
| 128 |   task_id=$(globus rm --recursive "$SOURCE_EP" |& awk '{print $6}' | sed -e "s/\"//g") |
| 129 |   globus task wait $task_id |
| 130 | fi |
| 131 | |
| 132 | # done successfully |
| 133 | source deactivate globus-cli |
| 134 | globusconnect -stop |
| 135 | exit 0 |
| 136 | }}} |
| 137 | |
| 138 | |
| 139 | === Computing Script === |
| 140 | '''computing.sh''' |
| 141 | {{{ |
| 142 | #!/bin/bash |
| 143 | #SBATCH --partition=defq |
| 144 | #SBATCH --qos=normal |
| 145 | #SBATCH --time=1-00:00:00 |
| 146 | #SBATCH --nodes=1 |
| 147 | #SBATCH --ntasks-per-node=1 |
| 148 | #SBATCH --cpus-per-task=20 |
| 149 | |
| 150 | # cd to working directory; stop with a non-zero status if that fails, so that |
| 151 | # nothing runs in the wrong directory and the job does not report success |
| 152 | cd "${CYPRESS_WORK_DIR:?CYPRESS_WORK_DIR is not set}" || exit 1 |
| 153 | pwd |
| 154 | |
| 155 | # module load ... computing something |
| 156 | touch RES |
| 157 | sleep 5 |
| 158 | #done |
| 159 | exit 0 |
| 160 | }}} |
| 161 | |
| 162 | |
| 163 | == How to submit a job == |
| 164 | {{{ |
| 165 | sh ./submitJob.sh |
| 166 | }}} |