#!/bin/bash
#
# Periodically check state of grid jobs in SLURM, and put mark files
# for finished jobs.
#
# usage: scan_slurm_job control_dir ...

# ARC1 passes the config file first: "--config <file>" precedes the
# control directories on the command line.
if [ "$1" = "--config" ]; then
    shift
    ARC_CONFIG=$1
    shift
fi

# Validate the control directories supplied on the command line.
# At least one is required and every one must be an existing directory.
# All diagnostics go to stderr.
if [ -z "$1" ]; then
    echo "no control_dir specified" 1>&2
    exit 1
fi
for ctr_dir in "$@"; do
    if [ ! -d "$ctr_dir" ]; then
        echo "called with erroneous control dir: $ctr_dir" 1>&2
        exit 1
    fi
done

# Settings defined before the common helper scripts are sourced below.
joboption_lrms="SLURM"
lrms_options="slurm_wakeupperiod slurm_use_sacct slurm_bin_path"

# define paths and config parser
# Resolve the directory holding this script to an absolute path. The
# expansions are quoted so that an installation path containing whitespace
# still works.
basedir=$(dirname "$0")
basedir=$(cd -- "$basedir" > /dev/null && pwd) || exit $?
. "${basedir}/lrms_common.sh"

# include common scan functions
# (pkgdatadir is not set in this file; presumably lrms_common.sh defines it)
. "${pkgdatadir}/scan_common.sh" || exit $?

# run common init
#  * parse config
#  * load LRMS-specific env
#  * set common variables
common_init

# Prevent multiple instances of scan-slurm-job from running concurrently.
lockfile="${TMPDIR:-/tmp}/scan-slurm-job.lock"

# Try to create the lockfile. With noclobber (set -C) the redirection fails
# if the file already exists, so creation doubles as the "is it taken" test.
if ! (set -C; : > "$lockfile") 2> /dev/null; then
    # The lock exists: find out whether the PID stored in it is still alive.
    lockpid=$(cat "$lockfile" 2> /dev/null)
    if ps -p "$lockpid" > /dev/null 2>&1; then
        echo "lockfile exists and PID $lockpid is running"
        exit 1
    fi
    echo "old lockfile found, was scan-slurm-job killed?"

    # sleep, and if nobody else has removed and recreated the lockfile in the
    # meantime we remove it.
    # there are still races possible, but this will have to do
    sleep $((RANDOM % 30 + 10))
    lockpid=$(cat "$lockfile" 2> /dev/null)
    if ps -p "$lockpid" > /dev/null 2>&1; then
        echo "lockfile exists and $lockpid is running"
    else
        echo "still not running, removing lockfile"
        rm -f -- "$lockfile"
    fi
    # Either way this invocation gives up; a later invocation takes the lock.
    exit 1
fi
echo "$$" > "$lockfile"

# Remove the lockfile whenever the script exits. SIGKILL cannot be trapped,
# so it is not listed. On SIGTERM we exit explicitly (which fires the EXIT
# trap) instead of deleting the lock and carrying on without it.
trap 'rm -f -- "$lockfile"' EXIT
trap 'exit 143' TERM

# Default sleep-time is 30 seconds
sleep "${CONFIG_slurm_wakeupperiod:-30}"

# Log system performance (only when a performance log directory is set).
if [ -n "$perflogdir" ]; then
    perflog_common "$perflogdir" "$CONFIG_controldir"
fi

### use sacct
# use_sacct stays unset unless the configuration says exactly "yes".
unset use_sacct
case "${CONFIG_slurm_use_sacct}" in
    yes)
        use_sacct="true"
        ;;
esac

# Numeric uid this script runs as.
my_id=$(id -u)

# Start time stamp for the control directory traversal timing.
if [ -n "$perflogdir" ]; then
    start_ts=$(date +%s.%N)
fi

# List of SLURM jobids for grid-jobs with state INLRMS
declare -a localids
# Array with basenames of grid-job files in ctrl_dir, indexed by localid
# example /some/path/job.XXXXX /some/other/path/job.YYYYY
declare -a basenames
# Array with states of the jobs in SLURM, indexed by localid
declare -a jobstates
# Array to store localids of jobs that are determined to have finished, which are sent to gm-kick
declare -a kicklist
# Array with jobid blocks
declare -a lidblocks

# Find list of grid jobs with status INLRMS (or CANCELING), store localid and
# basename for those jobs.
#
# find hands the status files straight to grep (-exec ... {} +), so grep is
# never started without file operands, and the resulting names are read line
# by line so a path containing blanks is not split into several words.
# The list is read on fd 3 to keep stdin untouched for the loop body.
for ctr_dir in "$@"; do
  while IFS= read -r basename <&3; do
    # The SLURM job id is stored as localid=<id> in the job's .local file.
    localid=$(grep '^localid=' "${basename}.local" | cut -d= -f2)

    # Skip jobs whose id is rejected by verify_jobid.
    verify_jobid "$localid" || continue

    localids+=("$localid")
    basenames[$localid]="$basename"
  done 3< <(find "$ctr_dir/processing" -name 'job.*.status' \
                -exec grep -lE "INLRMS|CANCELING" {} + \
              | sed 's/processing\/job\.\([^\.]*\)\.status$/job.\1/')
done

# No need to continue further if no jobs have status INLRMS
if [ "${#localids[@]}" -eq 0 ]; then
    exit 0
fi


# Split the job ids into comma-separated blocks of at most 4000 ids each so
# that a single squeue command line never becomes too long.
while IFS= read -r id_block; do
  if [ -n "$id_block" ]; then
    lidblocks[${#lidblocks[@]}]=$id_block
  fi
done < <(printf '%s\n' "${localids[@]}" | xargs -n 4000 | tr ' ' ',')

# Record how long the control directory traversal took.
if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }")
   echo "[$(date +%Y-%m-%d\ %T)] scan-slurm-job, ControldirTraversal: $t" >> $perflogfile
fi


if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi

# Get JobStates from SLURM
# squeue is run once per id block and prints one "<jobid>:<state>" record per
# job (no header). A failing invocation aborts the scan.
jobstate_squeue=$(echo "${lidblocks[@]}" | xargs -n 1 "$squeue" -a -h -o "%i:%T" -t all -j) \
    || { echo "squeue failed" 1>&2; exit 1; }

for record in $jobstate_squeue; do
    # Field 1 is the job id, field 2 the state; split with parameter
    # expansion instead of forking cut twice per record.
    localid=${record%%:*}
    state=${record#*:}
    state=${state%%:*}
    # jobstates is an indexed array, so its subscript is evaluated
    # arithmetically. Ignore ids that are not plain integers (for example
    # "123_4") rather than triggering an evaluation error.
    [[ $localid =~ ^[0-9]+$ ]] || continue
    jobstates[$localid]=$state
done
unset jobstate_squeue

if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }")
   echo "[$(date +%Y-%m-%d\ %T)] scan-slurm-job, squeue -a -h -o %i:%T -t all -j: $t" >> "$perflogfile"
fi

# Hand the comment file of a job over to save_commentfile, with the job's
# .errors file as the third argument.
#
# Arguments:
#   $1 - SLURM job id; used as index into the basenames array
# Globals read:
#   jobfile   - path of the job's .local file (assigned by the caller)
#   my_id     - numeric uid of the running process
#   basenames - control-dir basename per job id
# Globals written (none of them is declared local):
#   localid, sessiondir, uid
handle_commentfile () {
    localid=$1
    # Value of the sessiondir= line in the job's .local file.
    sessiondir=`grep -h '^sessiondir=' $jobfile | sed 's/^sessiondir=\(.*\)/\1/'`
    # When not running as uid 0, only job files owned by the current user are
    # meant to be handled.
    # NOTE(review): this function contains no loop, so the 'continue' here and
    # two lines below can only act on a loop in the caller. How bash treats
    # that differs between versions -- confirm it still skips the job.
    if [ "$my_id" != '0' ] ; then
      if [ ! -O "$jobfile" ] ; then continue ; fi
    fi
    uid=$(get_owner_uid "$jobfile")
    [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }
    save_commentfile "$uid" "${sessiondir}.comment" "${basenames[$localid]}.errors"
}

# Ask SLURM (sacct when use_sacct is set, scontrol otherwise) for the exit
# code of a job and write it, together with a message, to the job's
# .lrms_done file. The job id is then appended to kicklist.
# This function is used in the loop below.
#
# Arguments:
#   $1 - SLURM job id (index into basenames)
#   $2 - exit code to use when SLURM does not report a usable one
#   $3 - message written after the exit code
# Globals read:    use_sacct, sacct, scontrol, basenames
# Globals written: localid, tmpexitcode, reason, jobinfostring, exitcode1,
#                  exitcode2, exitcode, kicklist (deliberately not local)
#
# SLURM reports "<code>:<signal>". A non-zero signal is recorded as
# signal + 256, otherwise the plain code is used.
function handle_exitcode {
    local codes
    local code_re='^[[:space:]]*([0-9]+)(:([0-9]+))?[[:space:]]*$'

    localid="$1"
    tmpexitcode="$2"
    reason="$3"

    if [ "$use_sacct" ]; then
      jobinfostring=$("$sacct" -j "$localid.batch" -o ExitCode -P | tail -n 1)
      codes=$jobinfostring
    else
      jobinfostring=$("$scontrol" -o show job "$localid")
      codes=$(echo $jobinfostring | sed -n 's/.*ExitCode=\([0-9]*:[0-9]*\).*/\1/p')
    fi

    # Accept only numeric values. When sacct has no record for the batch step
    # the last output line is the column header "ExitCode"; that must fall
    # back to the caller-supplied code rather than be read as a result.
    exitcode1=''
    exitcode2=''
    if [[ $codes =~ $code_re ]]; then
      exitcode1=${BASH_REMATCH[1]}
      exitcode2=${BASH_REMATCH[3]:-0}
    fi

    if [ -z "$exitcode1" ]; then
      exitcode=$tmpexitcode
    elif [ "$exitcode2" -ne 0 ]; then
      # 10# forces base 10 so a value with a leading zero is not read as octal
      exitcode=$(( 10#$exitcode2 + 256 ))
    elif [ "$exitcode1" -ne 0 ]; then
      exitcode=$exitcode1
    else
      exitcode=0
    fi

    echo "$exitcode $reason" > "${basenames[$localid]}.lrms_done"
    kicklist+=("$localid")
}

# A special version of handle_exitcode, needed to force the exit code to
# non-zero if the job was cancelled, since CANCELLED jobs in SLURM can have
# 0 exit code.
# This is a temporary workaround, should later be replaced by a
# proper fix that determines the reason of failure.
#
# Arguments:
#   $1 - SLURM job id (index into basenames)
#   $2 - exit code to use when SLURM does not report a usable one
#   $3 - message written after the exit code
# Globals read:    use_sacct, sacct, scontrol, basenames
# Globals written: localid, tmpexitcode, reason, jobinfostring, exitcode1,
#                  exitcode2, exitcode, kicklist (deliberately not local)
#
# If the resulting exit code is 0 it is replaced by 15 and the message by
# "Job was cancelled by SLURM".
function handle_exitcode_cancelled {
    local codes
    local code_re='^[[:space:]]*([0-9]+)(:([0-9]+))?[[:space:]]*$'

    localid="$1"
    tmpexitcode="$2"
    reason="$3"

    if [ "$use_sacct" ]; then
      jobinfostring=$("$sacct" -j "$localid.batch" -o ExitCode -P | tail -n 1)
      codes=$jobinfostring
    else
      jobinfostring=$("$scontrol" -o show job "$localid")
      codes=$(echo $jobinfostring | sed -n 's/.*ExitCode=\([0-9]*:[0-9]*\).*/\1/p')
    fi

    # Accept only numeric "<code>:<signal>" values. A header-only sacct answer
    # (last line "ExitCode") falls back to the caller-supplied code.
    exitcode1=''
    exitcode2=''
    if [[ $codes =~ $code_re ]]; then
      exitcode1=${BASH_REMATCH[1]}
      exitcode2=${BASH_REMATCH[3]:-0}
    fi

    if [ -z "$exitcode1" ]; then
      exitcode=$tmpexitcode
    elif [ "$exitcode2" -ne 0 ]; then
      # 10# forces base 10 so a value with a leading zero is not read as octal
      exitcode=$(( 10#$exitcode2 + 256 ))
    elif [ "$exitcode1" -ne 0 ]; then
      exitcode=$exitcode1
    else
      exitcode=0
    fi

    # A cancelled job must never be reported as successful.
    if [ "$exitcode" -eq 0 ]; then
      exitcode=15
      reason="Job was cancelled by SLURM"
    fi

    echo "$exitcode $reason" > "${basenames[$localid]}.lrms_done"
    kicklist+=("$localid")
}


# Replace WallTime in the job's .diag data with the value derived from the
# LRMS, and add StartTime and EndTime for accounting.
#
# Arguments:
#   $1 - SLURM job id
#   $2 - path of the control-dir .diag file (stored in ctr_diag, not used
#        further inside this function)
# Globals read:    use_sacct, sacct, scontrol, return_date_seconds,
#                  return_mds_date (results of the date helpers)
# Globals written: localid, ctr_diag, jobinfostring, cpus, cputime, starttime,
#                  endtime, starttime_seconds, endtime_seconds, LRMSStartTime,
#                  LRMSEndTime, walltime, WallTime, Processors, UserTime
# The diag values are loaded with job_read_diag and stored with
# job_write_diag; only values SLURM actually delivered override them.
function handle_diag_file {
    # Slurm can report StartTime and EndTime in at least these two formats:
    #   2010-02-15T15:30:29
    #   02/15-15:25:15
    # For our code to be able to manage both, the first needs to keep its
    # hyphens, the second needs them removed. The 'T' becomes a blank.
    local time_fix='s,\([0-9]\+/[0-9]\+\)-\([0-9:]\+\),\1 \2,g;s/T/ /g'

    localid="$1"
    ctr_diag="$2"

    job_read_diag

    if [ "$use_sacct" ]; then
      jobinfostring=$("$sacct" -j "$localid.batch" -o NCPUS,NNODES,CPUTimeRAW,Start,End,ExitCode,State -P | tail -n 1)

      # With no record for the batch step the last line is the column header;
      # drop it so that header names are not written to the diag file.
      case "$jobinfostring" in
        NCPUS\|*) jobinfostring='' ;;
      esac

      # Columns: NCPUS|NNODES|CPUTimeRAW|Start|End|ExitCode|State
      IFS='|' read -r cpus _ cputime starttime endtime _ <<< "$jobinfostring"
      starttime=$(printf '%s\n' "$starttime" | sed "$time_fix")
      endtime=$(printf '%s\n' "$endtime" | sed "$time_fix")
    else
      jobinfostring=$("$scontrol" -o show job "$localid")

      starttime=$(printf '%s\n' "$jobinfostring" \
        | sed -n 's/.*StartTime=\([^ ]*\) .*/\1/p' | sed "$time_fix")
      endtime=$(printf '%s\n' "$jobinfostring" \
        | sed -n 's/.*EndTime=\([^ ]*\) .*/\1/p' | sed "$time_fix")
      cpus=$(printf '%s\n' "$jobinfostring" | sed -n 's/.*NumCPUs=\([^ ]*\) .*/\1/p')
    fi

    date_to_utc_seconds "$starttime"
    starttime_seconds="$return_date_seconds"
    seconds_to_mds_date "$return_date_seconds"
    LRMSStartTime=$return_mds_date
    date_to_utc_seconds "$endtime"
    endtime_seconds="$return_date_seconds"
    seconds_to_mds_date "$return_date_seconds"
    LRMSEndTime=$return_mds_date

    #TODO handle cputime, exitcode etc.
    # Only compute the wall time when both time stamps are numbers. A job
    # that never started has no usable start time, and subtracting with an
    # empty operand is an arithmetic syntax error that would abort here.
    walltime=''
    if [[ $starttime_seconds =~ ^[0-9]+$ && $endtime_seconds =~ ^[0-9]+$ ]]; then
      walltime=$(( endtime_seconds - starttime_seconds ))
    fi

    # Values to write to diag. These will override values already written.
    if [ -n "$walltime" ]; then WallTime=$walltime; fi
    if [ -n "$cpus" ]; then Processors=$cpus; fi
    if [ -n "$cputime" ]; then UserTime=$cputime; fi

    job_write_diag
}

if [ -n "$perflogdir" ]; then
   start_ts=$(date +%s.%N)
fi

# Counters for the performance log: running, done, missing and failed jobs.
run=0
done=0
zombie=0
failed=0
# Look at the list of jobstates and determine which jobs have finished.
# Write job.XXXX.lrms_done according to this.
for localid in "${localids[@]}"; do
    # Initialize jobfile variable since it's used below and by
    # handle_commentfile.
    jobfile="${basenames[$localid]}.local"
    case "${jobstates[$localid]}" in
        "")
            # Job is missing (no state) from slurm but INLRMS.
            zombie=$((zombie + 1))
            exitcode=''
            # get session directory of this job
            sessiondir=$(grep -h '^sessiondir=' "$jobfile" | sed 's/^sessiondir=\(.*\)/\1/')
            diagfile="${sessiondir}.diag"
            commentfile="${sessiondir}.comment"
            # When not running as uid 0, only handle our own job files.
            if [ "$my_id" != '0' ] && [ ! -O "$jobfile" ]; then
                continue
            fi
            uid=$(get_owner_uid "$jobfile")
            [ -z "$uid" ] && { log "Failed to stat $jobfile"; continue; }

            if [ -n "$sessiondir" ]; then
                # have chance to obtain exit code
                if [ -z "${RUNTIME_NODE_SEES_FRONTEND}" ]; then
                    # In case of non-NFS setup it may take some time till
                    # diagnostics file is delivered. Try up to 20 times,
                    # 10 seconds apart.
                    diag_tries=20
                else
                    diag_tries=1
                fi
                # uid is known to be non-empty here, so the diag file is
                # always read as the job owner.
                while [ "$diag_tries" -gt 0 ]; do
                    exitcode=$(do_as_uid "$uid" "grep '^exitcode=' '$diagfile'" | sed 's/^exitcode=//')
                    if [ -n "$exitcode" ]; then break; fi
                    diag_tries=$((diag_tries - 1))
                    # no point in sleeping after the last attempt
                    if [ "$diag_tries" -gt 0 ]; then sleep 10; fi
                done
            fi

            jobstatus="$exitcode Job missing from SLURM, exitcode recovered from session directory"
            if [ -z "$exitcode" ]; then
                exitcode="-1"
                jobstatus="$exitcode Job missing from SLURM"
            fi

            save_commentfile "$uid" "$commentfile" "${basenames[$localid]}.errors"
            echo "$jobstatus" > "${basenames[$localid]}.lrms_done"
            kicklist+=("$localid")
            ;;
        PENDING|RUNNING|SUSPENDED|COMPLETING)
            # Job is still in the batch system, nothing to do.
            # (squeue's %T prints the full state name, hence SUSPENDED.)
            run=$((run + 1))
            ;;
        CANCELLED)
            failed=$((failed + 1))
            handle_commentfile "$localid"
            # handle_exitcode_cancelled writes the .lrms_done file and adds
            # the job to kicklist, so neither is done separately here.
            handle_exitcode_cancelled "$localid" "-1" "Job was cancelled"
            handle_diag_file "$localid" "${basenames[$localid]}.diag"
            ;;
        COMPLETED)
            done=$((done + 1))
            handle_commentfile "$localid"
            handle_exitcode "$localid" "0" ""
            handle_diag_file "$localid" "${basenames[$localid]}.diag"
            ;;
        FAILED)
            failed=$((failed + 1))
            handle_commentfile "$localid"
            handle_exitcode "$localid" "-1" "Job failed"
            handle_diag_file "$localid" "${basenames[$localid]}.diag"
            ;;
        TIMEOUT)
            failed=$((failed + 1))
            handle_commentfile "$localid"
            handle_exitcode "$localid" "-1" "Job timeout"
            handle_diag_file "$localid" "${basenames[$localid]}.diag"
            ;;
        NODE_FAIL)
            failed=$((failed + 1))
            handle_commentfile "$localid"
            handle_exitcode_cancelled "$localid" "-1" "Node fail"
            handle_diag_file "$localid" "${basenames[$localid]}.diag"
            ;;
    esac
done

if [ -n "$perflogdir" ]; then
   stop_ts=$(date +%s.%N)
   t=$(awk "BEGIN { printf \"%.3f\", ${stop_ts}-${start_ts} }")
   echo "[$(date +%Y-%m-%d\ %T)] scan-slurm-job, JobHandling, R= $run, D= $done, Z= $zombie, F= $failed:  $t" >> "$perflogfile"
fi

# Kick the GM once for every job that got an .lrms_done file.
# kicklist may contain the same job id more than once; 'kicked' remembers the
# ids already handled so that each job is kicked a single time.
kicked=()
for localid in "${kicklist[@]}"; do
    if [ -n "${kicked[$localid]}" ]; then
        continue
    fi
    kicked[$localid]=1
    # The grid job id is the part of the basename after the last dot
    # (.../job.XXXXX -> XXXXX).
    gridid=${basenames[$localid]##*.}
    "${pkglibexecdir}/gm-kick" -j "${gridid}" "${basenames[$localid]}.local"
done

exit 0
