#!/bin/sh
#
# pbs_mom	This script will start and stop the PBS Mom
#
# chkconfig: 345 95 05
# description: TORQUE/PBS is a versatile batch system for SMPs and clusters
#
### BEGIN INIT INFO
# Provides: pbs_mom
# Required-Start: $local_fs $network $syslog
# Required-Stop: $local_fs $network $syslog
# Default-Start: 2 3 4 5
# Default-Stop: 0 1 6
# Short-Description: Start up the pbs_mom daemon
# Description: pbs_mom is part of a batch scheduler
#  This service starts up the compute node.
### END INIT INFO

# Raise the open-file-descriptor limit for this shell; processes started from
# this script (including the daemon) inherit it.
ulimit -n 32768
# Source the library functions
# (the daemon, success and failure helpers used below are not defined in this
# file; they are expected to come from this library)
. /etc/rc.d/init.d/functions

# NOTE: customize these variables as needed
# SBIN_PATH : directory containing the pbs_mom and momctl executables
# PBS_DAEMON: full path of the daemon binary to launch
# PBS_HOME  : TORQUE home directory, passed to pbs_mom via -d
# PBS_ARGS  : extra command-line arguments appended when starting pbs_mom
SBIN_PATH=/usr/bin
PBS_DAEMON="$SBIN_PATH/pbs_mom"
PBS_HOME=/var/spool/torque
PBS_ARGS=""

# Lock file whose presence records that this service was started by this script
SUBSYS_LOCK="/var/lock/subsys/pbs_mom"

# Optional site overrides; any of the variables above may be redefined here.
if [ -f /etc/sysconfig/pbs_mom ];then
	. /etc/sysconfig/pbs_mom
fi

# Derived only after the sysconfig file has been read so that an overridden
# PBS_HOME is honoured. pidof_pbs_mom reads the master daemon's PID from it.
MOM_LOCK="$PBS_HOME/mom_priv/mom.lock"

# Choose the job-handling flag passed to pbs_mom on "start".
# NOTE(review): $previous is presumably exported by init during a runlevel
# change and absent for manual invocations -- confirm for the target init system.
if [ -z "$previous" ];then
   # being run manually, don't disturb jobs
   args="$args -p"
else
   # NOTE(review): non-manual start (presumably boot or runlevel change);
   # see pbs_mom(8) for the exact semantics of -q versus -p.
   args="$args -q"
fi

pidof_pbs_mom() {
	# Echo the PID of the master pbs_mom process and return 0 if such a
	# process is running. Otherwise echo nothing and return 1.
	#
	# A PID is accepted only when all of the following hold:
	#   1. $MOM_LOCK exists and contains a purely numeric PID,
	#   2. pidof reports that PID as a pbs_mom process,
	#   3. that process has the $MOM_LOCK file open (same device:inode).
	#
	# Globals read:    MOM_LOCK
	# Globals written: lockfile_info, parent_mom_pid, fd_path, fd_info

	# device:inode identity of the lock file; stat fails if it does not exist
	lockfile_info=$(stat -Lc "%d:%i" "$MOM_LOCK" 2>/dev/null) || return 1

	# now we are sure the $MOM_LOCK file exists
	parent_mom_pid=$(cat "$MOM_LOCK" 2>/dev/null)

	# An empty or non-numeric lock file cannot name a process; rejecting it
	# here also keeps stray characters out of the regex and /proc path below.
	case "$parent_mom_pid" in
		''|*[!0-9]*) return 1 ;;
	esac

	# the recorded PID must be one of the running pbs_mom processes
	pidof pbs_mom | grep -E -e "(^| )${parent_mom_pid}( |$)" >/dev/null 2>&1 \
		|| return 1

	# the parent pbs_mom should hold the $MOM_LOCK file open; look for an open
	# descriptor that resolves to the same device:inode as the lock file.
	# If the glob matches nothing, stat fails on the literal pattern and the
	# comparison below is simply false.
	for fd_path in /proc/"${parent_mom_pid}"/fd/*; do
		fd_info=$(stat -Lc "%d:%i" "$fd_path" 2>/dev/null)
		if [ "x${fd_info}" = "x${lockfile_info}" ]; then
			echo "$parent_mom_pid"
			return 0
		fi
	done

	return 1
}

kill_pbs_mom() {
	# Ask the master pbs_mom process to shut down, if one is running.
	#
	# Makes up to five attempts, one second apart. Each attempt first checks
	# that the process still exists and then runs "momctl -s".
	#
	# Globals read:    SBIN_PATH (directory containing momctl)
	# Globals written: pid, i
	# Returns: 0 if no master pbs_mom is found, if the process has gone away,
	#          or as soon as "momctl -s" succeeds; 1 if all five attempts fail.

	# nothing to do when no master pbs_mom is running
	pid=$(pidof_pbs_mom) || return 0

	# An explicit word list is used because brace ranges such as {1..5} are
	# not expanded by a POSIX /bin/sh.
	for i in 1 2 3 4 5; do
		# kill -0 sends no signal; it only tests whether the process exists
		kill -0 "$pid" >/dev/null 2>&1 || return 0
		"$SBIN_PATH/momctl" -s && return 0
		sleep 1
	done
	return 1
}

# Dispatch on the requested action. Every branch that falls through to the
# final "exit" sets RET first, so the script's exit status is always explicit.
# daemon, success and failure come from the sourced init function library.
case "$1" in
	start)
		echo -n "Starting TORQUE Mom: "
		# check if pbs_mom is already running
		if pid=$(pidof_pbs_mom); then
			if [ -e "$SUBSYS_LOCK" ]; then
				echo -n "pbs_mom already running (pid $pid)"
				RET=0
			else
				# daemon is up but the subsys lock is missing: recreate it
				touch "$SUBSYS_LOCK" && echo -n "pbs_mom running (pid $pid)"
				RET=$?
			fi
			if [ "$RET" -eq 0 ]; then
				success
				echo
				exit 0
			fi
			# the lock could not be recreated; fall through and let the
			# daemon launch below report the outcome
		fi

		# ulimit -c unlimited  # Uncomment this to preserve core files
		# $args and $PBS_ARGS are deliberately unquoted: each may hold
		# several whitespace-separated options.
		daemon $PBS_DAEMON $args -d $PBS_HOME $PBS_ARGS
		RET=$?
		# record the service as started only if the daemon really came up
		[ "$RET" -eq 0 ] && touch "$SUBSYS_LOCK"
		echo
		;;
	purge)
		# stop a running instance first, then restart pbs_mom with -r
		[ -f "$SUBSYS_LOCK" ] && "$0" stop
		echo -n "Starting TORQUE Mom with purge: "
		daemon $PBS_DAEMON -r
		RET=$?
		[ "$RET" -eq 0 ] && touch "$SUBSYS_LOCK"
		echo
		;;
	stop)
		echo -n "Shutting down TORQUE Mom: "
		# check if pbs_mom is running
		if ! pid=$(pidof_pbs_mom); then
			echo -n "pbs_mom already stopped"
			# clear a stale lock so "status" does not keep reporting
			# "dead but subsys locked"
			rm -f "$SUBSYS_LOCK"
			success
			echo
			exit 0
		fi

		kill_pbs_mom
		RET=$?
		if [ "$RET" -eq 0 ]; then
			success "shutdown"
			# drop the lock only once the shutdown request succeeded
			rm -f "$SUBSYS_LOCK"
		else
			failure "shutdown"
		fi
		echo
		;;
	status)
		# Return codes per Linux Standard Base (LSB) Core Specification 3.1:
		#   0 = running, 2 = dead but lock file exists, 3 = not running
		if pid=$(pidof_pbs_mom); then
			if [ -e "$SUBSYS_LOCK" ]; then
				echo -n "pbs_mom already running"
				success
			else
				echo -n "pbs_mom running but subsys not locked"
				failure
			fi
			echo
			exit 0
		fi

		if [ -e "$SUBSYS_LOCK" ]; then
			echo -n "pbs_mom dead but subsys locked"
			failure
			echo
			exit 2
		fi

		echo -n "pbs_mom already stopped"
		success
		echo
		exit 3
		;;
	restart)
		"$0" stop
		sleep 1
		"$0" start
		RET=$?
		;;
	condrestart|try-restart)
		# restart only when the service is currently running
		"$0" status || exit 0
		"$0" restart
		RET=$?
		;;
	reload)
		echo -n "Re-reading TORQUE Mom config file: "
		if pid=$(pidof_pbs_mom); then
			kill -HUP "$pid"
			RET=$?
			if [ "$RET" -eq 0 ]; then
				success "HUP"
			else
				failure "HUP"
			fi
		else
			# LSB exit status 7: program is not running
			RET=7
			failure "HUP"
		fi
		echo
		;;
	*)
		echo "Usage: pbs_mom {start|stop|restart|condrestart|try-restart|reload|status|purge}"
		exit 1
		;;
esac
exit $RET
