home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
Power-Programmierung
/
CD1.mdf
/
magazine
/
drdobbs
/
1990
/
12
/
bauer.asc
next >
Wrap
Text File
|
1990-11-15
|
36KB
|
973 lines
_CONTROLLING BACKGROUND PROCESSES UNDER UNIX_
by Barr E. Bauer
[LISTING ONE]
# run - interactive front end of the shepard system -- B. E. Bauer 1990
#
# Files read by run (all must live in the login directory):
#   .run.ini     defaults for script, dataset, host and datadir
#   .current     jobs that were started from this workstation
#   .hosts       machines able to run shepard
#   .tasklist    menu of tasks together with their shepard flags
#   .runscripts  machines and the scripts each one offers
#
# Task flags; they drive the case statement further down and are handed
# unchanged to shepard as its first argument:
#   x  execute (submit) a job
#   m  monitor job
#   p  probe job
#   s  status of running jobs on all platforms
#   r  list of running jobs
#   k  kill job (with extreme prejudice)
#   t  terminate job in a controlled manner (script dependent)
#   l  list log on remote machine
#   b  bump a waiting job from the .waiting list
#   d  delete a waiting job
#   f  list finished jobs
#   e  list error log
#   c  change host
#   w  list waiting jobs
#   a  list restart jobs
#   g  restart a restartable job

# name of the machine run was started on
origin=$(hostname)

# Split the output of date into words and glue the interesting ones into
# one day-month-year@time token (this replaces the positional parameters).
set - $(date)
day=$3
month=$2
year=$6
tm=$4
datetime="$day-$month-$year@$tm"

echo "welcome to run on $origin at $datetime"

# pull in the values remembered from the previous session ...
. $HOME/.run.ini
# ... and the network definition
. $HOME/.shepard.ini
# Report the jobs that .current marks as DONE, copy that report to the run
# log, and then strip those entries from .current.
if [ -f $HOME/.current ]; then
  cnt=$(grep -c DONE $HOME/.current)
  if [ "$cnt" = "0" ]; then
    echo "no new finished jobs"
  else
    # build the report once; it is both shown and logged
    awk '
      BEGIN {
        printf "\njobs recently finished\n"
        printf "\n%-10s %-10s %-8s %-21s %-21s\n\n", "script", "dataset", "host", "start", "end"
      }
      $7 == "DONE" { printf "%-16s%-16s%-8s%-20s%-20s\n", $1, $2, $3, $5, $6 }
    ' $HOME/.current >tmp
    echo ' '
    cat tmp
    echo 'press any key to continue \c'
    read sel
    cat tmp >>$HOME/run.log
    # keep only the entries that are not finished yet
    awk '$7 != "DONE"' $HOME/.current >tmp
    mv tmp $HOME/.current
  fi
fi
# Choose the host that all following activities are directed at.
# The hosts in $HOME/.hosts are listed with a running number and the current
# default ($defhost) is flagged. An empty answer keeps the default. A number
# that matches no line also keeps the default, so host and defhost are never
# blanked by a mistyped selection.
awk 'BEGIN {
  n=1
  printf "\n----- current hosts -------------------------\n\n"
} {
  if ("'$defhost'" == $1)
    printf "%-3s%-16s%s %s %s (default)\n",n,$1,$2,$3,$4
  else printf "%-3s%-16s%s %s %s\n",n,$1,$2,$3,$4
  n++
}
END {
  printf "\nselect a host machine by number: "
}' $HOME/.hosts
read sel
host=$defhost
if [ -n "$sel" ]; then
  # translate the number into the host name found on that line of .hosts
  sel=`awk 'BEGIN {n=1}{if ("'$sel'" == n) print $1; n++}' $HOME/.hosts`
  if [ -n "$sel" ]; then
    host=$sel; defhost=$sel
  else
    echo 'no such host number, keeping '$defhost
  fi
fi
# Main menu loop of run.
# Each pass shows the task menu from $HOME/.tasklist, maps the chosen number
# to a shepard flag and acts on it. shepard is called directly when the
# selected host is this machine ($origin) and through rsh otherwise.
# An empty answer at the task prompt leaves the loop.
loop=YES
while [ "$loop" = "YES" ]
do
  # display menu of tasks
  echo ' '; echo 'current host is ' $host
  echo ' '
  awk ' BEGIN {
    n=1
    printf "\t# flag task\n"
    printf "\t----------------------------------------------\n"
  }{
    printf "\t%-3s\t%s\n",n,$0
    n++
  }
  END {
    printf "\ntask selection number [<ret> to exit]: "
  } ' $HOME/.tasklist
  read sel
  # if response is <ret>, exit while loop
  if [ -z "$sel" ]; then
    break
  fi
  # look up the shepard flag (field 1) and task word (field 2) of the
  # selected menu line; the flag drives the case statement
  task=`awk 'BEGIN {n=1} {if("'$sel'" == n) print $2; n++}' $HOME/.tasklist`
  flag=`awk 'BEGIN {n=1} {if("'$sel'" == n) print $1; n++}' $HOME/.tasklist`
  case $flag in
  -x) # start a job. Queries for script, dataset, datadir
    # list scripts available only on selected host
    awk ' BEGIN {
      n=1
      def=0
      printf "\n# (host) script"
      printf "\n-------------------------------------\n"
    }
    "'$host'" == $1 {
      if ($2 == "'$defscript'") {
        printf "%-2s %s (default)\n",n,$0
        def = n
      }
      else printf "%-2s %s\n",n,$0
      n++
    }
    END {
      printf "\nselect a script by number [%s]: ",def
    } ' $HOME/.runscripts
    read tmp
    # look up the script selected by number (must be on one line)
    sel=`awk 'BEGIN{n=1} "'$host'"==$1 {if("'$tmp'" == n) print $2; n++} ' $HOME/.runscripts`
    if [ "$sel" = "" ]; then
      script=$defscript
    else
      script=$sel; defscript=$sel
    fi
    echo 'selected script is '$script
    # get the dataset name
    echo ' '; echo 'enter dataset name ['$defdata']: \c'
    read sel
    if [ "$sel" = "" ]; then # substitute default for <ret>
      dataset=$defdata
    else
      dataset=$sel; defdata=$sel
    fi
    echo 'selected dataset is '$dataset
    # get the directory where the data is located
    # if $SHEPARD_NETWORK is set to "remote", data moves between machines
    # using nfs otherwise, data is retained on server
    # home directory on the host machine, then back when done
    echo ' '; echo 'enter directory of data on '
    case $SHEPARD_NETWORK in
    remote) echo $origin': \c';;
    nfs) echo $origin' using nfs mount on '$host': \c';;
    # the literal text $HOME is kept on purpose here (single quotes)
    server) echo $host': \c'; defdir='$HOME';;
    esac
    read sel
    if [ "$sel" = "" ]; then # substitute default for <ret>
      datadir=$defdir
    else
      datadir=$sel; defdir=$sel
    fi
    echo 'selected directory is '$datadir
    # append new job entry to $HOME/.current; the values must be expanded
    # here, so the list is built with double quotes. "out" is the
    # placeholder for the end time in field 6, field 7 is the status.
    llist="$script $dataset $host $datadir $datetime"
    echo $llist 'out' 'STARTED' >>$HOME/.current
    if [ "$origin" = "$host" ]; then
      shepard $flag $script $dataset $host $datadir
    else
      rsh $host shepard $flag $script $dataset $origin $datadir
    fi;;
  -s) # listing of current file. shows activity on other platforms
    awk ' BEGIN {
      fmt="%-5s %-16s %-16s %-21s %-16s\n"
      dash5="-----"
      dash16="----------------"
      dash21="---------------------"
      n=1
      printf "\n\ncurrent job status\n\n"
      printf fmt,"#","script","dataset","submitted","status"
      printf fmt,dash5,dash16,dash16,dash21,dash16
      printf "\n"
    } {
      printf fmt,n,$1,$2,$5,$7
      n++
    }
    END {
      printf "\npress any key to continue "
    } ' $HOME/.current
    read sel;;
  -[ktpdbg])
    # these are all list processing commands using pick an item menuing
    # the menu is generated by shepard on the selected host
    # the item is picked in run and the selection happens in shepard
    case $flag in
    -[ktp]) lflag='-r';; # list running jobs
    -[db]) lflag='-w';; # list waiting jobs
    -g) lflag='-a';; # list restartable jobs
    esac
    if [ "$origin" = "$host" ]; then
      shepard $lflag dummy2 dummy3 dummy4 dummy5
    else
      rsh $host shepard $lflag dummy2 dummy3 dummy4 dummy5
    fi
    echo ' '; echo 'select number of job to \c'
    case $flag in
    -k) echo 'kill \c';;
    -t) echo 'halt gracefully \c';;
    -g) echo 'restart \c';;
    -d) echo 'remove from waiting queue \c';;
    -b) echo 'bump to top of queue \c';;
    -p) echo 'probe running status \c';;
    esac
    read sel # select one from list
    arg5=$sel
    # only the list number travels to shepard (as argument 5)
    if [ "$origin" = "$host" ]; then
      shepard $flag dummy2 dummy3 dummy4 $arg5
    else
      rsh $host shepard $flag dummy2 dummy3 dummy4 $arg5
    fi;;
  -c) # change hosts
    awk 'BEGIN {
      n=1
      printf "----- current hosts -------------------------\n\n"
    } {
      if ("'$defhost'" == $1) {
        printf "%-3s%-16s%s %s %s (default)\n",n,$1,$2,$3,$4 }
      else printf "%-3s%-16s%s %s %s\n",n,$1,$2,$3,$4
      n++
    }
    END {
      printf "select a new host machine by number: "
    }' $HOME/.hosts
    read sel
    # <ret> or a number that matches no line keeps the current default
    host=$defhost
    if [ -n "$sel" ]; then
      sel=`awk 'BEGIN {n=1}{if ("'$sel'"==n) print $1; n++}' $HOME/.hosts`
      if [ -n "$sel" ]; then
        host=$sel; defhost=$sel
      else
        echo 'no such host number, keeping '$defhost
      fi
    fi;;
  -[rewfalm]) # process listing commands
    if [ "$origin" = "$host" ]; then
      shepard $flag dummy2 dummy3 dummy4 dummy5
    else
      rsh $host shepard $flag dummy2 dummy3 dummy4 dummy5
    fi
    read sel;;
  *) # woops
    echo $flag 'is not a recognized option, try again'
  esac
done # bottom of while loop
# Remember the values of this session: $HOME/.run.ini is rewritten from
# scratch and is sourced again the next time run starts, so the prompts
# then default to whatever was used last (handy when checking on a job
# that was started earlier).
{
  echo 'defscript='$defscript
  echo 'defdata='$defdata
  echo 'defhost='$defhost
  echo 'defdir='$defdir
} >$HOME/.run.ini
echo 'end of run'
[LISTING TWO]
# shepard - task management component of the shepard system -- B. E. Bauer 1990
# Shepard is the action component of the system. When invoked, it
# owns all the associated files (see top of shepard_queue for list)
# and updates the current file on the originator, log and err files.
# Shepard can be invoked from local or remote machines; it senses
# local or remote operation and behaves accordingly.
# Shepard handles all tasks except for job queueing (shepard_queue) and
# application-specific job probing (defined in $probe_script as sourced
# in 'script'.script). Shepard is called by terminating jobs for cleanup.
# Shepard can be present in several executing copies called by run (the
# user interface) and by completing jobs waiting for cleanup. To avoid
# collision between shepards, absolute ownership of all associated files
# is essential, and is accomplished by creating a lock file. All other
# versions of shepard have to wait until the first is done.

# Wait until the lock file could be created; this ensures that only one
# shepard at a time owns the files. The lock is always addressed as
# $HOME/.sheplock, the same path that is removed at the end of shepard,
# so it does not depend on the directory shepard was started from.
until lockon $HOME/.sheplock
do sleep 5; done
# From here on this copy owns the lock, so it alone may remove it when a
# signal arrives. The trap is installed only now so that a copy that is
# still waiting never deletes the lock of the copy that holds it.
trap 'rm -f $HOME/.sheplock; exit' 1 2 3 15
iam=`hostname`
. $HOME/.shepard.ini # source the initialization file
# do not display greeting message if called from terminating process
if [ "$1" != "-z" ]; then
  # if you see the message, you made it.
  # Important verification that remote shell command is functioning
  echo 'shepard on '$iam' at '`date`
fi
# For the commands that act on one list entry, argument 5 is the number of
# the entry in the list belonging to that command. Look the entry up and
# keep its five fields.
pass=NO
case $1 in # select the file name associated with flag
-[ktp]) fname=$HOME/.running; pass=YES;;
-[bd]) fname=$HOME/.waiting; pass=YES;;
-g) fname=$HOME/.restart; pass=YES;;
esac
if [ "$pass" = "YES" ]; then # do the lookup
  scr=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $1; n++}' $fname`
  dset=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $2; n++}' $fname`
  host=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $3; n++}' $fname`
  ddir=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $4; n++}' $fname`
  sel=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $5; n++}' $fname`
  tname=$host':'$scr'('$dset')' # compact file name
fi
# No loop in shepard: it performs the single command named by $1 and exits.
# For the commands that act on one list entry the values scr, dset, host,
# ddir, sel and tname were looked up before this point. All bookkeeping
# files, the logs and the scratch file are addressed through $HOME so the
# result does not depend on the directory shepard was started from.

# Date/time stamp (day-month-year@time) for the log entries. It is built
# with awk instead of "set -" so the positional parameters stay intact.
datetime=`date | awk '{print $3 "-" $2 "-" $6 "@" $4}'`

case $1 in
-x) # runs job through queue manager which handles submission
  shepard_queue $2 $3 $4 $5;;
-m) # system-dependent code here. "big" is using Berkeley UNIX
  # while all others use SYSTEM V. Options to ps are different
  if [ "$iam" = "big" ]; then
    ps -ax | grep -n shepard_exec # CONVEX specific (for example)
  else
    ps -ef # SGI IRIS specific (for example)
  fi;;
-p) # probe job - script-dependent
  # Source the application-specific definitions of the selected job.
  # $2 is only a placeholder when run calls shepard for this command, so
  # the script name comes from the looked-up entry ($scr).
  . $SHEPARD_DIR/$scr.script
  . $SHEPARD_DIR/$probe_script # defined in the sourced .script file
  echo 'press <ret> to continue \c';;
-r) # list running jobs on host
  if [ ! -s $HOME/.running ]; then
    echo ' '; echo 'no jobs running'; echo ' '
  else
    awk ' BEGIN {
      fmt="\n%-5s %-16s %-16s %-8s\n"
      printf "\n----- running jobs on %s -----\n","'$iam'"
      printf fmt,"#","script","dataset","pid"
      printf "----- ---------------- ---------------- --------\n"
      n=1
    } {
      printf fmt,n,$1,$2,$5
      n++
    }
    END {
      printf "\npress any key to continue "
    } ' $HOME/.running
  fi;;
-w) # list waiting jobs on host
  if [ ! -s $HOME/.waiting ]; then
    echo ' '; echo 'no jobs waiting'; echo ' '
  else
    awk ' BEGIN {
      fmt="\n%-5s %-16s %-16s %-8s\n"
      printf "\n----- waiting jobs on %s -----\n","'$iam'"
      printf fmt,"#","script","dataset","position"
      printf "----- ---------------- ---------------- --------\n"
      n=1
    } {
      printf fmt,n,$1,$2,$5
      n++
    }
    END {
      printf "\npress any key to continue "
    } ' $HOME/.waiting
  fi;;
-a) # list restartable jobs on host
  if [ ! -s $HOME/.restart ]; then
    echo ' '; echo 'no jobs in restart'; echo ' '
  else
    awk ' BEGIN {
      fmt="\n%-5s %-16s %-16s %-8s\n"
      printf "\n----- restartable jobs on %s -----\n","'$iam'"
      printf fmt,"#","script","dataset","position"
      printf "----- ---------------- ---------------- --------\n"
      n=1
    } {
      printf fmt,n,$1,$2,$5
      n++
    } ' $HOME/.restart
  fi;;
-g) # restart a job from $HOME/.restart and update
  # file to select passed as shell argument 5
  # copies the selected entry to $HOME/.waiting with priority=RESTART
  awk ' BEGIN {
    n=1
  } {
    if (n == "'$5'") printf "%s %s %s %s RESTART\n",$1,$2,$3,$4
    n++
  } ' $HOME/.restart >> $HOME/.waiting
  # the restarted job is purged from $HOME/.restart
  awk ' BEGIN {
    n=1
  } {
    if (n != "'$5'") print $0
    n++
  }' $HOME/.restart > $HOME/tmp
  mv $HOME/tmp $HOME/.restart
  echo 'restarting '$tname' at '$datetime >>$HOME/shepard.log
  #update .current on origin machine
  if [ "$host" = "$iam" ]; then
    run_update -g $scr $dset $sel
  else
    rsh $host run_update -g $scr $dset $sel
  fi
  shepard_queue -r;; # do the restart
-k) # kill job with extreme prejudice
  # $sel is the pid stored in .running for the selected job
  # running processes have 2 entries in the process list
  # first = shepard_exec and has the pid stored in running
  # second = the executable application
  # searching the process list for first finds second; both
  # must be killed to stop the application: killing shepard_exec
  # alone leaves the application program still running
  if [ "$iam" = "big" ]; then
    # Berkeley "ps -axl": pid is column 3, parent pid is column 4
    cleanup=`ps -axl | awk ' "'$sel'" == $4 {print $3}'`
  else
    # System V "ps -ef": pid is column 2, parent pid is column 3
    cleanup=`ps -ef | awk ' "'$sel'" == $3 {print $2}'`
  fi
  killstat=0
  kill -9 $sel || killstat=1
  if [ -n "$cleanup" ]; then # no child found: nothing more to kill
    kill -9 $cleanup || killstat=1
  fi
  if [ "$killstat" = "0" ]; then
    echo 'killed '$tname' at '$datetime >>$HOME/shepard.log
  else
    echo 'status of kill command nonzero - check log for problems'
  fi
  awk ' $5 != "'$sel'" { print $0 }' $HOME/.running > $HOME/tmp
  mv $HOME/tmp $HOME/.running
  #update .current on origin machine
  if [ "$host" = "$iam" ]; then
    run_update -k $scr $dset $sel
  else
    rsh $host run_update -k $scr $dset $sel
  fi
  shepard_queue -q;; # check for waiting jobs
-t) # terminate job gracefully
  # source the application-specific definitions of the selected job
  # ($scr, see the note at -p) and then its shutdown script
  . $SHEPARD_DIR/$scr.script
  . $SHEPARD_DIR/$terminate_script # named in the sourced .script file
  echo 'terminated '$tname' at '$datetime >> $HOME/shepard.log
  #update .current on origin machine
  if [ "$host" = "$iam" ]; then
    run_update -t $scr $dset $sel
  else
    rsh $host run_update -t $scr $dset $sel
  fi;; # when the application exits, it will check for waiting jobs
-l) # list the job log on host
  tail -30 $HOME/shepard.log;; # only the last is generally interesting
-b) # bump priority of specific job
  # $HOME/.waiting can be in any order, use 2-pass approach
  # pass 1: set desired to zero, increment all that were ahead of it
  # pass 2: change 0 to 1, zero now being easy to spot
  # positions are compared as numbers (the +0) so 10 sorts after 9
  awk ' {
    if ($1 == "'$scr'") {
      if ($5 == "'$sel'") $5 = 0
      else if ($5+0 < "'$sel'"+0) $5 += 1
    }
    printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
  } ' $HOME/.waiting | awk ' {
    if ($5 == 0) $5 = 1
    printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
  } ' > $HOME/tmp
  mv $HOME/tmp $HOME/.waiting
  echo 'bumped '$tname' at '$datetime >> $HOME/shepard.log
  if [ "$host" = "$iam" ]; then
    run_update -b $scr $dset $sel
  else
    rsh $host run_update -b $scr $dset $sel
  fi;;
-d) # delete a waiting job from waiting, selected passed as shell arg 5
  # jobs of the same script queued behind it move up one position
  # (numeric comparison, see -b)
  awk ' {
    if ($1 == "'$scr'") {
      if ($5 == "'$sel'") next # excise deleted job
      if ($5+0 > "'$sel'"+0) $5 = $5 - 1
    }
    print $0
  } ' $HOME/.waiting > $HOME/tmp
  mv $HOME/tmp $HOME/.waiting
  echo 'deleted '$tname' at '$datetime >> $HOME/shepard.log
  #update .current on origin machine
  if [ "$host" = "$iam" ]; then
    run_update -d $scr $dset $sel
  else
    rsh $host run_update -d $scr $dset $sel
  fi;;
-f) # list finished jobs
  awk ' BEGIN {
    fmt="\n%-16s %-16s %-12s\n"
    printf "\n----- finished jobs on %s -----\n","'$iam'"
    printf fmt,"script","dataset","origin"
    printf "---------------- ---------------- ------------\n"
  } {
    printf fmt,$1,$2,$3
  }
  END {
    printf "\npress any key to continue "
  } ' $HOME/.finished;;
-e) # list error log
  tail -30 $HOME/shepard.err;;
-z) # go to cleanup routine, $5 has the completed jobs pid number
  # arguments here: 2 = script, 3 = dataset, 4 = origin, 5 = pid
  echo 'finished '$4':'$2'('$3') at '`date` >>$HOME/shepard.log
  # write entry to .finished
  # run on origin will look here for completed jobs
  echo $2 $3 $4 $5 `date` >> $HOME/.finished
  # excise finished job from $HOME/.running list
  awk '{
    if ("'$5'" != $5) print $0
  }' $HOME/.running > $HOME/tmp
  mv $HOME/tmp $HOME/.running
  #update .current on origin machine
  if [ "$4" = "$iam" ]; then
    run_update -f $2 $3 $5
  else
    rsh $4 run_update -f $2 $3 $5
  fi
  #check queue for waiting process
  shepard_queue -q;;
esac
rm -f $HOME/.sheplock # remove locking file
# normal return to run if invoked by remote shell, otherwise terminates
[LISTING THREE]
trap 'rm -f $HOME/tmp; exit' 1 2 3 15
# shepard_queue - queue manager for shepard system -- B. E. Bauer 1990
# shepard_queue places jobs in a waiting queue and allows a job
# to actually start if the count of similar jobs running is
# below a user defined threshold. It is like a FIFO queue with a twist.
# This is intended to balance throughput vs system demands on
# multiprocessor high performance computers. Alter for your environment
# jobs in $HOME/.waiting have a number associated with their place in the
# queue. 1=next to start up to limit defined in .limits
# passed arguments:
# normal queue submit: 1: script name
#                      2: dataset name
#                      3: originating machine name
#                      4: dataset directory
# restart              1: -r (no other values passed)
# queue check          1: -q (no other values passed)
#
# for restart, $HOME/.waiting has the restart job preappended
iam=`hostname`
. $HOME/.shepard.ini # source the initialization file
mode=NORMAL
if [ "$1" = "-r" ]; then # restart entry submitted
  # get the script which has the RESTART code (normally passed as $1)
  scr=`awk '$5=="RESTART" {print $1}' $HOME/.waiting`
  # find and replace RESTART with last queue slot for corresponding script
  awk ' BEGIN {
    count = 1
  } {
    if ("'$scr'" != $1) print $0
    else if ($5 != "RESTART") {
      count++
      printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
    }
    else printf "%s %s %s %s %s\n",$1,$2,$3,$4,count
  } ' $HOME/.waiting > $HOME/tmp
  mv $HOME/tmp $HOME/.waiting
elif [ "$1" != "-q" ]; then # new job to submit
  # append new job entry to $HOME/.waiting list
  echo $1 $2 $3 $4 'NEW' >> $HOME/.waiting
  # change NEW label to count of jobs having that script
  # newest entry has the highest number/last to be executed
  awk ' BEGIN {
    count = 1
  } {
    if ("'$1'" != $1) print $0
    else if ($5 != "NEW") {
      count++
      print $0
    }
    else printf "%s %s %s %s %s\n",$1,$2,$3,$4,count
  } ' $HOME/.waiting > $HOME/tmp
  mv $HOME/tmp $HOME/.waiting
  # number of waiting jobs for this script; it is handed to run_update on
  # the originating machine as its fourth argument
  cnt=`awk 'BEGIN{n=0}"'$1'" == $1 {n++} END {print n}' $HOME/.waiting`
  if [ "$3" = "$iam" ]; then
    run_update -w $1 $2 $cnt
  else
    rsh $3 run_update -w $1 $2 $cnt
  fi
else
  mode=QUEUE # flag suppresses terminal response when in -q mode
fi
didit=NO # flag reports job starting status
# loop through scripts available on this host
# available scripts are in the environment variable SHEPARD_SCRIPTS
# The FIFO queue has a twist: differing job types are subqueued with
# limits for each found in .limits without maintaining separate queue
# structures. This method is easier to implement and permits a maximum
# load balance consisting of a mix of program types, tailored to ones
# needs. In this way, a number of program type 'a' exceeding the limit
# only runs the number set in .limits, while the others queue leaving
# processor time for program types 'b' and 'c'. The optimum load balance is
# determined by the system resource requirements of each program and
# ones needs for throughput; adjusting .limits allows changes on the fly.
for i in $SHEPARD_SCRIPTS
do
  # count jobs actually running for each script, get associated job limit
  if [ -f "$HOME/.running" ]; then
    rcnt=`awk 'BEGIN{n=0} "'$i'"==$1 {n++} END{print n}' $HOME/.running`
  else
    rcnt=0 # set rcnt to 0 if $HOME/.running is not present
  fi
  rlim=`awk '"'$iam'" == $1 && "'$i'" == $2 {print $3}' $HOME/.limits`
  if [ -z "$rlim" ]; then
    rlim=1 # if no limit in $HOME/.limits, one job permitted
  fi
  # if the running jobs reach the limit, continue to next script
  if [ "$rcnt" -ge "$rlim" ]; then
    continue
  fi
  # loop to next script if no jobs waiting with priority=1
  script=`awk ' "'$i'" == $1 && $5 == "1" { print $1}' $HOME/.waiting`
  if [ "$script" != "$i" ]; then
    continue
  fi
  # found one for current script, get the remaining values
  dataset=`awk ' "'$i'" == $1 && $5 == "1" { print $2}' $HOME/.waiting`
  origin=`awk ' "'$i'" == $1 && $5 == "1" { print $3}' $HOME/.waiting`
  datadir=`awk ' "'$i'" == $1 && $5 == "1" { print $4}' $HOME/.waiting`
  # put date/time in a single string format
  set - `date`
  day=$3 month=$2 year=$6 tm=$4
  datetime=$day-$month-$year@$tm
  # submit shepard_exec to the background, get its pid
  # NOTE(review): "wait 10" waits for a process with id 10, it is not a
  # pause; possibly "sleep 10" was meant - confirm before changing.
  wait 10 # shepard_queue does not wait for shepard_exec
  nohup shepard_exec $script $dataset $origin $datadir >$HOME/shepard_junk.log &
  errflag=$? # status of the launch itself
  pid=$! # process identification number - unique for job
  # NOTE(review): the shell reports status 0 for a command it has put in
  # the background, so the branch below is only reached if that ever
  # changes; a failure inside shepard_exec is not seen here.
  # shepard_exec did not initiate, for some reason
  # append the shepard_junk.log to shepard.err, alert the user
  # the job is placed in .restart
  if [ "$errflag" != "0" ]; then
    #notify the user
    echo $script'('$dataset') did not start at '$datetime
    echo ' return code '$errflag
    echo '------ process error logfile contents -----'
    cat $HOME/shepard_junk.log
    echo '------ end of log from '$script'('$dataset') -----'
    echo ' '; echo 'check the contents of shepard.err for details'
    # update shepard.err (the file the message above points to)
    {
      echo $script'('$dataset') did not start at '$datetime
      echo '------ process error logfile contents -----'
      cat $HOME/shepard_junk.log
      echo '------ end of log from '$script'('$dataset') -----'
    } >> $HOME/shepard.err
    # remove from $HOME/.waiting, place in .restart
    awk ' $1 == "'$script'" && $2 == "'$dataset'" && $5 == 1 {
      print $0
    }' $HOME/.waiting >> $HOME/.restart
    # drop the failed entry, move the rest of this script up by one and
    # pass the entries of all other scripts through unchanged
    awk '{
      if ($1 == "'$i'" && $5 == 1) next
      if ($1 == "'$i'") {
        $5 = $5 - 1
        printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
      }
      else print $0
    }' $HOME/.waiting > $HOME/tmp
    mv $HOME/tmp $HOME/.waiting
    exit
  fi
  didit=YES
  # append job specifics to $HOME/.running
  echo $script $dataset $origin $datadir $pid >>$HOME/.running
  # append job info to shepard.log
  echo $script'('$dataset') started '$datetime >>$HOME/shepard.log
  # remove the started job from $HOME/.waiting and move the remaining
  # jobs of this script up by one; the entries of all other scripts are
  # passed through unchanged
  awk '{
    if ($1 == "'$i'" && $5 == 1) next
    if ($1 == "'$i'") {
      $5 = $5 - 1
      printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
    }
    else print $0
  }' $HOME/.waiting > $HOME/tmp
  mv $HOME/tmp $HOME/.waiting
  #update .current on origin machine
  if [ "$iam" = "$origin" ]; then
    run_update -r $script $dataset $pid
  else
    rsh $origin run_update -r $script $dataset $pid
  fi
  # if job is successfully started, notify user
  if [ "$didit" = "YES" ]; then
    echo ' '
    echo $script'('$dataset') started on '$iam' at '$datetime
  fi
done
if [ "$didit" = "NO" ]; then
  if [ "$mode" != "QUEUE" ]; then
    echo ' '; echo 'no jobs were submitted'
  fi
fi
trap '' 1 2 3 15
[LISTING FOUR]
# shepard_exec - execution portion of shepard system -- B. E. Bauer, 1990
# passed args: 1: script, 2: dataset, 3: origin, 4: datadir
#
# On a trapped signal the job is reported as finished through "shepard -z"
# and the script then exits. Without the exit the shell would resume after
# the handler and run the data return and "shepard -z" a second time.
trap 'shepard -z $1 $2 $3 $$; exit 1' 1 2 3 15
. $HOME/.shepard.ini # source initialization file
. $SHEPARD_DIR/$1.script # source application-specific definitions
# routine to move the required files into the execution environment
# sourcing vs separate shell obviates need to pass values
. $SHEPARD_DIR/$getdata_script
# run the program. Assumes here that stdin, stdout, and stderr are
# required (generally true for UNIX) during execution. All other data
# files were moved into the execution environment by $getdata_script
$exe < $2$inp 1> $2$log 2> $2$err
# source the script to return data back in its proper location
. $SHEPARD_DIR/$putdata_script
# ignore the signals from here on so the regular cleanup below is the
# only one that runs
trap '' 1 2 3 15
# clean up and update status files
shepard -z $1 $2 $3 $$ # pid of completing process returns as arg5
[LISTING FIVE]
# run_update - update component of shepard system -- B.E. Bauer 1990
# Rewrites $HOME/.current so that the entry of one job shows its new state.
# arguments: 1: flag naming the new state, 2: script, 3: dataset,
#            4: extra value from the caller (stored in opt)
trap 'rm -f $HOME/tmp; exit' 1 2 3 15

# save the arguments first: "set -" below replaces the positional parameters
flag=$1
script=$2
dataset=$3
opt=$4

# current date/time as one day-month-year@time token
set - $(date)
day=$3
month=$2
year=$6
tm=$4
datetime="$day-$month-$year@$tm"

# status word that belongs to the flag
case $flag in
  -b) stat='BUMPED' ;;
  -d) stat='DELETED' ;;
  -f) stat='DONE' ;;
  -g) stat='RESTART' ;;
  -k) stat='KILLED' ;;
  -r) stat='RUNNING' ;;
  -t) stat='TERMINATED' ;;
  -w) stat='WAITING' ;;
esac

# entries of the named script/dataset get the new time stamp and status in
# fields 6 and 7; every other line is copied as it is
awk '
  "'$script'" == $1 && "'$dataset'" == $2 {
    printf "%s %s %s %s %s %s %s\n", $1, $2, $3, $4, $5, "'$datetime'", "'$stat'"
    next
  }
  { print $0 }
' $HOME/.current >$HOME/tmp
mv $HOME/tmp $HOME/.current
trap '' 1 2 3 15
[LISTING SIX]
big bmin31lv Batchmin large (<2000 atoms)
big bmin31mv Batchmin medium (<1000 atoms)
big spartan ab initio electronic structure calculation
big amber biological structure simulation
big smapps Monte Carlo peptide simulation
big ampac semi-empirical electronic structure calculation
big dspace NMR distance -> structure
moe bmin31ls Batchmin large (<2000 atoms) use with caution!
moe bmin31ms Batchmin medium (<1000 atoms) default
moe bmin31ss Batchmin small (<250 atoms)
moe amber biological structure simulation
moe ampac semi-empirical electronic structure calculation
moe spartan ab initio electronic structure calculation
moe smapps Monte Carlo peptide simulation
larry bmin31ss Batchmin small (<250 atoms)
larry ampac semi-empirical electronic structure calculation
[LISTING SEVEN]
larry SGI IRIS 4D/25TG (B-1-3-09)
curly SGI IRIS 4D/120GTX (B-8-3-22; CADD room)
moe SGI IRIS 4D/240s (B-8-3-22; CADD room)
big CONVEX 220 (B-20-B)
[LISTING EIGHT]
-x execute (submit) a job
-m monitor remote host process status (ps command)
-p probe running job
-s status of running jobs on all platforms
-r running job list
-k kill job (with extreme prejudice)
-t terminate job gracefully
-g restart job
-l log file on remote machine (tail -30)
-b bump waiting job to next
-d delete a waiting job
-f finished job list on host machine
-e error log on host machine (tail -30)
-c change hosts
-w waiting job list on host machine
-a restartable job list on host machine
[LISTING NINE]
# .run.ini - the script, dataset, host and data directory that run used
# last. run sources this file when it starts and writes it anew when it ends.
defhost=big
defscript=bmin31lv
defdata=bmintest3
defdir=pla2
[LISTING TEN]
big bmin31lv 3
big bmin31mv 1
big spartan 1
big amber 2
big smapps 1
big ampac 3
moe bmin31ms 2
moe bmin31ss 4
moe amber 2
moe smapps 1
moe ampac 4
moe spartan 1
larry bmin31ss 1
larry ampac 1
[LISTING ELEVEN]
# Per-host settings of the shepard system. The file is sourced, so the
# assignments take effect directly in the environment of the calling script.
#   SHEPARD_SCRIPTS  scripts that may run on the host; must agree with the
#                    contents of .runscripts
#   SHEPARD_NETWORK  how datasets move for the originating machine:
#                    server, nfs or remote
#   SHEPARD_DIR      location of the application-specific shepard scripts
case $(hostname) in
  larry | curly)
    # SGI IRIS workstation definitions
    SHEPARD_DIR=$HOME/shepard_dir
    SHEPARD_NETWORK=server
    SHEPARD_SCRIPTS='bmin31ss ampac'
    ;;
  moe)
    # SGI IRIS-240 compute server definitions
    SHEPARD_DIR=$HOME/shepard_dir
    SHEPARD_NETWORK=server
    SHEPARD_SCRIPTS='bmin31ss bmin31ms amber smapps ampac spartan'
    ;;
  big)
    # CONVEX specific definitions
    SHEPARD_DIR=$HOME/shepard_dir
    SHEPARD_NETWORK=server
    SHEPARD_SCRIPTS='bmin31lv bmin31mv amber smapps ampac spartan dspace'
    ;;
esac
[LISTING TWELVE]
# bmin31lv.script - definitions for the large vector (CONVEX) version of
# Batchmin v 3.1

# program to run; found through PATH
exe=bmin31lv

# file name extensions appended to the dataset name
inp=.com # standard input
log=.log # standard output
err=.err # error output (channel 2)

# helper scripts belonging to this application
probe_script=bmin31lv.probe         # application-specific probe
terminate_script=bmin31lv.terminate # application-specific shutdown
getdata_script=bmin31lv.getdata     # fetches the input datafiles
putdata_script=bmin31lv.putdata     # puts the output back
[LISTING THIRTEEN]
# bmin31lv.getdata - brings the input files of a job into the execution
# environment. Sourced in shepard_exec, so it sees that script's arguments:
#   2: dataset, 3: origin, 4: datadir
# What datadir means depends on the network choice:
#   server: host-$HOME/datadir (host-$HOME is prepended)
#   nfs:    nfs path of data from host to origin machines
#   remote: origin-$HOME/datadir (origin-$HOME is prepended)
case $SHEPARD_NETWORK in
  nfs)
    # copy via remotely mounted nfs dir
    cp $4/$2.dat .
    cp $4/$2$inp .
    ;;
  remote)
    # move data; a kluge: the remote cat puts its output on the host
    rsh $3 cat $4/$2.dat >$2.dat
    rsh $3 cat $4/$2$inp >$2$inp
    ;;
  server)
    # data stays put on host machine
    cd $4
    ;;
esac
[LISTING FOURTEEN]
# bmin31lv.putdata - returns the output files of a job to their proper
# place. Sourced in shepard_exec, so it sees that script's arguments:
#   2: dataset, 3: origin, 4: datadir
# What datadir means depends on the network choice:
#   server: host-$HOME/datadir (host-$HOME is prepended)
#   nfs:    nfs path of data from host to origin machines
#   remote: origin-$HOME/datadir (origin-$HOME is prepended)
# All application-specific output files are moved, if necessary.
case $SHEPARD_NETWORK in
  nfs)
    cp $2.out $4/$2.out
    cp $2$log $4/$2$log
    cp $2$err $4/$2$err
    ;;
  remote)
    # another network kluge: the remote cat with ">" writes on the
    # remote machine, "<" reads the local file
    rsh $3 cat '>'$4/$2.out <$2.out
    rsh $3 cat '>'$4/$2$log <$2$log
    rsh $3 cat '>'$4/$2$err <$2$err
    ;;
  server)
    # movement of files is not necessary
    cd $HOME
    ;;
esac
[LISTING FIFTEEN]
# bmin31lv.terminate - application-specific shutdown; sourced from shepard.
# batchmin terminates when it finds dataset.stp in execution dir.
# The job is identified through $dset (dataset) and $ddir (data directory),
# which shepard looks up for the selected entry, the same variables the
# probe script relies on. The positional parameters cannot be used for
# this: when run asks shepard to terminate a job, arguments 2 to 4 are
# only placeholders.
case $SHEPARD_NETWORK in
server) echo 'help me, please help me' > $ddir/$dset.stp;;
remote|nfs) echo 'help me, please help me' > $dset.stp;;
esac
[LISTING SIXTEEN]
# bmin31lv.probe - shows the last 30 lines of the log file of the selected
# job. Sourced from shepard and relies on its variables:
#   $dset = jobname, $ddir = directory, $log = logfile extension
case $SHEPARD_NETWORK in
  nfs | remote)
    tail -30 $dset$log
    ;;
  server)
    tail -30 $ddir/$dset$log
    ;;
esac
[LISTING SEVENTEEN]
/* lockon.c - creates lock file from argv[1] having no privilege
   B. E. Bauer, 1990

   usage: lockon lockfile

   The lock file is created with mode 0 (no permission bits at all).
   Exit status: 0 when the file could be created (lock obtained),
                1 when creat() failed (lock not obtained),
                2 when the program was called with the wrong number of
                  arguments.
   Callers such as "until lockon file; do sleep 5; done" rely on a
   non-zero status meaning "no lock", so a usage error must not report 0.
   NOTE(review): the locking presumably relies on creat() failing for a
   file that already exists without write permission; that does not hold
   for the superuser - confirm for the target systems.
*/
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

int main(int argc, char *argv[])
{
    int fp, locked;

    locked = 0;
    /* argv[0] is the program name, so one argument means argc == 2 */
    if (argc != 2) {
        printf("\nuseage: lockon lockfile\n");
        exit(2);
    }
    if ((fp = creat(argv[1], 0)) < 0)
        ++locked;           /* could not create: lock not obtained */
    else
        close(fp);          /* only the existence of the file matters */
    return (locked);
}