home *** CD-ROM | disk | FTP | other *** search
- _CONTROLLING BACKGROUND PROCESSES UNDER UNIX_
- by Barr E. Bauer
-
- [LISTING ONE]
-
- origin=`hostname`
-
- # run - the user interface component of the shepard system -- B. E. Bauer 1990
- # configuration files associated with run:
- # .run.ini defaults for script,dataset,host,datadir.
- # .current jobs originating from workstation environment
- # .hosts host machines able to run shepard
- # .tasklist list of tasks. Has flags for shepard
- # .runscripts list of machines and possible scripts
- # these files must be located in the login directory
- # flag (and task definitions) definitions, Used in case statement
- # and passed as actual flag arguments to shepard:
- # x executes (submits) a job
- # m monitors job
- # p probes job
- # s status of running jobs on all platforms
- # r list of running jobs
- # k kill job (with extreme prejudice)
- # t terminate job in a controlled manner (script dependent)
- # l list log on remote machine
- # b bump a waiting job from the .waiting list
- # d delete a waiting job
- # f list finished jobs
- # e list error log
- # c change host
- # w list waiting jobs
- # a list restart jobs
- # g restart a restartable job
-
- # Build one blank-free timestamp token, day-month-year@time, from the
- # words of date's default output (word 2 month, 3 day, 4 time, 6 year).
- set - `date`
- day=$3
- month=$2
- year=$6
- tm=$4
- datetime="$day-$month-$year@$tm"
-
- echo 'welcome to run on '$origin' at '$datetime
-
- . $HOME/.run.ini #source the run-script defaults
- . $HOME/.shepard.ini # has network definition
-
- # check for finished jobs, update list, display finished list
- # find jobs with status RUNNING, check host for status
-
- if (test -f $HOME/.current) then
- cnt=`grep -c DONE $HOME/.current`
- if (test "$cnt" != "0" ) then
- awk 'BEGIN {
- printf "\njobs recently finished\n"
- printf "\n%-10s %-10s %-8s %-21s %-21s\n\n",\
- "script","dataset","host","start","end"
- } $7 == "DONE" {
- printf "%-16s%-16s%-8s%-20s%-20s\n",$1,$2,$3,$5,$6
- } ' $HOME/.current >tmp
- echo ' '; cat tmp # display list of completed jobs
- echo 'press any key to continue \c'; read sel
- cat tmp >> $HOME/run.log # completed job data to runlog
- awk '$7 != "DONE" {
- print $0
- } ' $HOME/.current >tmp
- mv tmp $HOME/.current
- else
- echo "no new finished jobs"
- fi
- fi
-
- # set default host. All activities focus on that host until changed
- awk 'BEGIN {
- n=1
- printf "\n----- current hosts -------------------------\n\n"
- } {
- if ("'$defhost'" == $1)
- printf "%-3s%-16s%s %s %s (default)\n",n,$1,$2,$3,$4
- else printf "%-3s%-16s%s %s %s\n",n,$1,$2,$3,$4
- n++
- }
- END {
- printf "\nselect a host machine by number: "
- }' $HOME/.hosts
- # <ret> keeps the default host; a number picks that line of .hosts
- read sel
- if (test -z "$sel") then
- host=$defhost
- else
- sel=`awk 'BEGIN {n=1}{if ("'$sel'" == n) print $1; n++}' $HOME/.hosts`
- if (test -z "$sel") then # number matched no line: keep the default
- host=$defhost
- else
- host=$sel; defhost=$sel
- fi
- fi
- loop=YES
-
- # top of loop. exit with <ret>
- while (test "$loop" = "YES")
- do
- # display menu of tasks
- echo ' '; echo 'current host is ' $host
- echo ' '
- awk ' BEGIN {
- n=1
- printf "\t# flag task\n"
- printf "\t----------------------------------------------\n"
- }{
- printf "\t%-3s\t%s\n",n,$0
- n++
- }
- END {
- printf "\ntask selection number [<ret> to exit]: "
- } ' $HOME/.tasklist
- read sel
- # look up value for shepard flag associated with task.
- # Use the flag in the case statement
- task=`awk 'BEGIN {n=1} {if("'$sel'" == n) print $2; n++}' $HOME/.tasklist`
- flag=`awk 'BEGIN {n=1} {if("'$sel'" == n) print $1; n++}' $HOME/.tasklist`
-
- # if response is <ret>, exit while loop
- if (test -z "$sel") then
- break
- fi
-
- case $flag in
- -x) # start a job. Queries for script, dataset, datadir
- # list scripts available only on selected host
- awk ' BEGIN {
- n=1
- def=0
- printf "\n# (host) script"
- printf "\n-------------------------------------\n"
- }
- "'$host'" == $1 {
- if ($2 == "'$defscript'") {
- printf "%-2s %s (default)\n",n,$0
- def = n
- }
- else printf "%-2s %s\n",n,$0
- n++
- }
- END {
- printf "\nselect a script by number [%s]: ",def
- } ' $HOME/.runscripts
- read tmp
- # look up the script selected by number (must be on one line)
- sel=`awk 'BEGIN{n=1} "'$host'"==$1 {if("'$tmp'" == n) print $2; n++} ' $HOME/.runscripts`
- if (test "$sel" = "") then
- script=$defscript
- else
- script=$sel; defscript=$sel
- fi
- echo 'selected script is '$script
- # get the dataset name; an empty reply (<ret>) keeps the previous default
- echo ' '; echo 'enter dataset name ['$defdata']: \c'
- read sel
- if (test "$sel" = "") then # substitute default for <ret>
- dataset=$defdata # was $defdat, an unset name that left dataset empty
- else
- dataset=$sel; defdata=$sel
- fi
- echo 'selected dataset is '$dataset
- # get the directory where the data is located
- # the prompt wording depends on $SHEPARD_NETWORK (set in .shepard.ini):
- # remote - data is on this machine and moves between machines
- # nfs - data is on this machine, reached from the host by nfs mount
- # server - data is retained on the host machine
- # this machine's name is held in $origin; run never sets $iam
- echo ' '; echo 'enter directory of data on '
- case $SHEPARD_NETWORK in
- remote) echo $origin': \c';;
- nfs) echo $origin' using nfs mount on '$host': \c';;
- # NOTE(review): the single quotes keep $HOME literal, presumably so that
- # it is expanded on the host by the remote shell - confirm
- server) echo $host': \c'; defdir='$HOME';;
- esac
- read sel
- if (test "$sel" = "") then # substitute default for <ret>
- datadir=$defdir
- else
- datadir=$sel; defdir=$sel
- fi
- echo 'selected directory is '$datadir
- # append new job entry to $HOME/.current
- # fields: script dataset host datadir start-time end-time status
- # double quotes, so the values and not the variable names are recorded
- llist="$script $dataset $host $datadir $datetime"
- echo $llist 'out' 'STARTED' >>$HOME/.current
- # hand the job to shepard, locally or through a remote shell on the host
- if (test "$origin" = "$host") then
- shepard $flag $script $dataset $host $datadir
- else
- rsh $host shepard $flag $script $dataset $origin $datadir
- fi;;
- -s) # listing of current file. shows activity on other platforms
- awk ' BEGIN {
- fmt="%-5s %-16s %-16s %-21s %-16s\n"
- dash5="-----"
- dash16="----------------"
- dash21="---------------------"
- n=1
- printf "\n\ncurrent job status\n\n"
- printf fmt,"#","script","dataset","submitted","status"
- printf fmt,dash5,dash16,dash16,dash21,dash16
- printf "\n"
- } {
- printf fmt,n,$1,$2,$5,$7
- n++
- }
- END {
- printf "\npress any key to continue "
- } ' $HOME/.current
- read sel;;
- -[ktpdbg])
- # these are all list processing commands using pick an item menuing
- # the menu is generated by shepard on the selected host
- # the item is picked in run and the selection happens in shepard
- case $flag in
- -[ktp]) lflag='-r';; # list running jobs
- -[db]) lflag='-w';; # list waiting jobs
- -g) lflag='-a';; # list restartable jobs
- esac
- if (test "$origin" = "$host") then
- shepard $lflag dummy2 dummy3 dummy4 dummy5
- else
- rsh $host shepard $lflag dummy2 dummy3 dummy4 dummy5
- fi
- echo ' '; echo 'select number of job to \c'
- case $flag in
- -k) echo 'kill \c';;
- -t) echo 'halt gracefully \c';;
- -g) echo 'restart \c';;
- -d) echo 'remove from waiting queue \c';;
- -b) echo 'bump to top of queue \c';;
- -p) echo 'probe running status \c';;
- esac
- read sel # select one from list
- arg5=$sel
- if (test "$origin" = "$host") then
- shepard $flag dummy2 dummy3 dummy4 $arg5
- else
- rsh $host shepard $flag dummy2 dummy3 dummy4 $arg5
- fi;;
- -c) # change hosts
- awk 'BEGIN {
- n=1
- printf "----- current hosts -------------------------\n\n"
- } {
- if ("'$defhost'" == $1) {
- printf "%-3s%-16s%s %s %s (default)\n",n,$1,$2,$3,$4 }
- else printf "%-3s%-16s%s %s %s\n",n,$1,$2,$3,$4
- n++
- }
- END {
- printf "select a new host machine by number: "
- }' $HOME/.hosts
- # <ret> keeps the default host; a number picks that line of .hosts
- read sel
- if (test -z "$sel") then
- host=$defhost
- else
- # the file is $HOME/.hosts (the slash was missing)
- sel=`awk 'BEGIN {n=1}{if ("'$sel'"==n) print $1; n++}' $HOME/.hosts`
- if (test -z "$sel") then # number matched no line: keep the default
- host=$defhost
- else
- host=$sel; defhost=$sel
- fi
- fi;;
- -[rewfalm]) # process listing commands
- if (test "$origin" = "$host") then
- shepard $flag dummy2 dummy3 dummy4 dummy5
- else
- rsh $host shepard $flag dummy2 dummy3 dummy4 dummy5
- fi
- read sel;;
- *) # woops
- echo $flag 'is not a recognized option, try again'
- esac
- done # bottom of while loop
-
- # write current values to run-script default file
- # $HOME/.run.ini is sourced on invocation in effect restoring the
- # last values used. Handy for checking on a previously
- # started job - values properly default to the previous
- echo 'defscript='$defscript >$HOME/.run.ini
- echo 'defdata='$defdata >>$HOME/.run.ini
- echo 'defhost='$defhost >>$HOME/.run.ini
- echo 'defdir='$defdir >>$HOME/.run.ini
-
- echo 'end of run'
-
- [LISTING TWO]
-
- trap 'rm -f $HOME/.sheplock; exit' 1 2 3 15
-
- # shepard - task management component of shepard system -- B. E. Bauer 1990
- # Shepard is the action component of the system. When invoked, it
- # owns all the associated files (see top of shepard_queue for list)
- # and updates the current file on the originator, log and err files.
- # Shepard can be invoked from local or remote machines; it senses
- # local or remote operation and behaves accordingly.
- # Shepard handles all tasks except for job queueing (shepard_queue) and
- # application-specific job probing (defined in $probe_script as sourced
- # in 'script'.script). Shepard is called by terminating jobs for cleanup.
- # Shepard can be present in several executing copies called by run (the
- # user interface) and by completing jobs waiting for cleanup. To avoid
- # collision between shepards, absolute ownership of all associated files
- # is essential, and is accomplished by creating a lock file. All other
- # versions of shepard have to wait until the first is done.
-
- # wait until lock file established insures complete ownership
- # of all files by only one version of shepard at a time
- # the lock is named by its full path so that it is the same file the
- # trap and the final rm remove, whatever the current directory is
- until lockon $HOME/.sheplock
- do sleep 5; done
-
- iam=`hostname`
- . $HOME/.shepard.ini # source the initialization file
- # timestamp for the log entries written below, in the same
- # day-month-year@time form that run uses; built with awk instead of
- # "set -" so that the positional arguments $1-$5 are left intact
- datetime=`date | awk '{print $3 "-" $2 "-" $6 "@" $4}'`
- # do not display greeting message if called from terminating process
- if (test "$1" != "-z") then
- echo 'shepard on '$iam' at '`date`
- fi
- # if you see the message, you made it.
- # Important verification that remote shell command is functioning
-
- # lookup values from files depending on mode
- pass=NO
- case $1 in # select the file name associated with flag
- -[ktp]) fname=$HOME/.running; pass=YES;;
- -[bd]) fname=$HOME/.waiting; pass=YES;;
- -g) fname=$HOME/.restart; pass=YES;;
- esac
- if (test "$pass" = "YES") then # do the lookup
- scr=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $1; n++}' $fname`
- dset=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $2; n++}' $fname`
- host=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $3; n++}' $fname`
- ddir=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $4; n++}' $fname`
- sel=`awk 'BEGIN {n=1} {if ("'$5'" == n) print $5; n++}' $fname`
- tname=$host':'$scr'('$dset')' # compact file name
- fi
-
- # no loop in shepard. Does the command then exits
- case $1 in
- -x) # runs job through queue manager which handles submission
- shepard_queue $2 $3 $4 $5;;
- -m) # system-dependent code here. "big" is using Berkeley UNIX
- # while all others use SYSTEM V. Options to ps are different
- if (test "`hostname`" = "big") then
- ps -ax | grep -n shepard_exec # CONVEX specific (for example)
- else
- ps -ef # SGI IRIS specific (for example)
- fi;;
- -p) # probe job - script-dependent
- # source the file containing application-specific definitions
- # $scr comes from the .running lookup above; run passes only
- # placeholders in $2-$4 for this flag, so $2 cannot be used here
- . $SHEPARD_DIR/$scr.script
- # the probe script lives in $SHEPARD_DIR like the other
- # application-specific scripts; its name is set in $scr.script
- . $SHEPARD_DIR/$probe_script
- echo 'press <ret> to continue \c';; # arm ends after the prompt
- -r) # list running jobs on host
- cnt=`wc -l $HOME/.running | awk '{print $1}'`
- if (test "$cnt" = "0") then
- echo ' '; echo 'no jobs running'; echo ' '
- else
- awk ' BEGIN {
- fmt="\n%-5s %-16s %-16s %-8s\n"
- printf "\n----- running jobs on %s -----\n","'$host'"
- printf fmt,"#","script","dataset","pid"
- printf "----- ---------------- ---------------- --------\n"
- n=1
- } {
- printf fmt,n,$1,$2,$5
- n++
- }
- END {
- printf "\npress any key to continue "
- } ' $HOME/.running
- fi;;
- -w) # list waiting jobs on host
- cnt=`wc -l $HOME/.waiting | awk '{print $1}'`
- if (test "$cnt" = "0") then
- echo ' '; echo 'no jobs waiting'; echo ' '
- else
- awk ' BEGIN {
- fmt="\n%-5s %-16s %-16s %-8s\n"
- printf "\n----- waiting jobs on %s -----\n","'$host'"
- printf fmt,"#","script","dataset","position"
- printf "----- ---------------- ---------------- --------\n"
- n=1
- } {
- printf fmt,n,$1,$2,$5
- n++
- }
- END {
- printf "\npress any key to continue "
- } ' $HOME/.waiting
- fi;;
- -a) # list restartable jobs on host
- cnt=`wc -l $HOME/.restart | awk '{print $1}'`
- if (test "$cnt" = "0") then
- echo ' '; echo 'no jobs in restart'; echo ' '
- else
- awk ' BEGIN {
- fmt="\n%-5s %-16s %-16s %-8s\n"
- printf "\n----- restartable jobs on %s -----\n","'$host'"
- printf fmt,"#","script","dataset","position"
- printf "----- ---------------- ---------------- --------\n"
- n=1
- } {
- printf fmt,n,$1,$2,$5
- n++
- } ' $HOME/.restart
- fi;;
- -g) # restart a job from $HOME/.restart and update
- # the line number of the entry to restart is shell argument 5
- # copies the selected entry to $HOME/.waiting with priority=RESTART
- awk ' BEGIN {
- n=1
- } {
- if (n == "'$5'") printf "%s %s %s %s RESTART\n",$1,$2,$3,$4
- n++
- } ' $HOME/.restart >> $HOME/.waiting
- awk ' BEGIN { # restarted job is purged from $HOME/.restart
- n=1
- } {
- if (n != "'$5'") print $0
- n++
- }' $HOME/.restart > $HOME/tmp
- mv $HOME/tmp $HOME/.restart
- # log and temporary file are addressed through $HOME, like every
- # other state file, so the current directory does not matter
- echo 'restarting '$tname' at '$datetime >>$HOME/shepard.log
- #update .current on origin machine
- if (test "$host" = "$iam") then
- run_update -g $scr $dset $sel
- else
- rsh $host run_update -g $scr $dset $sel
- fi
- shepard_queue -r;; # do the restart
- -k) # kill job with extreme prejudice
- # shell argument 5 selects the line of .running; the pid stored on
- # that line was assigned to sel by the lookup above
- # running processes have 2 entries in the process list
- # first = shepard_exec and has the pid stored in running
- # second = the executable application
- # searching the process list for first finds second; both
- # must be killed to stop the application: killing shepard_exec
- # alone leaves the application program still running
- if (test "$iam" = "big") then
- # Berkeley ps -axl: column 3 is the pid, column 4 the parent pid
- cleanup=`ps -axl | awk ' "'$sel'" == $4 {print $3}'`
- else
- # System V ps -ef: column 2 is the pid, column 3 the parent pid
- cleanup=`ps -ef | awk ' "'$sel'" == $3 {print $2}'`
- fi
- # one kill for both, so its status covers shepard_exec as well as the
- # application; $cleanup may be empty or hold more than one pid
- if (kill -9 $sel $cleanup) then
- echo 'killed '$tname' at '$datetime >>$HOME/shepard.log
- else
- echo 'status of kill command nonzero - check log for problems'
- fi
- # drop the job from the running list
- awk ' $5 != "'$sel'" { print $0 }' $HOME/.running > $HOME/tmp
- mv $HOME/tmp $HOME/.running
- #update .current on origin machine
- if (test "$host" = "$iam") then
- run_update -k $scr $dset $sel
- else
- rsh $host run_update -k $scr $dset $sel
- fi
- shepard_queue -q;; # check for waiting jobs
- -t) # terminate job gracefully
- # source the file containing application-specific definitions
- # $scr comes from the .running lookup above; run passes only
- # placeholders in $2-$4 for this flag, so $2 cannot be used here
- . $SHEPARD_DIR/$scr.script
- # the terminate script addresses the job as $4/$2, so load the
- # positional arguments in shepard_exec order first:
- # 1: script, 2: dataset, 3: origin, 4: datadir
- set - $scr $dset $host $ddir
- . $SHEPARD_DIR/$terminate_script # name is set in $scr.script
- echo 'terminated '$tname' at '$datetime >> $HOME/shepard.log
- #update .current on origin machine
- if (test "$host" = "$iam") then
- run_update -t $scr $dset $sel
- else
- rsh $host run_update -t $scr $dset $sel
- fi;; # when the application exits, it will check for waiting jobs
- -l) # list the job log on host
- # read the log through $HOME, where the other arms write it
- tail -30 $HOME/shepard.log;; # only the last is generally interesting
- -b) # bump priority of specific job
- # $HOME/.waiting can be in any order, use 2-pass approach
- # pass 1: set desired to zero, increment all others
- # pass 2: change 0 to 1, zero now being easy to spot
- # "+ 0" forces a numeric comparison; against a quoted constant awk
- # compares as strings and position 10 would sort before position 9
- awk ' {
- if ($1 == "'$scr'") {
- if ($5=="'$sel'") $5 = 0
- if ($5 + 0 < "'$sel'" + 0) $5 += 1
- }
- printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
- } ' $HOME/.waiting | awk ' {
- if ($5 == 0) $5 = 1
- printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
- } ' > $HOME/tmp
- mv $HOME/tmp $HOME/.waiting
- echo 'bumped '$tname' at '$datetime >> $HOME/shepard.log
- #update .current on origin machine
- if (test "$host" = "$iam") then
- run_update -b $scr $dset $sel
- else
- rsh $host run_update -b $scr $dset $sel
- fi;;
- -d) # delete a waiting job from waiting, selected passed as shell arg 5
- # jobs of the same script queued behind it move up one place
- # "+ 0" forces a numeric comparison; against a quoted constant awk
- # compares as strings and position 10 would sort before position 9
- awk ' {
- if ($1 == "'$scr'") {
- if ($5=="'$sel'") next # excise deleted job
- if ($5 + 0 > "'$sel'" + 0) $5 = $5 - 1
- }
- print $0
- } ' $HOME/.waiting > $HOME/tmp
- mv $HOME/tmp $HOME/.waiting
- echo 'deleted '$tname' at '$datetime >> $HOME/shepard.log
- #update .current on origin machine
- if (test "$host" = "$iam") then
- run_update -d $scr $dset $sel
- else
- rsh $host run_update -d $scr $dset $sel
- fi;;
- -f) # list finished jobs
- awk ' BEGIN {
- fmt="\n%-16s %-16s %-12s\n"
- printf "\n----- finished jobs on %s -----\n","'$host'"
- printf fmt,"script","dataset","origin"
- printf "---------------- ---------------- ------------\n"
- } {
- printf fmt,$1,$2,$3
- }
- END {
- printf "\npress any key to continue "
- } ' $HOME/.finished;;
- -e) # list error log
- tail -30 $HOME/shepard.err;;
- -z) # go to cleanup routine, $5 has the completed jobs pid number
- # arguments here: 2: script, 3: dataset, 4: origin, 5: pid
- # log and temporary file are addressed through $HOME, like every
- # other state file, so the current directory does not matter
- echo 'finished '$4':'$2'('$3') at '`date` >>$HOME/shepard.log
- # write entry to .finished
- # run on origin will look here for completed jobs
- echo $2 $3 $4 $5 `date` >> $HOME/.finished
- # excise finished job from $HOME/.running list
- awk '{
- if ("'$5'" != $5) print $0
- }' $HOME/.running >$HOME/tmp
- mv $HOME/tmp $HOME/.running
- #update .current on origin machine
- if (test "$4" = "$iam") then
- run_update -f $2 $3 $5
- else
- rsh $4 run_update -f $2 $3 $5
- fi
- #check queue for waiting process
- shepard_queue -q;;
- esac
-
- rm -f $HOME/.sheplock # remove locking file
-
- # normal return to run if invoked by remote shell, otherwise terminates
-
-
- [LISTING THREE]
-
- trap 'rm -f $HOME/tmp; exit' 1 2 3 15
-
- # shepard_queue - queue manager for shepard system -- B. E. Bauer 1990
- # shepard_queue places jobs in a waiting queue and allows a job
- # to actually start if the count of similar jobs running is
- # below a user defined threshold. Its like a FIFO queue with a twist.
- # This is intended to balance throughput vs system demands on
- # multiprocessor high performance computers. Alter for your environment
- # jobs in $HOME/.waiting have a number associated with their place in the
- # queue. 1=next to start up to limit defined in .limits
- # passed arguments:
- # normal queue submit: 1: script name
- # 2: dataset name
- # 3: originating machine name
- # 4: dataset directory
- # restart 1: -r (no other values passed)
- # queue check 1: -q (no other values passed)
- #
- # for restart, $HOME/.waiting has the restart job preappended
- iam=`hostname`
- . $HOME/.shepard.ini # source the initialization file
- mode=NORMAL
- if (test "$1" = "-r") then # restart entry submitted
- # get the script which has the RESTART code (normally passed as $1)
- scr=`awk 'BEGIN {n=0} $5=="RESTART" {print $1}' $HOME/.waiting`
- # find and replace RESTART with last queue slot for corresponding script
- awk ' BEGIN {
- count = 1
- } {
- if ("'$scr'" != $1) print $0
- else if ($5 != "RESTART") {
- count++
- printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
- }
- else printf "%s %s %s %s %s\n",$1,$2,$3,$4,count
- } ' $HOME/.waiting > $HOME/tmp
- mv $HOME/tmp $HOME/.waiting
- elif (test "$1" != "-q") then # new job to submit
- # append new job entry to $HOME/.waiting list
- echo $1 $2 $3 $4 'NEW' >> $HOME/.waiting
- # change NEW label to count of jobs having that script
- # newest entry has the highest number/last to be executed
- awk ' BEGIN {
- count = 1
- } {
- if ("'$1'" != $1) print $0
- else if ($5 != "NEW") {
- count++
- print $0
- }
- else printf "%s %s %s %s %s\n",$1,$2,$3,$4,count
- } ' $HOME/.waiting > $HOME/tmp
- mv $HOME/tmp $HOME/.waiting
- # count the waiting jobs for this script and report the count to the
- # origin machine (the value of cnt, not the word "cnt")
- cnt=`awk 'BEGIN{n=0}"'$1'" == $1 {n++} END {print n}' $HOME/.waiting`
- if (test "$3" = "$iam") then
- run_update -w $1 $2 $cnt
- else
- rsh $3 run_update -w $1 $2 $cnt
- fi
- else
- mode=QUEUE # flag suppresses terminal response when in -q mode
- fi
-
- didit=NO # flag reports job starting status
-
- # loop through scripts available on this host
- # available scripts are in the environment variable SHEPARD_SCRIPTS
-
- # The FIFO queue has a twist: differing job types are subqueued with
- # limits for each found in .limits without maintaining separate queue
- # structures. This method is easier to implement and permits a maximum
- # load balance consisting of a mix of program types, tailored to ones
- # needs. In this way, a number of program type 'a' exceeding the limit
- # only runs the number set in .limits, while the others queue leaving
- # processor time for program types 'b' and 'c'. The optimum load balance is
- # determined by the system resource requirements of each program and
- # ones needs for throughput; adjusting .limits allows changes on the fly.
-
- for i in $SHEPARD_SCRIPTS
- do
- # count jobs actually running for each script, get associated job limit
- if (test -f "$HOME/.running") then
- rcnt=`awk 'BEGIN{n=0} "'$i'"==$1 {n++} END{print n}' $HOME/.running`
- else
- rcnt=0 # set rcnt to 0 if $HOME/.running is not present
- fi
- rlim=`awk '"'$iam'" == $1 && "'$i'" == $2 {print $3}' $HOME/.limits`
- if (test -z "$rlim") then
- rlim=1 # if no limit in $HOME/.limits, one job permitted
- fi
- # if more running jobs exceeds the limit, continue to next script
- if (test "$rcnt" -ge "$rlim") then
- continue
- fi
- # loop to next script if no jobs waiting with priority=1
- script=`awk ' "'$i'" == $1 && $5 == "1" { print $1}' $HOME/.waiting`
- if (test "$script" != "$i") then
- continue
- fi
- # found one for current script, get the remaining values
- dataset=`awk ' "'$i'" == $1 && $5 == "1" { print $2}' $HOME/.waiting`
- origin=`awk ' "'$i'" == $1 && $5 == "1" { print $3}' $HOME/.waiting`
- datadir=`awk ' "'$i'" == $1 && $5 == "1" { print $4}' $HOME/.waiting`
-
- # put date/time in a single string format
- set - `date`
- day=$3 month=$2 year=$6 tm=$4
- datetime=$day-$month-$year@$tm
- # submit shepard_exec to the background, get its pid
- wait 10 # shepard_queue does not wait for shepard_exec
- nohup shepard_exec $script $dataset $origin $datadir >shepard_junk.log &
- pid=$! # process identification number - unique for job
- errflag=$?
-
- # shepard_exec did not initiate, for some reason
- # append the shepard_junk.log to shepard.err, alert the user
- # the job is placed in .restart
- if (test "$errflag" != "0") then
- #notify the user
- echo $script'('$dataset') did not start at '$datetime
- echo ' return code '$errflag
- echo '------ process error logfile contents -----'
- # shepard_junk.log is read from the current directory, where the
- # redirection of shepard_exec created it
- cat shepard_junk.log
- echo '------ end of log from '$script'('$dataset') -----'
- echo ' '; echo 'check the contents of shepard.err for details'
- # update shepard.err (the file the message above points to and the
- # one shepard -e lists), not shepard.log
- echo $script'('$dataset') did not start at '$datetime >$HOME/tmp
- echo '------ process error logfile contents -----' >>$HOME/tmp
- cat shepard_junk.log >>$HOME/tmp
- echo '------ end of log from '$script'('$dataset') -----' >>$HOME/tmp
- cat $HOME/tmp >> $HOME/shepard.err; rm $HOME/tmp
- # remove from $HOME/.waiting, place in .restart
- awk ' $1 == "'$script'" && $2 == "'$dataset'" && $5 == 1 {
- print $0
- }' $HOME/.waiting >> $HOME/.restart
- # drop the failed job, move the rest of this script's jobs up one
- # place and copy every other script's entries through unchanged
- awk '{
- if ($1 == "'$i'" && $5 == 1) next
- if ($1 == "'$i'") {
- $5 = $5 - 1
- printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
- }
- else print $0
- }' $HOME/.waiting > $HOME/tmp
- mv $HOME/tmp $HOME/.waiting
- exit
- fi
-
- didit=YES
- # append job specifics to $HOME/.running
- echo $script $dataset $origin $datadir $pid >>$HOME/.running
-
- # append job info to shepard.log
- echo $script'('$dataset') started '$datetime >>$HOME/shepard.log
-
- # remove running job from $HOME/.waiting, update priority
- # the started job (position 1) is dropped, the remaining jobs of the
- # same script move up one place, and the entries of every other script
- # are copied through unchanged so they are not lost from the queue
- awk '{
- if ($1 == "'$i'" && $5 == 1) next
- if ($1 == "'$i'") {
- $5 = $5 - 1
- printf "%s %s %s %s %s\n",$1,$2,$3,$4,$5
- }
- else print $0
- }' $HOME/.waiting > $HOME/tmp
- mv $HOME/tmp $HOME/.waiting
- #update .current on origin machine
- if (test "$iam" = "$origin") then
- run_update -r $script $dataset $pid
- else
- rsh $origin run_update -r $script $dataset $pid
- fi
- # if job is successfully started, notify user
- if (test "$didit" = "YES") then
- echo ' '
- echo $script'('$dataset') started on '$iam' at '$datetime
- fi
- done
-
- if (test "$didit" = "NO") then
- if (test "$mode" != "QUEUE") then
- echo ' '; echo 'no jobs were submitted'
- fi
- fi
- trap '' 1 2 3 15
-
- [LISTING FOUR]
-
- trap 'shepard -z $1 $2 $3 $$' 1 2 3 15
-
- # shepard_exec - execution portion of shepard system -- B. E. Bauer, 1990
- # passed args: 1: script, 2: dataset, 3: origin, 4: datadir
-
- . $HOME/.shepard.ini # source initialization file
- . $SHEPARD_DIR/$1.script # source application-specific definitions
-
- # routine to move the required files into the execution environment
- # sourcing vs separate shell obviates need to pass values
- . $SHEPARD_DIR/$getdata_script
-
- # run the program. Assumes here that stdin, stdout, and stderr are
- # required (generally true for UNIX) during execution. All other data
- # files were moved into the execution environment by $getdata_script
- $exe < $2$inp 1> $2$log 2> $2$err
-
- # source the script to return data back in its proper location
- . $SHEPARD_DIR/$putdata_script
-
- # clean up and update status files
- shepard -z $1 $2 $3 $$ # pid of completing process returns as arg5
-
- trap '' 1 2 3 15
-
- [LISTING FIVE]
-
- trap 'rm -f $HOME/tmp; exit' 1 2 3 15
-
- # run_update - update component of shepard system -- B.E. Bauer 1990
- # updates the .current file to reflect system activities
-
- flag=$1 script=$2 dataset=$3 opt=$4
- set - `date`
- day=$3 month=$2 year=$6 tm=$4
- datetime=$day-$month-$year@$tm
-
- # translate the flag into the status word recorded in .current
- case $flag in
- -w) stat='WAITING';;
- -r) stat='RUNNING';;
- -g) stat='RESTART';;
- -b) stat='BUMPED';;
- -k) stat='KILLED';;
- -f) stat='DONE';;
- -d) stat='DELETED';;
- -t) stat='TERMINATED';;
- # any other flag would rewrite the matching entries with an empty
- # status, so stop before .current is touched
- *) echo 'run_update: unrecognized flag '$flag >&2; exit 1;;
- esac
-
- awk ' {
- if ("'$script'" == $1 && "'$dataset'" == $2) {
- printf "%s %s %s %s %s %s %s\n",$1,$2,$3,$4,$5,"'$datetime'","'$stat'"
- }
- else print $0
- }' $HOME/.current >$HOME/tmp
- mv $HOME/tmp $HOME/.current
-
- trap '' 1 2 3 15
-
-
- [LISTING SIX]
-
- big bmin31lv Batchmin large (<2000 atoms)
- big bmin31mv Batchmin medium (<1000 atoms)
- big spartan ab initio electronic structure calculation
- big amber biological structure simulation
- big smapps Monte Carlo peptide simulation
- big ampac semi-empirical electronic structure calculation
- big dspace NMR distance -> structure
- moe bmin31ls Batchmin large (<2000 atoms) use with caution!
- moe bmin31ms Batchmin medium (<1000 atoms) default
- moe bmin31ss Batchmin small (<250 atoms)
- moe amber biological structure simulation
- moe ampac semi-empirical electronic structure calculation
- moe spartan ab initio electronic structure calculation
- moe smapps Monte Carlo peptide simulation
- larry bmin31ss Batchmin small (<250 atoms)
- larry ampac semi-empirical electronic structure calculation
-
-
- [LISTING SEVEN]
-
- larry SGI IRIS 4D/25TG (B-1-3-09)
- curly SGI IRIS 4D/120GTX (B-8-3-22; CADD room)
- moe SGI IRIS 4D/240s (B-8-3-22; CADD room)
- big CONVEX 220 (B-20-B)
-
- [LISTING EIGHT]
-
- -x execute (submit) a job
- -m monitor remote host process status (ps command)
- -p probe running job
- -s status of running jobs on all platforms
- -r running job list
- -k kill job (with extreme prejudice)
- -t terminate job gracefully
- -g restart job
- -l log file on remote machine (tail -30)
- -b bump waiting job to next
- -d delete a waiting job
- -f finished job list on host machine
- -e error log on host machine (tail -30)
- -c change hosts
- -w waiting job list on host machine
- -a restartable job list on host machine
-
- [LISTING NINE]
-
- # Loads values for script, dataset, host, and datadir last used by run.
- # This file is recreated at the end of run.
-
- defscript=bmin31lv
- defdata=bmintest3
- defhost=big
- defdir=pla2
-
-
- [LISTING TEN]
-
- big bmin31lv 3
- big bmin31mv 1
- big spartan 1
- big amber 2
- big smapps 1
- big ampac 3
- moe bmin31ms 2
- moe bmin31ss 4
- moe amber 2
- moe smapps 1
- moe ampac 4
- moe spartan 1
- larry bmin31ss 1
- larry ampac 1
-
- [LISTING ELEVEN]
-
- # Definitions for runnable scripts, dataset network movement and directory for
- # various files. The runnable scripts must be in agreement with contents of
- # .runscripts. Behavior of network for the originating machine is set here.
-
- # options for SHEPARD_NETWORK: server, nfs, remote
- # SHEPARD_DIR is location of application-specific shepard scripts
-
- # this file is sourced and executes directly in environment of script
-
- case `hostname` in
- larry|curly) # SGI IRIS workstation definitions
- SHEPARD_SCRIPTS='bmin31ss ampac'
- SHEPARD_NETWORK=server
- SHEPARD_DIR=$HOME/shepard_dir;;
- moe) # SGI IRIS-240 compute server definitions
- SHEPARD_SCRIPTS='bmin31ss bmin31ms amber smapps ampac spartan'
- SHEPARD_NETWORK=server
- SHEPARD_DIR=$HOME/shepard_dir;;
- big) # CONVEX specific definitions
- SHEPARD_SCRIPTS='bmin31lv bmin31mv amber smapps ampac spartan dspace'
- SHEPARD_NETWORK=server
- SHEPARD_DIR=$HOME/shepard_dir;;
- esac
-
- [LISTING TWELVE]
-
- # bmin31lv script for large vector (CONVEX) version of Batchmin v 3.1
- exe=bmin31lv # the executable (in PATH)
- inp=.com # extension for standard input
- log=.log # extension for standard output
- err=.err # extension for error output (channel 2)
- getdata_script=bmin31lv.getdata # get the input datafiles
- putdata_script=bmin31lv.putdata # put the output back
- terminate_script=bmin31lv.terminate # application-specific shutdown
- probe_script=bmin31lv.probe # conducts an application-specific probe
-
- [LISTING THIRTEEN]
-
- # bmin31lv.getdata: get data script. Sourced in shepard_exec
- # shell args: 2: dataset, 3:origin, 4: datadir
- # datadir is dependent on network choice:
- # server: host-$HOME/datadir (host-$HOME is prepended)
- # nfs: nfs path of data from host to origin machines
- # remote: origin-$HOME/datadir (origin-$HOME is prepended)
-
- case $SHEPARD_NETWORK in
- server) cd $4;; # data stays put on host machine
- remote) rsh $3 cat $4/$2.dat >$2.dat # move data. This is a kluge
- rsh $3 cat $4/$2$inp >$2$inp;; # remote cat puts output on host
- nfs) cp $4/$2.dat . # copy via remotely mounted nfs dir
- cp $4/$2$inp . ;;
- esac
-
- [LISTING FOURTEEN]
-
-
- # bmin31lv.putdata: put data script. Sourced in shepard_exec
- # shell args: 2: dataset, 3:origin, 4: datadir
- # datadir is dependent on network choice:
- # server: host-$HOME/datadir (host-$HOME is prepended)
- # nfs: nfs path of data from host to origin machines
- # remote: origin-$HOME/datadir (origin-$HOME is prepended)
-
- # all application-specific output files are moved, if necessary
- case $SHEPARD_NETWORK in
- server) cd $HOME;; # movement of files is not necessary
- remote) rsh $3 cat ">"$4/$2.out <$2.out # another network kluge
- rsh $3 cat ">"$4/$2$log <$2$log # remote cat with ">" writes
- rsh $3 cat ">"$4/$2$err <$2$err;; # to remote. < read local.
- nfs) cp $2.out $4/$2.out
- cp $2$log $4/$2$log
- cp $2$err $4/$2$err;;
- esac
-
- [LISTING FIFTEEN]
-
- # sourced from shepard
- # batchmin terminates when it finds dataset.stp in execution dir
-
- case $SHEPARD_NETWORK in
- server) echo 'help me, please help me' > $4/$2.stp;;
- remote|nfs) echo 'help me, please help me' > $2.stp;;
- esac
-
- [LISTING SIXTEEN]
-
- # Sourced from shepard. $dset=jobname, $ddir=directory, $log=logfile ext
- # Script prints the last 30 lines of the log file from the selected job
-
- case $SHEPARD_NETWORK in
- server) tail -30 $ddir/$dset$log;;
- remote|nfs) tail -30 $dset$log;;
- esac
-
- [LISTING SEVENTEEN]
-
- /* lockon.c - creates lock file from argv[1] having no privilege
- B. E. Bauer, 1990
-
- usage: lockon lockfile
-
- The file is created with mode 0, so a second creat() on the same
- name fails for as long as the file exists; that failure is what
- makes the file act as a lock.
-
- exit status: 0 lock file created (lock obtained)
- 1 lock file could not be created, or wrong argument count
- */
- main (argc, argv)
- int argc;
- char *argv[];
- {
- int fp, locked;
-
- locked = 0;
- /* argv[0] is the program name, so one argument means argc == 2 */
- if (argc != 2) {
- printf ("\nuseage: lockon lockfile\n");
- /* nonzero, so a caller never mistakes misuse for a held lock */
- exit (1);
- }
- if ((fp = creat(argv[1], 0)) < 0) ++locked;
- else close(fp);
- return (locked);
- }
-