home
***
CD-ROM
|
disk
|
FTP
|
other
***
search
/
The World of Computer Software
/
World_Of_Computer_Software-02-385-Vol-1of3.iso
/
c
/
condor40.zip
/
CONDOR
/
src
/
condor_master
/
master.c
< prev
next >
Wrap
C/C++ Source or Header
|
1989-09-14
|
24KB
|
1,036 lines
/*
** Copyright 1986, 1987, 1988, 1989 University of Wisconsin
**
** Permission to use, copy, modify, and distribute this software and its
** documentation for any purpose and without fee is hereby granted,
** provided that the above copyright notice appear in all copies and that
** both that copyright notice and this permission notice appear in
** supporting documentation, and that the name of the University of
** Wisconsin not be used in advertising or publicity pertaining to
** distribution of the software without specific, written prior
** permission. The University of Wisconsin makes no representations about
** the suitability of this software for any purpose. It is provided "as
** is" without express or implied warranty.
**
** THE UNIVERSITY OF WISCONSIN DISCLAIMS ALL WARRANTIES WITH REGARD TO
** THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
** FITNESS. IN NO EVENT SHALL THE UNIVERSITY OF WISCONSIN BE LIABLE FOR
** ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
** WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
** ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
** OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
**
** Authors: Allan Bricker and Michael J. Litzkow,
** University of Wisconsin, Computer Sciences Dept.
**
*/
#include <stdio.h>
#include <signal.h>
#include <errno.h>
#include <pwd.h>
#include <netdb.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <sys/wait.h>
#include <sys/file.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "sched.h"
#include "debug.h"
#include "except.h"
#include "expr.h"
/*
** Fixed-capacity circular queue of long values; tail_log() keeps file
** offsets in it.  The array has one slot more than MAX_LINES because the
** queue routines do their index arithmetic modulo (size + 1).
*/
#define MAX_LINES 100

typedef struct {
	long	data[MAX_LINES + 1];	/* element storage */
	int		first, last;			/* index of oldest element, next slot to fill */
	int		size;					/* capacity handed to init_queue() */
	int		n_elem;					/* number of elements currently held */
} QUEUE;
/* Signal names, indexed by signal number; the table is defined elsewhere. */
extern char	*SigNames[];

/* Routines that return something other than int. */
char	*param();
char	*rindex();
char	*strdup();
char	*prog_name();
long	delete_queue();

/* Signal handlers, declared ahead of main() which installs them. */
int		sigchld_handler();
int		sigalrm_handler();
int		sigint_handler();
int		sighup_handler();
int		sigquit_handler();
int		restart_master();

static char	*_FileName_ = __FILE__;		/* Used by EXCEPT (see except.h) */
/*
** Time units, in seconds.  HOUR is parenthesized so that it behaves as a
** single value inside a larger expression (e.g. "x / HOUR").
*/
#define MINUTE	60
#define HOUR	(60 * MINUTE)

/* Larger of two values.  NOTE: each argument may be evaluated twice. */
#define MAX(a,b) ((a)>(b)?(a):(b))
char	*MyName;					/* argv[0], recorded by main() */

time_t	GetTimeStamp();

/*
** Per-daemon bookkeeping: restart counter, pid (also used as the process
** group id), and the executable's modification time as last seen.
*/
int		KbdD_Restarts, KbdD_Pid;
time_t	KbdD_TimeStamp;

int		SchedD_Restarts, SchedD_Pid;
time_t	SchedD_TimeStamp;

int		StartD_Restarts, StartD_Pid;
time_t	StartD_TimeStamp;

int		Collector_Restarts, Collector_Pid;
time_t	Collector_TimeStamp;

int		Negotiator_Restarts, Negotiator_Pid;
time_t	Negotiator_TimeStamp;

time_t	Master_TimeStamp;			/* modification time of the master's own executable */
int		MasterLockFD;				/* descriptor holding the flock() taken by get_lock() */

char	*CollectorHost;				/* COLLECTOR_HOST */
char	*NegotiatorHost;			/* NEGOTIATOR_HOST */

int		RestartsPerHour;			/* RESTARTS_PER_HOUR */
int		Foreground;					/* -f flag, or MASTER_DEBUG "Foreground" */
int		Termlog;					/* -t flag */

/* Executable path and log file of each program, read by init_params(). */
char	*Master, *MasterLog;
char	*Collector, *Collector_Log;
char	*Negotiator, *Negotiator_Log;
char	*StartD, *Start_Log;
char	*KbdD, *Kbd_Log;
char	*SchedD, *Sched_Log;

char	*CondorAdministrator;		/* CONDOR_ADMIN: recipient of problem mail */
int		NotFlag;					/* -n flag: start_daemon() starts nothing */
int		PublishObituaries;			/* PUBLISH_OBITUARIES */
int		X_runs_here;				/* X_RUNS_HERE: start the KbdD on this host */
int		Lines;						/* OBITUARY_LOG_LENGTH */
/*
** Log the command line synopsis for program "name" and terminate with
** exit status 1.
*/
usage( name )
char	*name;
{
	dprintf( D_ALWAYS, "Usage: %s [-f] [-t] [-n]\n", name );

	exit( 1 );
}
/*
** Cleanup hook (main() installs it as _EXCEPT_Cleanup): stop reacting to
** child exits, then send SIGKILL to every daemon's process group.
*/
DoCleanup()
{
	static int	*daemon_pgrp[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	int		i;

	(void)signal( SIGCHLD, SIG_IGN );

	for( i = 0; i < sizeof(daemon_pgrp) / sizeof(daemon_pgrp[0]); i++ ) {
		do_killpg( *daemon_pgrp[i], SIGKILL );
	}
}
/*
** Entry point of the master.  Must be run as root.
**
** Command line flags (at most three arguments are accepted):
**	-f	increments Foreground; when set, the master does not fork
**	-t	increments Termlog; when set, detach() is not called
**	-n	increments NotFlag; start_daemon() then starts nothing
**
** After reading the configuration the master takes an exclusive lock on
** its log file, installs its signal handlers, starts the daemons, arms
** an interval timer of HOUR / RestartsPerHour seconds, and then sleeps
** in sigpause() forever; all further work is done by the handlers.
*/
main( argc, argv )
int		argc;
char	*argv[];
{
	struct itimerval	timer;
	struct passwd		*pwd, *getpwnam();
	char	**ptr, *startem;
	int		pid;

	MyName = argv[0];

	if( getuid() != 0 ) {
		dprintf( D_ALWAYS, "%s must be run as ROOT\n", MyName );
		exit( 1 );
	}

		/* Run as group condor so we can access log files even if they
		   are remotely mounted with NFS - needed because
		   root = nobody on the remote file system */
	if( (pwd=getpwnam("condor")) == NULL ) {
		EXCEPT( "condor not in passwd file" );
	}
	if( setgid(pwd->pw_gid) < 0 ) {
		EXCEPT( "setgid(%d)", pwd->pw_gid );
	}

	config( MyName, (CONTEXT *)0 );
	init_params();

	if( argc > 4 ) {
		usage( argv[0] );
	}

		/* From here on an EXCEPT also kills whatever daemons exist */
	_EXCEPT_Cleanup = DoCleanup;

	for( ptr=argv+1; *ptr; ptr++ ) {
		if( ptr[0][0] != '-' ) {
			usage( argv[0] );
		}
		switch( ptr[0][1] ) {
			case 'f':
				Foreground++;
				break;
			case 't':
				Termlog++;
				break;
			case 'n':
				NotFlag++;
				break;
			default:
				usage( argv[0] );
		}
	}

	dprintf_config( "MASTER", 2 );

		/* START_DAEMONS missing, or beginning with 'f'/'F': do nothing */
	startem = param("START_DAEMONS");
	if( !startem || *startem == 'f' || *startem == 'F' ) {
		dprintf( D_ALWAYS, "START_DAEMONS flag was set to %s. Exiting.\n",
			startem?startem:"(NULL)");
		exit( 0 );
	}

	if( !Termlog ) {
		detach();
	}

		/* Make sure we are the only copy of condor_master running */
	get_lock( MasterLog );

	if( !Foreground ) {
			/*
			** Continue in the child and let the parent return to its
			** caller.  A failed fork() is reported instead of being
			** mistaken for the parent, which used to make the master
			** vanish silently with exit status 0.
			*/
		pid = fork();
		if( pid < 0 ) {
			EXCEPT( "fork()" );
		}
		if( pid != 0 ) {
			exit( 0 );
		}
	}

	dprintf( D_ALWAYS,"*************************************************\n" );
	dprintf( D_ALWAYS,"*** CONDOR_MASTER STARTING UP ***\n" );
	dprintf( D_ALWAYS,"*** PID = %-6d ***\n",
		getpid() );
	dprintf( D_ALWAYS,"*************************************************\n" );

	if( signal(SIGALRM,sigalrm_handler) < 0 ) {
		EXCEPT( "signal(SIGALRM,0x%x)", sigalrm_handler );
	}
	if( signal(SIGCHLD,sigchld_handler) < 0 ) {
		EXCEPT( "signal(SIGCHLD,0x%x)", sigchld_handler );
	}
	if( signal(SIGINT,sigint_handler) < 0 ) {
		EXCEPT( "signal(SIGINT,0x%x)", sigint_handler );
	}
	if( signal(SIGQUIT,sigquit_handler) < 0 ) {
		EXCEPT( "signal(SIGQUIT,0x%x)", sigquit_handler );
	}
	if( signal(SIGHUP,sighup_handler) < 0 ) {
		EXCEPT( "signal(SIGHUP,0x%x)", sighup_handler );
	}
	if( signal(SIGUSR1,restart_master) < 0 ) {
		EXCEPT( "signal(SIGUSR1,0x%x)", restart_master );
	}

	start_all_daemons();

		/* Periodic SIGALRM; see sigalrm_handler() for what each tick does */
	timer.it_interval.tv_sec = HOUR / RestartsPerHour;
	timer.it_interval.tv_usec = 0;
	timer.it_value = timer.it_interval;
	if( setitimer(ITIMER_REAL,&timer,(struct itimerval *)0) < 0 ) {
		EXCEPT( "setitimer(ITIMER_REAL,0x%x,0)", &timer );
	}

	for(;;) {
		sigpause( 0 );
	}
}
/*
** SIGALRM handler, driven by the interval timer armed in main().
**
** Each tick lowers every daemon's restart counter by one (never below
** zero) and then looks for executables whose modification time changed:
**   - a changed master executable triggers restart_master();
**   - a changed daemon executable gets the daemon's process group killed
**     and its restart counter cleared.
**
** A daemon is only examined while the master has a pid recorded for it.
** Without that guard a daemon that was never started here (for example
** the KbdD when X_RUNS_HERE is not set) was reported as "modified" and
** "killed" on the first tick, because its time stamp had never been
** recorded.
*/
sigalrm_handler()
{
	KbdD_Restarts = MAX(0,KbdD_Restarts-1);
	SchedD_Restarts = MAX(0,SchedD_Restarts-1);
	StartD_Restarts = MAX(0,StartD_Restarts-1);
	Collector_Restarts = MAX(0,Collector_Restarts-1);
	Negotiator_Restarts = MAX(0,Negotiator_Restarts-1);

	if( NewExecutable(Master, &Master_TimeStamp) ) {
		restart_master();
	}

	if( Collector_Pid && NewExecutable(Collector, &Collector_TimeStamp) ) {
		dprintf(D_ALWAYS, "Collector was modified. Killing %s\n", Collector);
		do_killpg( Collector_Pid, SIGKILL );
		Collector_Restarts = 0;
	}
	if( Negotiator_Pid && NewExecutable(Negotiator, &Negotiator_TimeStamp) ) {
		dprintf(D_ALWAYS, "Negotiator was modified. Killing %s\n", Negotiator);
		do_killpg( Negotiator_Pid, SIGKILL );
		Negotiator_Restarts = 0;
	}
	if( KbdD_Pid && NewExecutable(KbdD, &KbdD_TimeStamp) ) {
		dprintf(D_ALWAYS, "KbdD was modified. Killing %s\n", KbdD);
		do_killpg( KbdD_Pid, SIGKILL );
		KbdD_Restarts = 0;
	}
	if( SchedD_Pid && NewExecutable(SchedD, &SchedD_TimeStamp) ) {
		dprintf(D_ALWAYS, "SchedD was modified. Killing %s\n", SchedD);
		do_killpg( SchedD_Pid, SIGKILL );
		SchedD_Restarts = 0;
	}
	if( StartD_Pid && NewExecutable(StartD, &StartD_TimeStamp) ) {
		dprintf(D_ALWAYS, "StartD was modified. Killing %s\n", StartD);
		do_killpg( StartD_Pid, SIGKILL );
		StartD_Restarts = 0;
	}
}
/*
** Replace the running master with a fresh copy of its executable.
** Called from sigalrm_handler() when the master's executable changed,
** and installed by main() as the SIGUSR1 handler.
**
** Kills every running daemon's process group, waits until no children
** are left, releases and closes the lock file, and finally exec's the
** program named by Master.  Does not return on success; a failed exec
** raises an exception.
*/
restart_master()
{
	int		pid;

	dprintf(D_ALWAYS, "RESTARTING MASTER (new executable)\n");

		/* The children are reaped below, not by sigchld_handler() */
	(void)signal( SIGCHLD, SIG_IGN );

	if( Collector_Pid ) {
		do_killpg( Collector_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed Collector pid = %d\n", Collector_Pid );
	}
	if( Negotiator_Pid ) {
		do_killpg( Negotiator_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed Negotiator pid = %d\n", Negotiator_Pid );
	}
	if( StartD_Pid ) {
		do_killpg( StartD_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed StartD pid = %d\n", StartD_Pid );
	}
	if( KbdD_Pid ) {
		do_killpg( KbdD_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed KbdD pid = %d\n", KbdD_Pid );
	}
	if( SchedD_Pid ) {
		do_killpg( SchedD_Pid, SIGKILL );
		dprintf(D_ALWAYS, "Killed SchedD pid = %d\n", SchedD_Pid );
	}

		/* Wait until all children die */
	for(;;) {
		pid = wait( (union wait*)0 );
		dprintf( D_ALWAYS, "Wait() returns pid %d\n", pid );
		if( pid < 0 ) {
			if( errno == ECHILD ) {
				break;
			} else {
				EXCEPT( "wait( 0 )" );
			}
		}
	}
	dprintf( D_ALWAYS, "Done waiting for all children\n" );

		/* Give up the lock so the new master can take it in get_lock() */
	if( flock(MasterLockFD,LOCK_UN) < 0 ) {
		dprintf( D_ALWAYS, "Can't remove lock on \"%s\"\n", MasterLog );
		EXCEPT( "flock(%d,0%o)", MasterLockFD, LOCK_UN );
	}
	dprintf( D_ALWAYS, "Unlocked file descriptor %d\n", MasterLockFD );
	(void)close( MasterLockFD );
	dprintf( D_ALWAYS, "Closed file descriptor %d\n", MasterLockFD );

	dprintf( D_ALWAYS, "Doing exec( \"%s\", \"condor_master\", 0 )\n", Master );

		/*
		** The argument list of a variadic exec call must end in a null
		** POINTER; a bare 0 is an int and is not guaranteed to be passed
		** the same way.
		*/
	(void)execl(Master, "condor_master", (char *)0);
	EXCEPT("execl(%s, condor_master, 0)", Master);
}
/*
** True if p equals the recorded pid of any daemon run by the master.
** The argument is parenthesized so that an arbitrary expression may be
** passed; note that it can be evaluated up to five times.
*/
#define IS_DAEMON(p) ((p)==SchedD_Pid||(p)==StartD_Pid|| \
	(p)==Collector_Pid||(p)==Negotiator_Pid||(p)==KbdD_Pid)
sigchld_handler()
{
int pid = 0;
union wait status;
while( (pid=wait3(&status,WNOHANG,(struct rusage *)0)) != 0 ) {
if( pid == -1 ) {
EXCEPT( "wait3(0x%x,WNOHANG,0) returns %d", &status, pid );
}
if( WIFSTOPPED(status) ) {
continue;
}
if( !IS_DAEMON(pid) ) {
continue;
}
if( status.w_termsig != SIGKILL && PublishObituaries ) {
obituary( pid, &status );
}
restart( pid );
dprintf( D_ALWAYS | D_NOHEADER, "\n" );
}
}
/*
** Map a daemon's pid to the name of that daemon's log file.  Returns the
** string "Unknown Program!!!" if the pid matches no daemon.
*/
char *
prog_log( pid )
int		pid;
{
	char	*log = "Unknown Program!!!";

	if( pid == StartD_Pid ) {
		log = Start_Log;
	} else if( pid == KbdD_Pid ) {
		log = Kbd_Log;
	} else if( pid == SchedD_Pid ) {
		log = Sched_Log;
	} else if( pid == Collector_Pid ) {
		log = Collector_Log;
	} else if( pid == Negotiator_Pid ) {
		log = Negotiator_Log;
	}
	return log;
}
/*
** Map a daemon's pid to the path of that daemon's executable.  Returns
** the string "Unknown Program!!!" if the pid matches no daemon.
*/
char *
prog_name( pid )
int		pid;
{
	return	pid == StartD_Pid		? StartD :
			pid == KbdD_Pid			? KbdD :
			pid == SchedD_Pid		? SchedD :
			pid == Collector_Pid	? Collector :
			pid == Negotiator_Pid	? Negotiator :
									  "Unknown Program!!!";
}
/*
** Bring a dead daemon back.  "pid" is the process that just died.  The
** rest of its process group is killed, the daemon's restart counter is
** bumped, and the daemon is started again with its new pid recorded.
** When the counter exceeds RestartsPerHour, give_up() is called instead
** (it terminates the master).  A pid that belongs to no daemon is only
** logged.
*/
restart( pid )
int		pid;
{
	char	*died_fmt;		/* log message for this daemon */
	char	*path;			/* executable to start again */
	int		*restarts;		/* the daemon's restart counter */
	int		*pid_slot;		/* where the daemon's pid is kept */

	if( pid == KbdD_Pid ) {
		died_fmt = "The KbdD (process %d) died\n";
		path = KbdD;
		restarts = &KbdD_Restarts;
		pid_slot = &KbdD_Pid;
	} else if( pid == SchedD_Pid ) {
		died_fmt = "The SchedD (process %d) died\n";
		path = SchedD;
		restarts = &SchedD_Restarts;
		pid_slot = &SchedD_Pid;
	} else if( pid == StartD_Pid ) {
		died_fmt = "The StartD (process %d ) died\n";
		path = StartD;
		restarts = &StartD_Restarts;
		pid_slot = &StartD_Pid;
	} else if( pid == Collector_Pid ) {
		died_fmt = "The Collector (process %d ) died\n";
		path = Collector;
		restarts = &Collector_Restarts;
		pid_slot = &Collector_Pid;
	} else if( pid == Negotiator_Pid ) {
		died_fmt = "The Negotiator (process %d ) died\n";
		path = Negotiator;
		restarts = &Negotiator_Restarts;
		pid_slot = &Negotiator_Pid;
	} else {
		dprintf( D_ALWAYS, "Child %d died, but not a daemon -- Ignored\n", pid);
		return;
	}

	dprintf( D_ALWAYS, died_fmt, pid );
	do_killpg( pid, SIGKILL ) ;
	if( ++*restarts > RestartsPerHour ) {
		give_up( path );
	}
	*pid_slot = start_daemon( path );
}
/*
** Empty stub; nothing in this file calls it.
** NOTE(review): presumably present to satisfy a reference from a library
** the master is linked with -- confirm.
*/
SetSyscalls()
{
}
init_params()
{
char *tmp;
if( (Master = param("MASTER")) == NULL ) {
EXCEPT( "MASTER not specified in config file" );
}
if( (MasterLog = param("MASTER_LOG")) == NULL ) {
EXCEPT( "MASTER_LOG not specified in config file" );
}
if( (CollectorHost = param("COLLECTOR_HOST")) == NULL ) {
EXCEPT( "COLLECTOR_HOST not specified in config file" );
}
if( (NegotiatorHost = param("NEGOTIATOR_HOST")) == NULL ) {
EXCEPT( "NEGOTIATOR_HOST not specified in config file" );
}
if( (Collector = param("COLLECTOR")) == NULL ) {
EXCEPT( "COLLECTOR not specified in config file" );
}
if( (Negotiator = param("NEGOTIATOR")) == NULL ) {
EXCEPT( "NEGOTIATOR not specified in config file" );
}
if( (Collector_Log = param("COLLECTOR_LOG")) == NULL ) {
EXCEPT( "COLLECTOR_LOG not specified in config file" );
}
if( (Negotiator_Log = param("NEGOTIATOR_LOG")) == NULL ) {
EXCEPT( "NEGOTIATOR_LOG not specified in config file" );
}
if( (StartD = param("STARTD")) == NULL ) {
EXCEPT( "STARTD not specified in config file" );
}
if( (Start_Log = param("STARTD_LOG")) == NULL ) {
EXCEPT( "STARTD_LOG not specified in config file" );
}
if( (KbdD = param("KBDD")) == NULL ) {
EXCEPT( "KBDD not specified in config file" );
}
if( (Kbd_Log = param("KBDD_LOG")) == NULL ) {
EXCEPT( "KBDD_LOG not specified in config file" );
}
if( (SchedD = param("SCHEDD")) == NULL ) {
EXCEPT( "SCHEDD not specified in config file" );
}
if( (Sched_Log = param("SCHEDD_LOG")) == NULL ) {
EXCEPT( "SCHEDD_LOG not specified in config file" );
}
if( (CondorAdministrator = param("CONDOR_ADMIN")) == NULL ) {
EXCEPT( "CONDOR_ADMIN not specified in config file" );
}
tmp = param("X_RUNS_HERE");
if( tmp && (*tmp == 't' || *tmp == 'T') ) {
X_runs_here = TRUE;
} else {
X_runs_here = FALSE;
}
tmp = param("PUBLISH_OBITUARIES");
if( tmp && (*tmp == 't' || *tmp == 'T') ) {
PublishObituaries = TRUE;
} else {
PublishObituaries = FALSE;
}
tmp = param("OBITUARY_LOG_LENGTH");
if( tmp == NULL ) {
Lines = 20;
} else {
Lines = atoi( tmp );
}
tmp = param( "RESTARTS_PER_HOUR" );
if( tmp == NULL ) {
RestartsPerHour = 4;
} else {
RestartsPerHour = atoi( tmp );
}
if( param("MASTER_DEBUG") ) {
if( boolean("MASTER_DEBUG","Foreground") ) {
Foreground++;
}
}
}
/*
** Start the daemon whose executable is "pathname" and return its pid.
**
** The child puts itself into a new process group whose id equals its own
** pid (so the master can later signal the whole group with killpg) and
** exec's the program with the single argument "-f"; argv[0] is the last
** component of the path.
**
** Returns 0 without starting anything when NotFlag is set.  Raises an
** exception if the file is not executable or if vfork/setpgrp/execl
** fail.
*/
start_daemon( pathname )
char	*pathname;
{
	int		pid;
	char	*shortname;

	if( NotFlag ) {
		dprintf( D_ALWAYS, "NOT Starting \"%s\"\n", pathname );
		return 0;
	}

	if( shortname = rindex(pathname,'/') ) {
		shortname += 1;
	} else {
		shortname = pathname;
	}

	if( access(pathname,X_OK) != 0 ) {
			/* The format needs the path; it used to be omitted */
		EXCEPT( "%s: Cannot execute", pathname );
	}

	if( (pid = vfork()) < 0 ) {
		EXCEPT( "vfork()" );
	}

	if( pid == 0 ) {	/* The child */
		pid = getpid();
		if( setpgrp(0,pid) < 0 ) {
			EXCEPT( "setpgrp(0,%d)", pid );
		}
			/*
			** The argument list of a variadic exec call must end in a
			** null POINTER; a bare 0 is an int and is not guaranteed to
			** be passed the same way.
			*/
		(void)execl( pathname, shortname, "-f", (char *)0 );
		EXCEPT( "execl( %s, %s, -f, 0 )", pathname, shortname );
#ifdef LINT
		return 0;
#endif /* LINT */
	} else {			/* The parent */
		dprintf( D_ALWAYS, "Started \"%s\", pid and pgroup = %d\n",
			shortname, pid );
		return pid;
	}
}
/*
** Return non-zero if the collector is configured to run on this machine,
** i.e. if the official host name of the local host equals the official
** host name of CollectorHost.  Raises an exception if either name cannot
** be determined.
**
** No heap memory is used: earlier code strdup()'ed both names and never
** freed them, leaking on every call.
*/
collector_runs_here()
{
	char	hostname[512];
	char	my_host_name[512];
	struct hostent	*hp, *gethostbyname();

		/* Get the "official" name of our own host */
	if( gethostname(hostname,sizeof(hostname)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", hostname, sizeof(hostname) );
	}
	if( (hp=gethostbyname(hostname)) == NULL ) {
		EXCEPT( "gethostbyname(%s)", hostname );
	}
		/*
		** gethostbyname() answers in static storage that the next call
		** overwrites, so keep a private (length-bounded) copy.
		*/
	(void)sprintf( my_host_name, "%.*s",
					(int)sizeof(my_host_name) - 1, hp->h_name );

		/* Get the "official" name of the collector host */
	if( (hp=gethostbyname(CollectorHost)) == NULL ) {
		EXCEPT( "gethostbyname(%s)", CollectorHost );
	}

	return strcmp(my_host_name,hp->h_name) == MATCH;
}
/*
** Return non-zero if the negotiator is configured to run on this machine,
** i.e. if the official host name of the local host equals the official
** host name of NegotiatorHost.  Raises an exception if either name
** cannot be determined.
**
** No heap memory is used: earlier code strdup()'ed both names and never
** freed them, leaking on every call.
*/
negotiator_runs_here()
{
	char	hostname[512];
	char	my_host_name[512];
	struct hostent	*hp, *gethostbyname();

		/* Get the "official" name of our own host */
	if( gethostname(hostname,sizeof(hostname)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", hostname, sizeof(hostname) );
	}
	if( (hp=gethostbyname(hostname)) == NULL ) {
		EXCEPT( "gethostbyname(%s)", hostname );
	}
		/*
		** gethostbyname() answers in static storage that the next call
		** overwrites, so keep a private (length-bounded) copy.
		*/
	(void)sprintf( my_host_name, "%.*s",
					(int)sizeof(my_host_name) - 1, hp->h_name );

		/* Get the "official" name of the negotiator host */
	if( (hp=gethostbyname(NegotiatorHost)) == NULL ) {
		EXCEPT( "gethostbyname(%s)", NegotiatorHost );
	}

	return strcmp(my_host_name,hp->h_name) == MATCH;
}
/*
** Mail a death notice for daemon "pid" to the CONDOR administrator.
** "status" is the exit status collected by the SIGCHLD handler.  The
** message says how the daemon ended (signal and whether a core was
** produced, or exit status) and is followed by the last "Lines" lines of
** the daemon's log file.
**
** Nothing is sent when the daemon's restart counter is already above 1.
** Raises an exception if pid is not one of the daemons or if the mailer
** cannot be started.
*/
obituary( pid, status )
int			pid;
union wait	*status;
{
	char	cmd[512];
	char	hostname[512];
	FILE	*mailer, *popen();
	char	*name, *log;

		/* If daemon with a serious bug gets installed, we may end up
		** doing many restarts in rapid succession. In that case, we
		** don't want to send repeated mail to the CONDOR administrator.
		** This could overwhelm the administrator's machine.
		*/
	if( pid == KbdD_Pid ) {
		if( KbdD_Restarts > 1 ) {
			return;
		}
	} else if( pid == SchedD_Pid ) {
		if( SchedD_Restarts > 1 ) {
			return;
		}
	} else if( pid == StartD_Pid ) {
		if( StartD_Restarts > 1 ) {
			return;
		}
	} else if( pid == Collector_Pid ) {
		if( Collector_Restarts > 1 ) {
			return;
		}
	} else if( pid == Negotiator_Pid ) {
		if( Negotiator_Restarts > 1 ) {
			return;
		}
	} else {
		EXCEPT( "Pid %d returned by wait3(), but not a child\n", pid );
	}

	name = prog_name( pid );
	log = prog_log( pid );

	dprintf( D_ALWAYS, "Sending obituary for \"%s\" to \"%s\"\n",
		name, CondorAdministrator );

	if( gethostname(hostname,sizeof(hostname)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", hostname, sizeof(hostname) );
	}

	(void)sprintf( cmd, "/bin/mail %s", CondorAdministrator );
	if( (mailer=popen(cmd,"w")) == NULL ) {
		EXCEPT( "popen(\"%s\",\"w\")", cmd );
	}

	fprintf( mailer, "To: %s\n", CondorAdministrator );
	fprintf( mailer, "Subject: CONDOR Problem\n" );
	fprintf( mailer, "\n" );

	if( status->w_termsig ) {
		fprintf( mailer, "\"%s\" on \"%s\" died due to signal %d\n",
			name, hostname, status->w_termsig );
		fprintf( mailer, "(%s core was produced)\n",
			status->w_coredump ? "a" : "no" );
	} else {
			/* message used to read "staus" */
		fprintf( mailer,
			"\"%s\" on \"%s\" exited with status %d\n",
			name, hostname, status->w_retcode );
	}
	tail_log( mailer, log, Lines );

		/* Don't do a pclose here, it wait()'s, and may steal an
		** exit notification of one of our daemons. Instead we'll clean
		** up popen's child in our SIGCHLD handler.
		*/
	(void)fclose( mailer );
}
/*
** Copy the last "lines" non-empty lines of "file" to the stream
** "output".  The file is scanned once while a queue remembers the
** offsets of the most recent line starts; those lines are then copied
** oldest first.  If the file cannot be opened a message is written to
** stderr and nothing is copied.
*/
tail_log( output, file, lines )
FILE	*output;
char	*file;
int		lines;
{
	FILE	*log_fp;
	QUEUE	line_starts;
	long	ftell();
	int		cur, prev;

	log_fp = fopen( file, "r" );
	if( log_fp == NULL ) {
		fprintf( stderr, "Can't open %s\n", file );
		return;
	}

	init_queue( &line_starts, lines );

		/* A line starts at any non-newline character that follows a newline */
	for( prev = '\n'; (cur=getc(log_fp)) != EOF; prev = cur ) {
		if( prev == '\n' && cur != '\n' ) {
			insert_queue( &line_starts, ftell(log_fp) - 1 );
		}
	}

	while( !empty_queue( &line_starts ) ) {
		display_line( delete_queue( &line_starts ), log_fp, output );
	}

	(void)fclose( log_fp );
}
/*
** Copy one line from "input", beginning at byte offset "loc", to
** "output".  Copying stops after the terminating newline has been
** written, or at end of file if the line has no newline.  The EOF
** marker itself is never written (it used to be passed to putc(), which
** appended a stray byte to an unterminated last line).
**
** Always returns 0; callers ignore the value.
*/
int
display_line( loc, input, output )
long	loc;
FILE	*input;
FILE	*output;
{
	int		ch;

	(void)fseek( input, loc, 0 );

	while( (ch=getc(input)) != EOF ) {
		(void)putc( ch, output );
		if( ch == '\n' ) {
			break;
		}
	}
	return 0;
}
init_queue( queue, size )
QUEUE *queue;
{
queue->first = 0;
queue->last = 0;
queue->size = size;
queue->n_elem = 0;
}
/*
** Append "elem" to the queue.  When the queue already holds "size"
** elements the oldest one is dropped to make room.
*/
insert_queue( queue, elem )
QUEUE	*queue;
long	elem;
{
	int		ring = queue->size + 1;		/* indices wrap modulo this */

	if( queue->n_elem != queue->size ) {
		queue->n_elem += 1;
	} else {
		queue->first = (queue->first + 1) % ring;
	}

	queue->data[queue->last] = elem;
	queue->last = (queue->last + 1) % ring;
}
/*
** Remove the oldest element from the queue and return it.  The queue is
** not checked for being empty; callers test empty_queue() first.
*/
long
delete_queue( queue )
QUEUE	*queue;
{
	int		head = queue->first;

	queue->first = (head + 1) % (queue->size + 1);
	queue->n_elem -= 1;

	return queue->data[head];
}
/*
** Return non-zero if the queue holds no elements, i.e. if the index of
** the oldest element has caught up with the index of the next free slot.
*/
empty_queue( queue )
QUEUE	*queue;
{
	return queue->last == queue->first;
}
/*
** Last resort when daemon "name" has been restarted more often than
** RestartsPerHour allows: mail a call for help to the CONDOR
** administrator, kill every daemon's process group, and terminate the
** master with exit status 1.  Does not return.
*/
give_up( name )
char	*name;
{
	char	hostname[512];
	char	cmd[512];
	FILE	*mailer, *popen();

	dprintf( D_ALWAYS, "Exceeded %d restarts / hour on \"%s\"\n",
		RestartsPerHour, name );
	dprintf( D_ALWAYS, "Sending mail to \"%s\"\n", CondorAdministrator );

	if( gethostname(hostname,sizeof(hostname)) < 0 ) {
		EXCEPT( "gethostname(0x%x,%d)", hostname, sizeof(hostname) );
	}

	(void)sprintf( cmd, "/bin/mail %s", CondorAdministrator );
	mailer = popen( cmd, "w" );
	if( mailer == NULL ) {
		EXCEPT( "popen(\"%s\",\"w\")", cmd );
	}

		/* Header, blank separator line, then the message itself */
	fprintf( mailer, "To: %s\n", CondorAdministrator );
	fputs( "Subject: CONDOR Problem\n", mailer );
	fputs( "\n", mailer );
	fputs( "HELP!\n\n", mailer );
	fprintf( mailer,
		"The CONDOR_DaemonMaster on [%s] has exceeded %d restarts/hour for [%s]\n",
		hostname, RestartsPerHour, name);

		/* We are about to exit, so waiting for the mailer here is fine */
	(void)pclose( mailer );

	(void)signal( SIGCHLD, SIG_IGN );

	do_killpg( Collector_Pid, SIGKILL );
	do_killpg( Negotiator_Pid, SIGKILL );
	do_killpg( StartD_Pid, SIGKILL );
	do_killpg( KbdD_Pid, SIGKILL );
	do_killpg( SchedD_Pid, SIGKILL );

	dprintf( D_ALWAYS, "*** E X I T I N G ***\n\n" );
	exit( 1 );
}
/*
** Open "file_name" and take an exclusive, non-blocking flock() on it,
** leaving the descriptor in MasterLockFD.  Raises an exception if the
** file cannot be opened or the lock cannot be obtained.
*/
get_lock( file_name )
char	*file_name;
{
	MasterLockFD = open( file_name, 0, 0 );
	if( MasterLockFD < 0 ) {
		EXCEPT( "open(%s,0,0)", file_name );
	}

	if( flock(MasterLockFD,LOCK_EX|LOCK_NB) >= 0 ) {
		return;
	}

	dprintf( D_ALWAYS, "Can't get lock on file \"%s\"\n", file_name );
	EXCEPT( "flock(%d,0%o)", MasterLockFD, LOCK_EX | LOCK_NB );
}
/*
** Send signal "sig" to process group "pgrp" and log it.  A process group
** of 0 means "no such daemon" and is ignored.  A signal number outside
** 0..NSIG-1 raises an exception.  The result of killpg() is not checked.
*/
do_killpg( pgrp, sig )
int		pgrp;
int		sig;
{
	if( pgrp == 0 ) {
		return;
	}

	if( !(sig >= 0 && sig < NSIG) ) {
		EXCEPT( "Unknown signal (%d)", sig );
	}

	(void) killpg( pgrp, sig );

	dprintf( D_ALWAYS, "Sent %s to process group %d\n", SigNames[sig], pgrp );
}
/*
** Send signal "sig" to the single process "pid" and log it.  A pid of 0
** means "no such daemon" and is ignored.  Unlike do_killpg(), a failing
** kill() raises an exception.
*/
do_kill( pid, sig )
int		pid;
int		sig;
{
	if( pid == 0 ) {
		return;
	}

	if( kill(pid,sig) < 0 ) {
		EXCEPT( "kill(%d,%d)", pid, sig );
	}

	dprintf( D_ALWAYS, "Sent %s to process %d\n", SigNames[sig], pid );
}
/*
** SIGHUP handler: read the configuration again, then pass the SIGHUP on
** to every running daemon so that each of them does the same.
*/
sighup_handler()
{
	static int	*daemon_pid[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	int		i;

	dprintf( D_ALWAYS, "Re reading config file\n" );
	config( MyName, (CONTEXT *)0 );
	init_params();

	for( i = 0; i < sizeof(daemon_pid) / sizeof(daemon_pid[0]); i++ ) {
		do_kill( *daemon_pid[i], SIGHUP );
	}

	dprintf( D_ALWAYS | D_NOHEADER, "\n" );
}
/*
** SIGINT handler: kill every daemon and start them all again.
**
** SIGCHLD is ignored while the old daemons are being killed so that
** sigchld_handler() does not try to restart them; the handler is put
** back after a fixed five second pause, just before the new daemons
** are started.
*/
sigint_handler()
{
	static int	*daemon_pgrp[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid,
		(int *)0
	};
	int		**pp;

	dprintf( D_ALWAYS, "Killing all daemons\n" );
	(void)signal( SIGCHLD, SIG_IGN );
	for( pp = daemon_pgrp; *pp; pp++ ) {
		do_killpg( **pp, SIGKILL );
	}

	dprintf( D_ALWAYS, "Restarting all daemons\n" );
	sleep( 5 );	/* NOT a good way to do this... */

	if( signal(SIGCHLD,sigchld_handler) < 0 ) {
		EXCEPT( "signal(SIGCHLD,0x%x)", sigchld_handler );
	}
	start_all_daemons();
}
/*
** SIGQUIT handler: kill every daemon's process group, log the reason,
** and terminate the master with exit status 0.
*/
sigquit_handler()
{
	static int	*daemon_pgrp[] = {
		&Collector_Pid, &Negotiator_Pid, &StartD_Pid, &KbdD_Pid, &SchedD_Pid
	};
	int		i;

		/* The daemons are meant to stay dead: no restarts from here on */
	(void)signal( SIGCHLD, SIG_IGN );

	for( i = 0; i < sizeof(daemon_pgrp) / sizeof(daemon_pgrp[0]); i++ ) {
		do_killpg( *daemon_pgrp[i], SIGKILL );
	}

	dprintf( D_ALWAYS, "Killed by SIGQUIT\n" );
	exit( 0 );
}
/*
** Start every daemon that belongs on this machine and remember the
** modification time of each executable, as well as of the master's own,
** so that sigalrm_handler() can later notice newly installed versions.
**
** The collector and the negotiator are started only on the hosts
** configured for them, the KbdD only where X_runs_here is set; the
** StartD and the SchedD are started everywhere.
*/
start_all_daemons()
{
	Master_TimeStamp = GetTimeStamp( Master );

		/* Daemons that run on selected hosts only */
	if( collector_runs_here() ) {
		Collector_Pid = start_daemon( Collector );
		Collector_TimeStamp = GetTimeStamp( Collector );
	}

	if( negotiator_runs_here() ) {
		Negotiator_Pid = start_daemon( Negotiator );
		Negotiator_TimeStamp = GetTimeStamp( Negotiator );
	}

	if( X_runs_here ) {
		KbdD_Pid = start_daemon( KbdD );
		KbdD_TimeStamp = GetTimeStamp( KbdD );
	}

		/* Daemons that run everywhere */
	StartD_Pid = start_daemon( StartD );
	StartD_TimeStamp = GetTimeStamp( StartD );

	SchedD_Pid = start_daemon( SchedD );
	SchedD_TimeStamp = GetTimeStamp( SchedD );

	dprintf( D_ALWAYS | D_NOHEADER, "\n" );
}
/*
** Return the modification time of "file", or (time_t) -1 if the file
** cannot be stat'ed.
*/
time_t
GetTimeStamp(file)
char	*file;
{
	struct stat	info;

	return( stat(file, &info) < 0 ? (time_t) -1 : info.st_mtime );
}
/*
** Tell whether "file" has been modified since the time recorded in
** *tsp.  If so, *tsp is brought up to date and TRUE is returned;
** otherwise FALSE.
**
** A file that cannot be stat'ed counts as unchanged: a new version may
** be in the middle of being installed, and the change will be caught on
** a later call.
*/
NewExecutable(file, tsp)
char	*file;
time_t	*tsp;
{
	time_t	seen = GetTimeStamp(file);

	if( seen == (time_t) -1 || seen == *tsp ) {
		return( FALSE );
	}

	*tsp = seen;
	return( TRUE );
}