Icinga-core 1.4.0
next gen monitoring
base/checks.c
Go to the documentation of this file.
00001 /*****************************************************************************
00002  *
00003  * CHECKS.C - Service and host check functions for Icinga
00004  *
00005  * Copyright (c) 1999-2010 Ethan Galstad (egalstad@nagios.org)
00006  * Copyright (c) 2009-2011 Nagios Core Development Team and Community Contributors
00007  * Copyright (c) 2009-2011 Icinga Development Team (http://www.icinga.org)
00008  *
00009  * License:
00010  *
00011  * This program is free software; you can redistribute it and/or modify
00012  * it under the terms of the GNU General Public License version 2 as
00013  * published by the Free Software Foundation.
00014  *
00015  * This program is distributed in the hope that it will be useful,
00016  * but WITHOUT ANY WARRANTY; without even the implied warranty of
00017  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00018  * GNU General Public License for more details.
00019  *
00020  * You should have received a copy of the GNU General Public License
00021  * along with this program; if not, write to the Free Software
00022  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
00023  *
00024  *****************************************************************************/
00025 
00026 #include "../include/config.h"
00027 #include "../include/comments.h"
00028 #include "../include/common.h"
00029 #include "../include/statusdata.h"
00030 #include "../include/downtime.h"
00031 #include "../include/macros.h"
00032 #include "../include/icinga.h"
00033 #include "../include/broker.h"
00034 #include "../include/perfdata.h"
00035 
00036 /*#define DEBUG_CHECKS*/
00037 /*#define DEBUG_HOST_CHECKS 1*/
00038 
00039 
00040 #ifdef EMBEDDEDPERL
00041 #include "../include/epn_icinga.h"
00042 #endif
00043 
00044 #ifdef USE_EVENT_BROKER
00045 #include "../include/neberrors.h"
00046 #endif
00047 
00048 extern int      sigshutdown;
00049 extern int      sigrestart;
00050 
00051 extern char     *temp_file;
00052 extern char     *temp_path;
00053 extern char     *check_result_path;
00054 
00055 extern int      interval_length;
00056 
00057 extern int      command_check_interval;
00058 
00059 extern int      log_initial_states;
00060 extern int      log_passive_checks;
00061 
00062 extern int      service_check_timeout;
00063 extern int      host_check_timeout;
00064 
00065 extern int      check_reaper_interval;
00066 extern int      max_check_reaper_time;
00067 
00068 extern int      use_aggressive_host_checking;
00069 extern unsigned long cached_host_check_horizon;
00070 extern unsigned long cached_service_check_horizon;
00071 extern int      enable_predictive_host_dependency_checks;
00072 extern int      enable_predictive_service_dependency_checks;
00073 
00074 extern int      soft_state_dependencies;
00075 
00076 extern int      currently_running_service_checks;
00077 extern int      currently_running_host_checks;
00078 
00079 extern int      accept_passive_service_checks;
00080 extern int      execute_service_checks;
00081 extern int      accept_passive_host_checks;
00082 extern int      execute_host_checks;
00083 extern int      obsess_over_services;
00084 extern int      obsess_over_hosts;
00085 
00086 extern int      translate_passive_host_checks;
00087 extern int      passive_host_checks_are_soft;
00088 
00089 extern int      check_service_freshness;
00090 extern int      check_host_freshness;
00091 extern int      additional_freshness_latency;
00092 
00093 extern int      max_host_check_spread;
00094 extern int      max_service_check_spread;
00095 
00096 extern int      use_large_installation_tweaks;
00097 extern int      free_child_process_memory;
00098 extern int      child_processes_fork_twice;
00099 
00100 extern int      stalking_event_handlers_for_hosts;
00101 extern int      stalking_event_handlers_for_services;
00102 
00103 extern time_t   program_start;
00104 extern time_t   event_start;
00105 
00106 extern timed_event       *event_list_low;
00107 extern timed_event       *event_list_low_tail;
00108 
00109 extern host              *host_list;
00110 extern service           *service_list;
00111 extern servicedependency *servicedependency_list;
00112 extern hostdependency    *hostdependency_list;
00113 
00114 extern unsigned long   next_event_id;
00115 extern unsigned long   next_problem_id;
00116 
00117 extern check_result    check_result_info;
00118 extern check_result    *check_result_list;
00119 
00120 extern pthread_t       worker_threads[TOTAL_WORKER_THREADS];
00121 
00122 extern unsigned long max_debug_file_size;
00123 
00124 #ifdef EMBEDDEDPERL
00125 extern int      use_embedded_perl;
00126 #endif
00127 
00128 int dummy;      /* reduce compiler warnings */
00129 
00130 /******************************************************************/
00131 /********************* MISCELLANEOUS FUNCTIONS ********************/
00132 /******************************************************************/
00133 
00134 /* extract check result */
00135 static void extract_check_result(FILE *fp,dbuf *checkresult_dbuf){
00136         char output_buffer[MAX_INPUT_BUFFER]="";
00137         char *temp_buffer;
00138 
00139         /* initialize buffer */
00140         strcpy(output_buffer,"");
00141 
00142         /* get all lines of plugin output - escape newlines */
00143         while(fgets(output_buffer,sizeof(output_buffer)-1,fp)){
00144                 temp_buffer=escape_newlines(output_buffer);
00145                 dbuf_strcat(checkresult_dbuf,temp_buffer);
00146                 my_free(temp_buffer);
00147         }
00148 }
00149 
00150 /* convert a command line to an array of arguments, suitable for exec* functions */
00151 static int parse_command_line(char *cmd, char *argv[MAX_CMD_ARGS]){
00152         unsigned int argc=0;
00153         char *parsed_cmd;
00154 
00155         /* Skip initial white-space characters. */
00156         for(parsed_cmd=cmd;isspace(*cmd);++cmd)
00157                 ;
00158 
00159         /* Parse command line. */
00160         while(*cmd&&(argc<MAX_CMD_ARGS-1)){
00161                 argv[argc++]=parsed_cmd;
00162 
00163                 switch(*cmd){
00164                         case '\'':
00165                                 while((*cmd)&&(*cmd!='\''))
00166                                         *(parsed_cmd++)=*(cmd++);
00167                                 if(*cmd)
00168                                         ++cmd;
00169                                 break;
00170                         case '"':
00171                                 while((*cmd)&&(*cmd!='"')){
00172                                         if((*cmd=='\\')&&cmd[1]&&strchr("\"\\\n",cmd[1]))
00173                                                 ++cmd;
00174                                         *(parsed_cmd++)=*(cmd++);
00175                                 }
00176                                 if(*cmd)
00177                                         ++cmd;
00178                                 break;
00179                         default:
00180                                 while((*cmd)&&!isspace(*cmd)){
00181                                         if((*cmd=='\\')&&cmd[1])
00182                                                 ++cmd;
00183                                         *(parsed_cmd++)=*(cmd++);
00184                                 }
00185                 }
00186 
00187                 while(isspace(*cmd))
00188                         ++cmd;
00189 
00190                 if(argc>=MAX_CMD_ARGS-1){
00191                         logit(NSLOG_RUNTIME_WARNING,TRUE,"overlimit args for command %s\n",argv[0]);
00192                         _exit(STATE_UNKNOWN);
00193                 }
00194                 else
00195                         *(parsed_cmd++)='\0';
00196         }
00197 
00198         argv[argc]=NULL;
00199 
00200         return OK;
00201 }
00202 
00203 /* run a check */
00204 static int run_check(char *processed_command,dbuf *checkresult_dbuf){
00205         char *argv[MAX_CMD_ARGS];
00206         FILE *fp;
00207         pid_t pid;
00208         int pipefds[2];
00209         int retval;
00210 
00211         /* check for check execution method (shell or execvp) */
00212         if(!has_shell_metachars(processed_command)){
00213 
00214                 if(pipe(pipefds)<0){
00215                         logit(NSLOG_RUNTIME_WARNING,TRUE,"error creating pipe: %s\n", strerror(errno));
00216                         _exit(STATE_UNKNOWN);
00217                 }
00218                 if((pid=fork())<0){
00219                         logit(NSLOG_RUNTIME_WARNING,TRUE,"fork error\n");
00220                         _exit(STATE_UNKNOWN);
00221                 }
00222                 else if(!pid){
00223                         /* child replaces stdout/stderr with output of the pipe */
00224                         if((dup2(pipefds[1],STDOUT_FILENO)<0)||(dup2(pipefds[1],STDERR_FILENO)<0)){
00225                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"dup2 error\n");
00226                                 _exit(STATE_UNKNOWN);
00227                         }
00228 
00229                         /* close unused half of pipe */
00230                         close(pipefds[1]);
00231 
00232                         /* extract command args for execv */
00233                         parse_command_line(processed_command,argv);
00234 
00235                         if(!argv[0]){
00236                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"plugin command definition empty\n");
00237                                 _exit(STATE_UNKNOWN);
00238                         }
00239 
00240                         log_debug_info(DEBUGL_CHECKS,0,"running command %s via execvp\n",processed_command);
00241 
00242                         if(execvp(argv[0], argv)<0){ /* execvp only returns in case of an error */
00243                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"error executing command '%s': %s. Make sure that the file actually exists (in PATH, if set) and is executable!\n",processed_command, strerror(errno));
00244                                 _exit(STATE_UNKNOWN);
00245                         }
00246                         _exit(STATE_UNKNOWN);
00247                 }
00248 
00249                 /* prepare pipe reading */
00250                 close(pipefds[1]);
00251                 fp=fdopen(pipefds[0],"r");
00252                 if(!fp){
00253                         logit(NSLOG_RUNTIME_WARNING,TRUE,"fdopen error\n");
00254                         _exit(STATE_UNKNOWN);
00255                 }
00256 
00257                 /* extract check result */
00258                 extract_check_result(fp,checkresult_dbuf);
00259 
00260                 /* close the process */
00261                 fclose(fp);
00262                 close(pipefds[0]);
00263 
00264                 if(waitpid(pid,&retval,0)!=pid)
00265                         retval=-1;
00266         }
00267         else{
00268                 log_debug_info(DEBUGL_CHECKS,0,"running command %s via popen\n",processed_command);
00269                 fp=popen(processed_command,"r");
00270 
00271                 if(fp==NULL)
00272                         _exit(STATE_UNKNOWN);
00273 
00274                 /* extract check result */
00275                 extract_check_result(fp,checkresult_dbuf);
00276 
00277                 /* close the process */
00278                 retval=pclose(fp);
00279         }
00280 
00281         return retval;
00282 }
00283 
00284 
00285 /******************************************************************/
00286 /********************** CHECK REAPER FUNCTIONS ********************/
00287 /******************************************************************/
00288 
00289 /* reaps host and service check results */
00290 int reap_check_results(void){
00291         check_result *queued_check_result=NULL;
00292         service *temp_service=NULL;
00293         host *temp_host=NULL;
00294         time_t current_time=0L;
00295         time_t reaper_start_time=0L;
00296         int reaped_checks=0;
00297 
00298         log_debug_info(DEBUGL_FUNCTIONS,0,"reap_check_results() start\n");
00299         log_debug_info(DEBUGL_CHECKS,0,"Starting to reap check results.\n");
00300 
00301         /* get the start time */
00302         time(&reaper_start_time);
00303 
00304         /* process files in the check result queue */
00305         process_check_result_queue(check_result_path);
00306 
00307         /* read all check results that have come in... */
00308         while((queued_check_result=read_check_result())){
00309 
00310                 reaped_checks++;
00311 
00312                 log_debug_info(DEBUGL_CHECKS,2,"Found a check result (#%d) to handle...\n",reaped_checks);
00313 
00314                 /* service check */
00315                 if(queued_check_result->object_check_type==SERVICE_CHECK){
00316 
00317                         /* make sure the service exists */
00318                         if((temp_service=find_service(queued_check_result->host_name,queued_check_result->service_description))==NULL){
00319 
00320                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check result queue contained results for service '%s' on host '%s', but the service could not be found!  Perhaps you forgot to define the service in your config files?\n",queued_check_result->service_description,queued_check_result->host_name);
00321 
00322                                 /* delete the file that contains the check results, as well as the ok-to-go file */
00323                                 delete_check_result_file(queued_check_result->output_file);
00324 
00325                                 /* free memory */
00326                                 free_check_result(queued_check_result);
00327                                 my_free(queued_check_result);
00328 
00329                                 /* TODO - add new service definition automatically */
00330 
00331                                 continue;
00332                         }
00333 
00334                         log_debug_info(DEBUGL_CHECKS,1,"Handling check result for service '%s' on host '%s'...\n",temp_service->description,temp_service->host_name);
00335 
00336                         /* process the check result */
00337                         handle_async_service_check_result(temp_service,queued_check_result);
00338                 }
00339 
00340                 /* host check */
00341                 else{
00342                         if((temp_host=find_host(queued_check_result->host_name))==NULL){
00343 
00344                                 /* make sure the host exists */
00345                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check result queue contained results for host '%s', but the host could not be found!  Perhaps you forgot to define the host in your config files?\n",queued_check_result->host_name);
00346 
00347                                 /* delete the file that contains the check results, as well as the ok-to-go file */
00348                                 delete_check_result_file(queued_check_result->output_file);
00349 
00350                                 /* free memory */
00351                                 free_check_result(queued_check_result);
00352                                 my_free(queued_check_result);
00353 
00354                                 /* TODO - add new host definition automatically */
00355 
00356                                 continue;
00357                         }
00358 
00359                         log_debug_info(DEBUGL_CHECKS,1,"Handling check result for host '%s'...\n",temp_host->name);
00360 
00361                         /* process the check result */
00362                         handle_async_host_check_result_3x(temp_host,queued_check_result);
00363                 }
00364 
00365                 /* delete the file that contains the check results, as well as the ok-to-go file */
00366                 /* files can contain multiple check results - in this case, the file will be removed when the first check result is processed */
00367                 delete_check_result_file(queued_check_result->output_file);
00368 
00369                 log_debug_info(DEBUGL_CHECKS|DEBUGL_IPC,1,"Deleted check result file '%s'\n",queued_check_result->output_file);
00370 
00371                 /* free allocated memory */
00372                 free_check_result(queued_check_result);
00373                 my_free(queued_check_result);
00374 
00375                 /* break out if we've been here too long (max_check_reaper_time seconds) */
00376                 time(&current_time);
00377                 if((int)(current_time-reaper_start_time)>max_check_reaper_time){
00378                         log_debug_info(DEBUGL_CHECKS,0,"Breaking out of check result reaper: max reaper time exceeded\n");
00379                         break;
00380                 }
00381 
00382                 /* bail out if we encountered a signal */
00383                 if(sigshutdown==TRUE || sigrestart==TRUE){
00384                         log_debug_info(DEBUGL_CHECKS,0,"Breaking out of check result reaper: signal encountered\n");
00385                         break;
00386                 }
00387         }
00388 
00389         log_debug_info(DEBUGL_CHECKS,0,"Finished reaping %d check results\n",reaped_checks);
00390         log_debug_info(DEBUGL_FUNCTIONS,0,"reap_check_results() end\n");
00391 
00392         return OK;
00393 }
00394 
00395 
00396 
00397 
00398 /******************************************************************/
00399 /****************** SERVICE MONITORING FUNCTIONS ******************/
00400 /******************************************************************/
00401 
00402 /* executes a scheduled service check */
00403 int run_scheduled_service_check(service *svc, int check_options, double latency){
00404         int result=OK;
00405         time_t current_time=0L;
00406         time_t preferred_time=0L;
00407         time_t next_valid_time=0L;
00408         int time_is_valid=TRUE;
00409 
00410         if(svc==NULL)
00411                 return ERROR;
00412 
00413         log_debug_info(DEBUGL_FUNCTIONS,0,"run_scheduled_service_check() start\n");
00414         log_debug_info(DEBUGL_CHECKS,0,"Attempting to run scheduled check of service '%s' on host '%s': check options=%d, latency=%lf\n",svc->description,svc->host_name,check_options,latency);
00415 
00416         /* attempt to run the check */
00417         result=run_async_service_check(svc,check_options,latency,TRUE,TRUE,&time_is_valid,&preferred_time);
00418 
00419         /* an error occurred, so reschedule the check */
00420         if(result==ERROR){
00421 
00422                 log_debug_info(DEBUGL_CHECKS,1,"Unable to run scheduled service check at this time\n");
00423 
00424                 /* only attempt to (re)schedule checks that should get checked... */
00425                 if(svc->should_be_scheduled==TRUE){
00426 
00427                         /* get current time */
00428                         time(&current_time);
00429 
00430                         /* determine next time we should check the service if needed */
00431                         /* if service has no check interval, schedule it again for 5 minutes from now */
00432                         if(current_time>=preferred_time)
00433                                 preferred_time=current_time+((svc->check_interval<=0)?300:(svc->check_interval*interval_length));
00434 
00435                         /* make sure we rescheduled the next service check at a valid time */
00436                         get_next_valid_time(preferred_time,&next_valid_time,svc->check_period_ptr);
00437 
00438                         /*
00439                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Service '%s' on host '%s' timeperiod check failed...\n",svc->description,svc->host_name);
00440                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Current time: %s",ctime(&current_time));
00441                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Preferred time: %s",ctime(&preferred_time));
00442                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Next valid time: %s",ctime(&next_valid_time));
00443                          */
00444 
00445                         /* the service could not be rescheduled properly - set the next check time for next week */
00446                         /*if(time_is_valid==FALSE && next_valid_time==preferred_time){*/
00447                         /* UPDATED 08/12/09 EG to reflect proper timeperod check logic */
00448                         if(time_is_valid==FALSE &&  check_time_against_period(next_valid_time,svc->check_period_ptr)==ERROR){
00449 
00450                                 /*
00451                                 svc->next_check=(time_t)(next_valid_time+(60*60*24*365));
00452                                 svc->should_be_scheduled=FALSE;
00453                                  */
00454 
00455                                 svc->next_check=(time_t)(next_valid_time+(60*60*24*7));
00456 
00457                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check of service '%s' on host '%s' could not be rescheduled properly.  Scheduling check for next week...\n",svc->description,svc->host_name);
00458 
00459                                 log_debug_info(DEBUGL_CHECKS,1,"Unable to find any valid times to reschedule the next service check!\n");
00460                         }
00461 
00462                         /* this service could be rescheduled... */
00463                         else{
00464                                 svc->next_check=next_valid_time;
00465                                 svc->should_be_scheduled=TRUE;
00466 
00467                                 log_debug_info(DEBUGL_CHECKS,1,"Rescheduled next service check for %s",ctime(&next_valid_time));
00468                         }
00469                 }
00470 
00471                 /* reschedule the next service check - unless we couldn't find a valid next check time */
00472                 /* 10/19/07 EG - keep original check options */
00473                 if(svc->should_be_scheduled==TRUE)
00474                         schedule_service_check(svc,svc->next_check,check_options);
00475 
00476                 /* update the status log */
00477                 update_service_status(svc,FALSE);
00478 
00479                 return ERROR;
00480         }
00481 
00482         return OK;
00483 }
00484 
00485 
00486 /* forks a child process to run a service check, but does not wait for the service check result */
00487 int run_async_service_check(service *svc, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time){
00488         icinga_macros mac;
00489         char *raw_command=NULL;
00490         char *processed_command=NULL;
00491         struct timeval start_time,end_time;
00492         pid_t pid=0;
00493         int fork_error=FALSE;
00494         int wait_result=0;
00495         host *temp_host=NULL;
00496         int pclose_result=0;
00497         mode_t new_umask=077;
00498         mode_t old_umask;
00499         char *output_file=NULL;
00500         double old_latency=0.0;
00501         dbuf checkresult_dbuf;
00502         int dbuf_chunk=1024;
00503 #ifdef USE_EVENT_BROKER
00504         int neb_result=OK;
00505 #endif
00506 #ifdef EMBEDDEDPERL
00507         char fname[512]="";
00508         char *args[5]={"",DO_CLEAN, "", "", NULL };
00509         char *perl_plugin_output=NULL;
00510         char *temp_buffer=NULL;
00511         char *args3=NULL;
00512         SV *plugin_hndlr_cr=NULL; /* perl.h holds typedef struct */
00513         int count;
00514         int use_epn=FALSE;
00515 #ifdef aTHX
00516         dTHX;
00517 #endif
00518         dSP;
00519 #endif
00520 
00521         log_debug_info(DEBUGL_FUNCTIONS,0,"run_async_service_check()\n");
00522 
00523         /* make sure we have something */
00524         if(svc==NULL)
00525                 return ERROR;
00526 
00527         /* is the service check viable at this time? */
00528         if(check_service_check_viability(svc,check_options,time_is_valid,preferred_time)==ERROR)
00529                 return ERROR;
00530 
00531         /* find the host associated with this service */
00532         if((temp_host=svc->host_ptr)==NULL)
00533                 return ERROR;
00534 
00535         /******** GOOD TO GO FOR A REAL SERVICE CHECK AT THIS POINT ********/
00536 
00537 #ifdef USE_EVENT_BROKER
00538         /* initialize start/end times */
00539         start_time.tv_sec=0L;
00540         start_time.tv_usec=0L;
00541         end_time.tv_sec=0L;
00542         end_time.tv_usec=0L;
00543 
00544         /* send data to event broker */
00545         neb_result=broker_service_check(NEBTYPE_SERVICECHECK_ASYNC_PRECHECK,NEBFLAG_NONE,NEBATTR_NONE,svc,SERVICE_CHECK_ACTIVE,start_time,end_time,svc->service_check_command,svc->latency,0.0,0,FALSE,0,NULL,NULL);
00546 
00547         /* neb module wants to cancel the service check - the check will be rescheduled for a later time by the scheduling logic */
00548         if(neb_result==NEBERROR_CALLBACKCANCEL){
00549                 if(preferred_time)
00550                         *preferred_time+=(svc->check_interval*interval_length);
00551                 return ERROR;
00552         }
00553 
00554         /* neb module wants to override (or cancel) the service check - perhaps it will check the service itself */
00555         /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
00556         /* NOTE: if would be easier for modules to override checks when the NEBTYPE_SERVICECHECK_INITIATE event is called (later) */
00557         if(neb_result==NEBERROR_CALLBACKOVERRIDE)
00558                 return OK;
00559 #endif
00560 
00561 
00562         log_debug_info(DEBUGL_CHECKS,0,"Checking service '%s' on host '%s'...\n",svc->description,svc->host_name);
00563 
00564         /* clear check options - we don't want old check options retained */
00565         /* only clear check options for scheduled checks - ondemand checks shouldn't affected retained check options */
00566         if(scheduled_check==TRUE)
00567                 svc->check_options=CHECK_OPTION_NONE;
00568 
00569         /* update latency for macros, event broker, save old value for later */
00570         old_latency=svc->latency;
00571         svc->latency=latency;
00572 
00573         /* grab the host and service macro variables */
00574         memset(&mac, 0, sizeof(mac));
00575         grab_host_macros_r(&mac, temp_host);
00576         grab_service_macros_r(&mac, svc);
00577 
00578         /* get the raw command line */
00579         get_raw_command_line_r(&mac, svc->check_command_ptr,svc->service_check_command,&raw_command,0);
00580         if(raw_command==NULL){
00581                 clear_volatile_macros_r(&mac);
00582                 log_debug_info(DEBUGL_CHECKS,0,"Raw check command for service '%s' on host '%s' was NULL - aborting.\n",svc->description,svc->host_name);
00583                 if(preferred_time)
00584                         *preferred_time+=(svc->check_interval*interval_length);
00585                 svc->latency=old_latency;
00586                 return ERROR;
00587         }
00588 
00589         /* process any macros contained in the argument */
00590         process_macros_r(&mac, raw_command,&processed_command,0);
00591         if(processed_command==NULL){
00592                 clear_volatile_macros_r(&mac);
00593                 log_debug_info(DEBUGL_CHECKS,0,"Processed check command for service '%s' on host '%s' was NULL - aborting.\n",svc->description,svc->host_name);
00594                 if(preferred_time)
00595                         *preferred_time+=(svc->check_interval*interval_length);
00596                 svc->latency=old_latency;
00597                 my_free(raw_command);
00598                 return ERROR;
00599         }
00600 
00601         /* get the command start time */
00602         gettimeofday(&start_time,NULL);
00603 
00604         /* increment number of service checks that are currently running... */
00605         currently_running_service_checks++;
00606 
00607         /* set the execution flag */
00608         svc->is_executing=TRUE;
00609 
00610         /* start save check info */
00611         check_result_info.object_check_type=SERVICE_CHECK;
00612         check_result_info.check_type=SERVICE_CHECK_ACTIVE;
00613         check_result_info.check_options=check_options;
00614         check_result_info.scheduled_check=scheduled_check;
00615         check_result_info.reschedule_check=reschedule_check;
00616         check_result_info.start_time=start_time;
00617         check_result_info.finish_time=start_time;
00618         check_result_info.early_timeout=FALSE;
00619         check_result_info.exited_ok=TRUE;
00620         check_result_info.return_code=STATE_OK;
00621         check_result_info.output=NULL;
00622 
00623 #ifdef USE_EVENT_BROKER
00624         /* send data to event broker */
00625         neb_result=broker_service_check(NEBTYPE_SERVICECHECK_INITIATE,NEBFLAG_NONE,NEBATTR_NONE,svc,SERVICE_CHECK_ACTIVE,start_time,end_time,svc->service_check_command,svc->latency,0.0,service_check_timeout,FALSE,0,processed_command,NULL);
00626 
00627         my_free(svc->processed_command);
00628         svc->processed_command=strdup(processed_command);
00629 
00630         /* neb module wants to override the service check - perhaps it will check the service itself */
00631         if(neb_result==NEBERROR_CALLBACKOVERRIDE){
00632                 clear_volatile_macros_r(&mac);
00633                 svc->latency=old_latency;
00634                 my_free(processed_command);
00635                 my_free(raw_command);
00636                 return OK;
00637         }
00638 #endif
00639 
00640         /* open a temp file for storing check output */
00641         old_umask=umask(new_umask);
00642         dummy=asprintf(&output_file,"%s/checkXXXXXX",temp_path);
00643         check_result_info.output_file_fd=mkstemp(output_file);
00644         if(check_result_info.output_file_fd>=0)
00645                 check_result_info.output_file_fp=fdopen(check_result_info.output_file_fd,"w");
00646         else{
00647                 check_result_info.output_file_fp=NULL;
00648                 check_result_info.output_file_fd=-1;
00649         }
00650         umask(old_umask);
00651 
00652         log_debug_info(DEBUGL_CHECKS|DEBUGL_IPC,1,"Check result output will be written to '%s' (fd=%d)\n",output_file,check_result_info.output_file_fd);
00653 
00654 
00655         /* finish save check info */
00656         check_result_info.host_name=(char *)strdup(svc->host_name);
00657         check_result_info.service_description=(char *)strdup(svc->description);
00658         check_result_info.output_file=(check_result_info.output_file_fd<0 || output_file==NULL)?NULL:strdup(output_file);
00659 
00660         /* free memory */
00661         my_free(output_file);
00662 
00663         /* write start of check result file */
00664         /* if things go really bad later on down the line, the user will at least have a partial file to help debug missing output results */
00665         if(check_result_info.output_file_fp){
00666 
00667                 fprintf(check_result_info.output_file_fp,"### Active Check Result File ###\n");
00668                 fprintf(check_result_info.output_file_fp,"file_time=%lu\n",(unsigned long)check_result_info.start_time.tv_sec);
00669                 fprintf(check_result_info.output_file_fp,"\n");
00670 
00671                 fprintf(check_result_info.output_file_fp,"### Icinga Service Check Result ###\n");
00672                 fprintf(check_result_info.output_file_fp,"# Time: %s",ctime(&check_result_info.start_time.tv_sec));
00673                 fprintf(check_result_info.output_file_fp,"host_name=%s\n",check_result_info.host_name);
00674                 fprintf(check_result_info.output_file_fp,"service_description=%s\n",check_result_info.service_description);
00675                 fprintf(check_result_info.output_file_fp,"check_type=%d\n",check_result_info.check_type);
00676                 fprintf(check_result_info.output_file_fp,"check_options=%d\n",check_result_info.check_options);
00677                 fprintf(check_result_info.output_file_fp,"scheduled_check=%d\n",check_result_info.scheduled_check);
00678                 fprintf(check_result_info.output_file_fp,"reschedule_check=%d\n",check_result_info.reschedule_check);
00679                 fprintf(check_result_info.output_file_fp,"latency=%f\n",svc->latency);
00680                 fprintf(check_result_info.output_file_fp,"start_time=%lu.%lu\n",check_result_info.start_time.tv_sec,check_result_info.start_time.tv_usec);
00681 
00682                 /* flush output or it'll get written again when we fork() */
00683                 fflush(check_result_info.output_file_fp);
00684         }
00685 
00686         /* initialize dynamic buffer for storing plugin output */
00687         dbuf_init(&checkresult_dbuf,dbuf_chunk);
00688 
00689 
00690         /* reset latency (permanent value will be set later) */
00691         svc->latency=old_latency;
00692 
00693         /* update check statistics */
00694         update_check_stats((scheduled_check==TRUE)?ACTIVE_SCHEDULED_SERVICE_CHECK_STATS:ACTIVE_ONDEMAND_SERVICE_CHECK_STATS,start_time.tv_sec);
00695 
00696 #ifdef EMBEDDEDPERL
00697 
00698         /* get "filename" component of command */
00699         strncpy(fname,processed_command,strcspn(processed_command," "));
00700         fname[strcspn(processed_command," ")]='\x0';
00701 
00702         /* should we use the embedded Perl interpreter to run this script? */
00703         use_epn=file_uses_embedded_perl(fname);
00704 
00705         /* if yes, do some initialization */
00706         if(use_epn==TRUE){
00707 
00708                 log_debug_info(DEBUGL_CHECKS,1,"** Using Embedded Perl interpreter to run service check...\n");
00709 
00710                 args[0]=fname;
00711                 args[2]="";
00712 
00713                 if(strchr(processed_command,' ')==NULL){
00714                         args[3]="";
00715                 } else {
00716                         /* make sure to strip leading whitespaces from args */
00717                         args3=processed_command+strlen(fname)+1;
00718                         for (;isspace(*args3);args3++);
00719                         args[3]=args3;
00720                 }
00721 
00722                 ENTER;
00723                 SAVETMPS;
00724                 PUSHMARK(SP);
00725                 XPUSHs(sv_2mortal(newSVpv(args[0],0)));
00726                 XPUSHs(sv_2mortal(newSVpv(args[1],0)));
00727                 XPUSHs(sv_2mortal(newSVpv(args[2],0)));
00728                 XPUSHs(sv_2mortal(newSVpv(args[3],0)));
00729                 PUTBACK;
00730 
00731                 /* call our perl interpreter to compile and optionally cache the command */
00732 
00733                 call_pv("Embed::Persistent::eval_file", G_SCALAR | G_EVAL);
00734 
00735                 SPAGAIN ;
00736 
00737                 if( SvTRUE(ERRSV) ){
00738 
00739                         /*
00740                          * if SvTRUE(ERRSV)
00741                          *      write failure to IPC pipe
00742                          *      return
00743                          */
00744 
00745                         /* remove the top element of the Perl stack (undef) */
00746                         (void) POPs ;
00747 
00748                         pclose_result=STATE_UNKNOWN;
00749                         perl_plugin_output=SvPVX(ERRSV);
00750 
00751                         log_debug_info(DEBUGL_CHECKS,0,"Embedded Perl failed to compile %s, compile error %s - skipping plugin\n",fname,perl_plugin_output);
00752 
00753                         /* save plugin output */
00754                         if(perl_plugin_output!=NULL){
00755                                 temp_buffer=escape_newlines(perl_plugin_output);
00756                                 dbuf_strcat(&checkresult_dbuf,temp_buffer);
00757                                 my_free(temp_buffer);
00758                         }
00759 
00760                         /* get the check finish time */
00761                         gettimeofday(&end_time,NULL);
00762 
00763                         /* record check result info */
00764                         check_result_info.exited_ok=FALSE;
00765                         check_result_info.return_code=pclose_result;
00766                         check_result_info.finish_time=end_time;
00767 
00768                         /* write check result to file */
00769                         if(check_result_info.output_file_fp){
00770 
00771                                 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
00772                                 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout);
00773                                 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok);
00774                                 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code);
00775                                 fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
00776 
00777                                 /* close the temp file */
00778                                 fclose(check_result_info.output_file_fp);
00779 
00780                                 /* move check result to queue directory */
00781                                 move_check_result_to_queue(check_result_info.output_file);
00782                         }
00783 
00784                         /* free memory */
00785                         dbuf_free(&checkresult_dbuf);
00786 
00787                         /* free check result memory */
00788                         free_check_result(&check_result_info);
00789 
00790                         return OK;
00791                 }
00792                 else{
00793 
00794                         plugin_hndlr_cr=newSVsv(POPs);
00795 
00796                         log_debug_info(DEBUGL_CHECKS,1,"Embedded Perl successfully compiled %s and returned code ref to plugin handler\n",fname);
00797 
00798                         PUTBACK ;
00799                         FREETMPS ;
00800                         LEAVE ;
00801                 }
00802         }
00803 #endif
00804 
00805         /* plugin is a C plugin or a Perl plugin _without_ compilation errors */
00806 
00807         /* fork a child process */
00808         pid=fork();
00809 
00810         /* an error occurred while trying to fork */
00811         if(pid==-1){
00812 
00813                 fork_error=TRUE;
00814 
00815                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'.  The check will be rescheduled.\n",svc->description,svc->host_name,strerror(errno));
00816 
00817                 log_debug_info(DEBUGL_CHECKS,0,"Check of service '%s' on host '%s' could not be performed due to a fork() error: '%s'!\n",svc->description,svc->host_name,strerror(errno));
00818         }
00819 
00820         /* if we are in the child process... */
00821         else if(pid==0){
00822 
00823                 /* set environment variables */
00824                 set_all_macro_environment_vars_r(&mac, TRUE);
00825 
00826                 /* ADDED 11/12/07 EG */
00827                 /* close external command file and shut down worker thread */
00828                 close_command_file();
00829 
00830                 /* fork again if we're not in a large installation */
00831                 if(child_processes_fork_twice==TRUE){
00832 
00833                         /* fork again... */
00834                         pid=fork();
00835 
00836                         /* an error occurred while trying to fork again */
00837                         if(pid==-1)
00838                                 exit(STATE_UNKNOWN);
00839                 }
00840 
00841                 /* the grandchild (or child if large install tweaks are enabled) process should run the service check... */
00842                 if(pid==0 || child_processes_fork_twice==FALSE){
00843 
00844                         /* reset signal handling */
00845                         reset_sighandler();
00846 
00847                         /* become the process group leader */
00848                         setpgid(0,0);
00849 
00850                         /* catch term signals at this process level */
00851                         signal(SIGTERM,service_check_sighandler);
00852 
00853                         /* catch plugins that don't finish in a timely manner */
00854                         signal(SIGALRM,service_check_sighandler);
00855                         alarm(service_check_timeout);
00856 
00857                         /* disable rotation of the debug file */
00858                         max_debug_file_size=0L;
00859 
00860                         /******** BEGIN EMBEDDED PERL INTERPRETER EXECUTION ********/
00861 #ifdef EMBEDDEDPERL
00862                         if(use_epn==TRUE){
00863 
00864                                 /* execute our previously compiled script - from call_pv("Embed::Persistent::eval_file",..) */
00865                                 /* NB. args[2] is _now_ a code ref (to the Perl subroutine corresp to the plugin) returned by eval_file() */
00866 
00867                                 ENTER;
00868                                 SAVETMPS;
00869                                 PUSHMARK(SP);
00870 
00871                                 XPUSHs(sv_2mortal(newSVpv(args[0],0)));
00872                                 XPUSHs(sv_2mortal(newSVpv(args[1],0)));
00873                                 XPUSHs(plugin_hndlr_cr);
00874                                 XPUSHs(sv_2mortal(newSVpv(args[3],0)));
00875 
00876                                 PUTBACK;
00877 
00878                                 count=call_pv("Embed::Persistent::run_package", G_ARRAY);
00879 
00880                                 SPAGAIN;
00881 
00882                                 perl_plugin_output = POPpx ;
00883                                 pclose_result = POPi ;
00884 
00885                                 /* NOTE: 07/16/07 This has to be done before FREETMPS statement below, or the POPpx pointer will be invalid (Hendrik B.) */
00886                                 /* get perl plugin output - escape newlines */
00887                                 if(perl_plugin_output!=NULL){
00888                                         temp_buffer=escape_newlines(perl_plugin_output);
00889                                         dbuf_strcat(&checkresult_dbuf,temp_buffer);
00890                                         my_free(temp_buffer);
00891                                 }
00892 
00893                                 PUTBACK;
00894                                 FREETMPS;
00895                                 LEAVE;
00896 
00897                                 log_debug_info(DEBUGL_CHECKS,1,"Embedded Perl ran %s: return code=%d, plugin output=%s\n",fname,pclose_result,(perl_plugin_output==NULL)?"NULL":checkresult_dbuf.buf);
00898 
00899                                 /* reset the alarm */
00900                                 alarm(0);
00901 
00902                                 /* get the check finish time */
00903                                 gettimeofday(&end_time,NULL);
00904 
00905                                 /* record check result info */
00906                                 check_result_info.return_code=pclose_result;
00907                                 check_result_info.finish_time=end_time;
00908 
00909                                 /* write check result to file */
00910                                 if(check_result_info.output_file_fp){
00911 
00912                                         fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
00913                                         fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout);
00914                                         fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok);
00915                                         fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code);
00916                                         fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
00917 
00918                                         /* close the temp file */
00919                                         fclose(check_result_info.output_file_fp);
00920 
00921                                         /* move check result to queue directory */
00922                                         move_check_result_to_queue(check_result_info.output_file);
00923                                 }
00924 
00925                                 /* free memory */
00926                                 dbuf_free(&checkresult_dbuf);
00927 
00928                                 /* free check result memory */
00929                                 free_check_result(&check_result_info);
00930 
00931                                 /* return with plugin exit status - not really necessary... */
00932                                 _exit(pclose_result);
00933                         }
00934 #endif
00935                         /******** END EMBEDDED PERL INTERPRETER EXECUTION ********/
00936 
00937 
00938                         /* run the plugin check command */
00939                         pclose_result=run_check(processed_command,&checkresult_dbuf);
00940 
00941                         /* reset the alarm */
00942                         alarm(0);
00943 
00944                         /* get the check finish time */
00945                         gettimeofday(&end_time,NULL);
00946 
00947                         /* record check result info */
00948                         check_result_info.finish_time=end_time;
00949                         check_result_info.early_timeout=FALSE;
00950 
00951                         /* test for execution error */
00952                         if(pclose_result==-1){
00953                                 pclose_result=STATE_UNKNOWN;
00954                                 check_result_info.return_code=STATE_CRITICAL;
00955                                 check_result_info.exited_ok=FALSE;
00956                         }
00957                         else{
00958                                 if(WEXITSTATUS(pclose_result)==0 && WIFSIGNALED(pclose_result))
00959                                         check_result_info.return_code=128+WTERMSIG(pclose_result);
00960                                 else
00961                                         check_result_info.return_code=WEXITSTATUS(pclose_result);
00962                         }
00963 
00964                         /* write check result to file */
00965                         if(check_result_info.output_file_fp){
00966 
00967                                 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
00968                                 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout);
00969                                 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok);
00970                                 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code);
00971                                 fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
00972 
00973                                 /* close the temp file */
00974                                 fclose(check_result_info.output_file_fp);
00975 
00976                                 /* move check result to queue directory */
00977                                 move_check_result_to_queue(check_result_info.output_file);
00978                         }
00979 
00980                         /* free memory */
00981                         dbuf_free(&checkresult_dbuf);
00982                         my_free(raw_command);
00983                         my_free(processed_command);
00984 
00985                         /* free check result memory */
00986                         free_check_result(&check_result_info);
00987 
00988                         /* return with plugin exit status - not really necessary... */
00989                         _exit(pclose_result);
00990                 }
00991 
00992                 /* NOTE: this code is never reached if large install tweaks are enabled... */
00993 
00994                 /* unset environment variables */
00995                 set_all_macro_environment_vars_r(&mac, FALSE);
00996 
00997                 /* free allocated memory */
00998                 /* this needs to be done last, so we don't free memory for variables before they're used above */
00999                 if(free_child_process_memory==TRUE)
01000                         free_memory(&mac);
01001 
01002                 /* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
01003                 _exit(STATE_OK);
01004         }
01005 
01006         /* else the parent should wait for the first child to return... */
01007         else if(pid>0){
01008                 clear_volatile_macros_r(&mac);
01009 
01010                 log_debug_info(DEBUGL_CHECKS,2,"Service check is executing in child process (pid=%lu)\n",(unsigned long)pid);
01011 
01012                 /* parent should close output file */
01013                 if(check_result_info.output_file_fp)
01014                         fclose(check_result_info.output_file_fp);
01015 
01016                 /* should this be done in first child process (after spawning grandchild) as well? */
01017                 /* free memory allocated for IPC functionality */
01018                 free_check_result(&check_result_info);
01019 
01020                 /* free memory */
01021                 my_free(raw_command);
01022                 my_free(processed_command);
01023 
01024                 /* wait for the first child to return */
01025                 /* don't do this if large install tweaks are enabled - we'll clean up children in event loop */
01026                 if(child_processes_fork_twice==TRUE)
01027                         wait_result=waitpid(pid,NULL,0);
01028         }
01029 
01030         /* see if we were able to run the check... */
01031         if(fork_error==TRUE)
01032                 return ERROR;
01033 
01034         return OK;
01035 }
01036 
01037 
01038 
01039 /* handles asynchronous service check results */
01040 int handle_async_service_check_result(service *temp_service, check_result *queued_check_result){
01041         host *temp_host=NULL;
01042         time_t next_service_check=0L;
01043         time_t preferred_time=0L;
01044         time_t next_valid_time=0L;
01045         int reschedule_check=FALSE;
01046         int state_change=FALSE;
01047         int hard_state_change=FALSE;
01048         int first_host_check_initiated=FALSE;
01049         int route_result=HOST_UP;
01050         time_t current_time=0L;
01051         int state_was_logged=FALSE;
01052         char *old_plugin_output=NULL;
01053         char *temp_plugin_output=NULL;
01054         char *temp_ptr=NULL;
01055         servicedependency *temp_dependency=NULL;
01056         objectlist *check_servicelist=NULL;
01057         objectlist *servicelist_item=NULL;
01058         service *master_service=NULL;
01059         int run_async_check=TRUE;
01060         int state_changes_use_cached_state=TRUE;  /* TODO - 09/23/07 move this to a global variable */
01061         int flapping_check_done=FALSE;
01062         void *ptr=NULL;
01063 
01064 
01065         log_debug_info(DEBUGL_FUNCTIONS,0,"handle_async_service_check_result()\n");
01066 
01067         /* make sure we have what we need */
01068         if(temp_service==NULL || queued_check_result==NULL)
01069                 return ERROR;
01070 
01071         /* get the current time */
01072         time(&current_time);
01073 
01074         log_debug_info(DEBUGL_CHECKS,0,"** Handling check result for service '%s' on host '%s'...\n",temp_service->description,temp_service->host_name);
01075         log_debug_info(DEBUGL_CHECKS,1,"HOST: %s, SERVICE: %s, CHECK TYPE: %s, OPTIONS: %d, SCHEDULED: %s, RESCHEDULE: %s, EXITED OK: %s, RETURN CODE: %d, OUTPUT: %s\n",temp_service->host_name,temp_service->description,(queued_check_result->check_type==SERVICE_CHECK_ACTIVE)?"Active":"Passive",queued_check_result->check_options,(queued_check_result->scheduled_check==TRUE)?"Yes":"No",(queued_check_result->reschedule_check==TRUE)?"Yes":"No",(queued_check_result->exited_ok==TRUE)?"Yes":"No",queued_check_result->return_code,queued_check_result->output);
01076 
01077         /* decrement the number of service checks still out there... */
01078         if(queued_check_result->check_type==SERVICE_CHECK_ACTIVE && currently_running_service_checks>0)
01079                 currently_running_service_checks--;
01080 
01081         /* skip this service check result if it's passive and we aren't accepting passive check results */
01082         if(queued_check_result->check_type==SERVICE_CHECK_PASSIVE){
01083                 if(accept_passive_service_checks==FALSE){
01084                         log_debug_info(DEBUGL_CHECKS,0,"Discarding passive service check result because passive service checks are disabled globally.\n");
01085                         return ERROR;
01086                 }
01087                 if(temp_service->accept_passive_service_checks==FALSE){
01088                         log_debug_info(DEBUGL_CHECKS,0,"Discarding passive service check result because passive checks are disabled for this service.\n");
01089                         return ERROR;
01090                 }
01091         }
01092 
01093         /* clear the freshening flag (it would have been set if this service was determined to be stale) */
01094         if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK)
01095                 temp_service->is_being_freshened=FALSE;
01096 
01097         /* clear the execution flag if this was an active check */
01098         if(queued_check_result->check_type==SERVICE_CHECK_ACTIVE)
01099                 temp_service->is_executing=FALSE;
01100 
01101         /* DISCARD INVALID FRESHNESS CHECK RESULTS */
01102         /* If a service goes stale, Icinga will initiate a forced check in order to freshen it.  There is a race condition whereby a passive check
01103            could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here.  This would
01104            make the service fresh again, so we do a quick check to make sure the service is still stale before we accept the check result. */
01105         if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_service_result_fresh(temp_service,current_time,FALSE)==TRUE){
01106                 log_debug_info(DEBUGL_CHECKS,0,"Discarding service freshness check result because the service is currently fresh (race condition avoided).\n");
01107                 return OK;
01108         }
01109 
01110         /* check latency is passed to us */
01111         temp_service->latency=queued_check_result->latency;
01112 
01113         /* update the execution time for this check (millisecond resolution) */
01114         temp_service->execution_time=(double)((double)(queued_check_result->finish_time.tv_sec-queued_check_result->start_time.tv_sec)+(double)((queued_check_result->finish_time.tv_usec-queued_check_result->start_time.tv_usec)/1000.0)/1000.0);
01115         if(temp_service->execution_time<0.0)
01116                 temp_service->execution_time=0.0;
01117 
01118         /* get the last check time */
01119         temp_service->last_check=queued_check_result->start_time.tv_sec;
01120 
01121         /* was this check passive or active? */
01122         temp_service->check_type=(queued_check_result->check_type==SERVICE_CHECK_ACTIVE)?SERVICE_CHECK_ACTIVE:SERVICE_CHECK_PASSIVE;
01123 
01124         /* update check statistics for passive checks */
01125         if(queued_check_result->check_type==SERVICE_CHECK_PASSIVE)
01126                 update_check_stats(PASSIVE_SERVICE_CHECK_STATS,queued_check_result->start_time.tv_sec);
01127 
01128         /* should we reschedule the next service check? NOTE: This may be overridden later... */
01129         reschedule_check=queued_check_result->reschedule_check;
01130 
01131         /* save the old service status info */
01132         temp_service->last_state=temp_service->current_state;
01133 
01134         /* save old plugin output */
01135         if(temp_service->plugin_output)
01136                 old_plugin_output=(char *)strdup(temp_service->plugin_output);
01137 
01138         /* clear the old plugin output and perf data buffers */
01139         my_free(temp_service->plugin_output);
01140         my_free(temp_service->long_plugin_output);
01141         my_free(temp_service->perf_data);
01142 
01143         /* if there was some error running the command, just skip it (this shouldn't be happening) */
01144         if(queued_check_result->exited_ok==FALSE){
01145 
01146                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning:  Check of service '%s' on host '%s' did not exit properly!\n",temp_service->description,temp_service->host_name);
01147 
01148                 temp_service->plugin_output=(char *)strdup("(Service check did not exit properly)");
01149 
01150                 temp_service->current_state=STATE_CRITICAL;
01151         }
01152 
01153         /* make sure the return code is within bounds */
01154         else if(queued_check_result->return_code<0 || queued_check_result->return_code>3){
01155 
01156                 if ( queued_check_result->return_code==126 ) {
01157                         dummy=asprintf(&temp_service->plugin_output,"The command defined for service %s is not an executable\n", queued_check_result->service_description);
01158                 } else if  ( queued_check_result->return_code==127 ) {
01159                         dummy=asprintf(&temp_service->plugin_output,"The command defined for service %s does not exist\n", queued_check_result->service_description);
01160                 } else {
01161                         dummy=asprintf(&temp_service->plugin_output, "Return code of %d is out of bounds", queued_check_result->return_code);
01162                 }
01163                 logit(NSLOG_RUNTIME_WARNING,TRUE,"%s",temp_service->plugin_output);
01164 
01165                 temp_service->current_state=STATE_CRITICAL;
01166         }
01167 
01168         /* else the return code is okay... */
01169         else{
01170 
01171                 /* parse check output to get: (1) short output, (2) long output, (3) perf data */
01172                 parse_check_output(queued_check_result->output,&temp_service->plugin_output,&temp_service->long_plugin_output,&temp_service->perf_data,TRUE,TRUE);
01173 
01174                 /* make sure the plugin output isn't null */
01175                 if(temp_service->plugin_output==NULL)
01176                         temp_service->plugin_output=(char *)strdup("(No output returned from plugin)");
01177 
01178                 /* replace semicolons in plugin output (but not performance data) with colons */
01179                 else if((temp_ptr=temp_service->plugin_output)){
01180                         while((temp_ptr=strchr(temp_ptr,';')))
01181                                 *temp_ptr=':';
01182                 }
01183 
01184                 log_debug_info(DEBUGL_CHECKS,2,"Parsing check output...\n");
01185                 log_debug_info(DEBUGL_CHECKS,2,"Short Output: %s\n",(temp_service->plugin_output==NULL)?"NULL":temp_service->plugin_output);
01186                 log_debug_info(DEBUGL_CHECKS,2,"Long Output:  %s\n",(temp_service->long_plugin_output==NULL)?"NULL":temp_service->long_plugin_output);
01187                 log_debug_info(DEBUGL_CHECKS,2,"Perf Data:    %s\n",(temp_service->perf_data==NULL)?"NULL":temp_service->perf_data);
01188 
01189                 /* grab the return code */
01190                 temp_service->current_state=queued_check_result->return_code;
01191         }
01192 
01193 
01194         /* record the last state time */
01195         switch(temp_service->current_state){
01196         case STATE_OK:
01197                 temp_service->last_time_ok=temp_service->last_check;
01198                 break;
01199         case STATE_WARNING:
01200                 temp_service->last_time_warning=temp_service->last_check;
01201                 break;
01202         case STATE_UNKNOWN:
01203                 temp_service->last_time_unknown=temp_service->last_check;
01204                 break;
01205         case STATE_CRITICAL:
01206                 temp_service->last_time_critical=temp_service->last_check;
01207                 break;
01208         default:
01209                 break;
01210         }
01211 
01212         /* log passive checks - we need to do this here, as some may bypass external commands by getting dropped in the checkresults dir */
01213         if(temp_service->check_type==SERVICE_CHECK_PASSIVE){
01214                 if(log_passive_checks==TRUE)
01215                         logit(NSLOG_PASSIVE_CHECK,FALSE,"PASSIVE SERVICE CHECK: %s;%s;%d;%s\n",temp_service->host_name,temp_service->description,temp_service->current_state,temp_service->plugin_output);
01216         }
01217 
01218         /* get the host that this service runs on */
01219         temp_host=(host *)temp_service->host_ptr;
01220 
01221         /* if the service check was okay... */
01222         if(temp_service->current_state==STATE_OK){
01223 
01224                 /* if the host has never been checked before, verify its status */
01225                 /* only do this if 1) the initial state was set to non-UP or 2) the host is not scheduled to be checked soon (next 5 minutes) */
01226                 if(temp_host->has_been_checked==FALSE && (temp_host->initial_state!=HOST_UP || (unsigned long)temp_host->next_check==0L || (unsigned long)(temp_host->next_check-current_time)>300)){
01227 
01228                         /* set a flag to remember that we launched a check */
01229                         first_host_check_initiated=TRUE;
01230 
01231                         /* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */
01232                         /* previous logic was to simply run a sync (serial) host check */
01233                         /* do NOT allow cached check results to happen here - we need the host to be checked for real... */
01234                         if(use_aggressive_host_checking==TRUE)
01235                                 perform_on_demand_host_check(temp_host,NULL,CHECK_OPTION_NONE,FALSE,0L);
01236                         else
01237                                 run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL);
01238                 }
01239         }
01240 
01241 
01242         /**** NOTE - THIS WAS MOVED UP FROM LINE 1049 BELOW TO FIX PROBLEMS WHERE CURRENT ATTEMPT VALUE WAS ACTUALLY "LEADING" REAL VALUE ****/
01243         /* increment the current attempt number if this is a soft state (service was rechecked) */
01244         if(temp_service->state_type==SOFT_STATE && (temp_service->current_attempt < temp_service->max_attempts))
01245                 temp_service->current_attempt=temp_service->current_attempt+1;
01246 
01247 
01248         log_debug_info(DEBUGL_CHECKS,2,"ST: %s  CA: %d  MA: %d  CS: %d  LS: %d  LHS: %d\n",(temp_service->state_type==SOFT_STATE)?"SOFT":"HARD",temp_service->current_attempt,temp_service->max_attempts,temp_service->current_state,temp_service->last_state,temp_service->last_hard_state);
01249 
01250         /* check for a state change (either soft or hard) */
01251         if(temp_service->current_state!=temp_service->last_state){
01252                 log_debug_info(DEBUGL_CHECKS,2,"Service has changed state since last check!\n");
01253                 state_change=TRUE;
01254         }
01255 
01256         /* checks for a hard state change where host was down at last service check */
01257         /* this occurs in the case where host goes down and service current attempt gets reset to 1 */
01258         /* if this check is not made, the service recovery looks like a soft recovery instead of a hard one */
01259         if(temp_service->host_problem_at_last_check==TRUE && temp_service->current_state==STATE_OK){
01260                 log_debug_info(DEBUGL_CHECKS,2,"Service had a HARD STATE CHANGE!!\n");
01261                 hard_state_change=TRUE;
01262         }
01263 
01264         /* check for a "normal" hard state change where max check attempts is reached */
01265         if(temp_service->current_attempt>=temp_service->max_attempts && temp_service->current_state!=temp_service->last_hard_state){
01266                 log_debug_info(DEBUGL_CHECKS,2,"Service had a HARD STATE CHANGE!!\n");
01267                 hard_state_change=TRUE;
01268         }
01269 
01270         /* a state change occurred... */
01271         /* reset last and next notification times and acknowledgement flag if necessary, misc other stuff */
01272         if(state_change==TRUE || hard_state_change==TRUE){
01273 
01274                 /* reschedule the service check */
01275                 reschedule_check=TRUE;
01276 
01277                 /* reset notification times */
01278                 temp_service->last_notification=(time_t)0;
01279                 temp_service->next_notification=(time_t)0;
01280 
01281                 /* reset notification suppression option */
01282                 temp_service->no_more_notifications=FALSE;
01283 
01284                 if(temp_service->acknowledgement_type==ACKNOWLEDGEMENT_NORMAL){
01285 
01286                         temp_service->problem_has_been_acknowledged=FALSE;
01287                         temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE;
01288 
01289                         /* remove any non-persistent comments associated with the ack */
01290                         delete_service_acknowledgement_comments(temp_service);
01291                 }
01292                 else if(temp_service->acknowledgement_type==ACKNOWLEDGEMENT_STICKY && temp_service->current_state==STATE_OK){
01293 
01294                         temp_service->problem_has_been_acknowledged=FALSE;
01295                         temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE;
01296 
01297                         /* remove any non-persistent comments associated with the ack */
01298                         delete_service_acknowledgement_comments(temp_service);
01299                 }
01300 
01301                 /* do NOT reset current notification number!!! */
01302                 /* hard changes between non-OK states should continue to be escalated, so don't reset current notification number */
01303                 /*temp_service->current_notification_number=0;*/
01304         }
01305 
01306         /* initialize the last host and service state change times if necessary */
01307         if(temp_service->last_state_change==(time_t)0)
01308                 temp_service->last_state_change=temp_service->last_check;
01309         if(temp_service->last_hard_state_change==(time_t)0)
01310                 temp_service->last_hard_state_change=temp_service->last_check;
01311         if(temp_host->last_state_change==(time_t)0)
01312                 temp_host->last_state_change=temp_service->last_check;
01313         if(temp_host->last_hard_state_change==(time_t)0)
01314                 temp_host->last_hard_state_change=temp_service->last_check;
01315 
01316         /* update last service state change times */
01317         if(state_change==TRUE)
01318                 temp_service->last_state_change=temp_service->last_check;
01319         if(hard_state_change==TRUE)
01320                 temp_service->last_hard_state_change=temp_service->last_check;
01321 
01322         /* update the event and problem ids */
01323         if(state_change==TRUE){
01324 
01325                 /* always update the event id on a state change */
01326                 temp_service->last_event_id=temp_service->current_event_id;
01327                 temp_service->current_event_id=next_event_id;
01328                 next_event_id++;
01329 
01330                 /* update the problem id when transitioning to a problem state */
01331                 if(temp_service->last_state==STATE_OK){
01332                         /* don't reset last problem id, or it will be zero the next time a problem is encountered */
01333                         /* temp_service->last_problem_id=temp_service->current_problem_id;*/
01334                         temp_service->current_problem_id=next_problem_id;
01335                         next_problem_id++;
01336                 }
01337 
01338                 /* clear the problem id when transitioning from a problem state to an OK state */
01339                 if(temp_service->current_state==STATE_OK){
01340                         temp_service->last_problem_id=temp_service->current_problem_id;
01341                         temp_service->current_problem_id=0L;
01342                 }
01343         }
01344 
01345 
01346         /**************************************/
01347         /******* SERVICE CHECK OK LOGIC *******/
01348         /**************************************/
01349 
01350         /* if the service is up and running OK... */
01351         if(temp_service->current_state==STATE_OK){
01352 
01353                 log_debug_info(DEBUGL_CHECKS,1,"Service is OK.\n");
01354 
01355                 /* reset the acknowledgement flag (this should already have been done, but just in case...) */
01356                 temp_service->problem_has_been_acknowledged=FALSE;
01357                 temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE;
01358 
01359                 /* verify the route to the host and send out host recovery notifications */
01360                 if(temp_host->current_state!=HOST_UP){
01361 
01362                         log_debug_info(DEBUGL_CHECKS,1,"Host is NOT UP, so we'll check it to see if it recovered...\n");
01363 
01364                         /* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */
01365                         /* previous logic was to simply run a sync (serial) host check */
01366                         if(use_aggressive_host_checking==TRUE)
01367                                 perform_on_demand_host_check(temp_host,NULL,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);
01368                         /* 09/23/07 EG don't launch a new host check if we already did so earlier */
01369                         else if(first_host_check_initiated==TRUE)
01370                                 log_debug_info(DEBUGL_CHECKS,1,"First host check was already initiated, so we'll skip a new host check.\n");
01371                         else{
01372                                 /* can we use the last cached host state? */
01373                                 /* usually only use cached host state if no service state change has occurred */
01374                                 if((state_change==FALSE || state_changes_use_cached_state==TRUE) && temp_host->has_been_checked==TRUE && ((current_time-temp_host->last_check) <= cached_host_check_horizon)){
01375                                         log_debug_info(DEBUGL_CHECKS,1,"* Using cached host state: %d\n",temp_host->current_state);
01376                                         update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time);
01377                                         update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time);
01378                                 }
01379 
01380                                 /* else launch an async (parallel) check of the host */
01381                                 else
01382                                         run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL);
01383                         }
01384                 }
01385 
01386                 /* if a hard service recovery has occurred... */
01387                 if(hard_state_change==TRUE){
01388 
01389                         log_debug_info(DEBUGL_CHECKS,1,"Service experienced a HARD RECOVERY.\n");
01390 
01391                         /* set the state type macro */
01392                         temp_service->state_type=HARD_STATE;
01393 
01394                         /* log the service recovery */
01395                         log_service_event(temp_service);
01396                         state_was_logged=TRUE;
01397 
01398                         /* 10/04/07 check to see if the service and/or associate host is flapping */
01399                         /* this should be done before a notification is sent out to ensure the host didn't just start flapping */
01400                         check_for_service_flapping(temp_service,TRUE,TRUE);
01401                         check_for_host_flapping(temp_host,TRUE,FALSE,TRUE);
01402                         flapping_check_done=TRUE;
01403 
01404                         /* notify contacts about the service recovery */
01405                         service_notification(temp_service,NOTIFICATION_NORMAL,NULL,NULL,NOTIFICATION_OPTION_NONE);
01406 
01407                         /* run the service event handler to handle the hard state change */
01408                         handle_service_event(temp_service);
01409                 }
01410 
01411                 /* else if a soft service recovery has occurred... */
01412                 else if(state_change==TRUE){
01413 
01414                         log_debug_info(DEBUGL_CHECKS,1,"Service experienced a SOFT RECOVERY.\n");
01415 
01416                         /* this is a soft recovery */
01417                         temp_service->state_type=SOFT_STATE;
01418 
01419                         /* log the soft recovery */
01420                         log_service_event(temp_service);
01421                         state_was_logged=TRUE;
01422 
01423                         /* run the service event handler to handle the soft state change */
01424                         handle_service_event(temp_service);
01425                 }
01426 
01427                 /* else no service state change has occurred... */
01428                 else{
01429                         log_debug_info(DEBUGL_CHECKS,1,"Service did not change state.\n");
01430                 }
01431 
01432                 /* should we obsess over service checks? */
01433                 if(obsess_over_services==TRUE)
01434                         obsessive_compulsive_service_check_processor(temp_service);
01435 
01436                 /* reset all service variables because its okay now... */
01437                 temp_service->host_problem_at_last_check=FALSE;
01438                 temp_service->current_attempt=1;
01439                 temp_service->state_type=HARD_STATE;
01440                 temp_service->last_hard_state=STATE_OK;
01441                 temp_service->last_notification=(time_t)0;
01442                 temp_service->next_notification=(time_t)0;
01443                 temp_service->current_notification_number=0;
01444 #ifdef USE_ST_BASED_ESCAL_RANGES
01445                 temp_service->current_warning_notification_number=0;
01446                 temp_service->current_critical_notification_number=0;
01447                 temp_service->current_unknown_notification_number=0;
01448 #endif
01449                 temp_service->problem_has_been_acknowledged=FALSE;
01450                 temp_service->acknowledgement_type=ACKNOWLEDGEMENT_NONE;
01451                 temp_service->notified_on_unknown=FALSE;
01452                 temp_service->notified_on_warning=FALSE;
01453                 temp_service->notified_on_critical=FALSE;
01454                 temp_service->no_more_notifications=FALSE;
01455 
01456                 if(reschedule_check==TRUE)
01457                         next_service_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length));
01458         }
01459 
01460 
01461         /*******************************************/
01462         /******* SERVICE CHECK PROBLEM LOGIC *******/
01463         /*******************************************/
01464 
01465         /* hey, something's not working quite like it should... */
01466         else{
01467 
01468                 log_debug_info(DEBUGL_CHECKS,1,"Service is in a non-OK state!\n");
01469 
01470                 /* check the route to the host if its up right now... */
01471                 if(temp_host->current_state==HOST_UP){
01472 
01473                         log_debug_info(DEBUGL_CHECKS,1,"Host is currently UP, so we'll recheck its state to make sure...\n");
01474 
01475                         /* 08/04/07 EG launch an async (parallel) host check (possibly cached) unless aggressive host checking is enabled */
01476                         /* previous logic was to simply run a sync (serial) host check */
01477                         if(use_aggressive_host_checking==TRUE)
01478                                 perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);
01479                         else{
01480                                 /* can we use the last cached host state? */
01481                                 /* only use cached host state if no service state change has occurred */
01482                                 if((state_change==FALSE || state_changes_use_cached_state==TRUE) && temp_host->has_been_checked==TRUE && ((current_time-temp_host->last_check) <= cached_host_check_horizon)){
01483                                         /* use current host state as route result */
01484                                         route_result=temp_host->current_state;
01485                                         log_debug_info(DEBUGL_CHECKS,1,"* Using cached host state: %d\n",temp_host->current_state);
01486                                         update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time);
01487                                         update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time);
01488                                 }
01489 
01490                                 /* else launch an async (parallel) check of the host */
01491                                 /* CHANGED 02/15/08 only if service changed state since service was last checked */
01492                                 else if(state_change==TRUE){
01493                                         /* use current host state as route result */
01494                                         route_result=temp_host->current_state;
01495                                         run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL);
01496                                 }
01497 
01498                                 /* ADDED 02/15/08 */
01499                                 /* else assume same host state */
01500                                 else{
01501                                         route_result=temp_host->current_state;
01502                                         log_debug_info(DEBUGL_CHECKS,1,"* Using last known host state: %d\n",temp_host->current_state);
01503                                         update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time);
01504                                         update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time);
01505                                 }
01506                         }
01507                 }
01508 
01509                 /* else the host is either down or unreachable, so recheck it if necessary */
01510                 else{
01511 
01512                         log_debug_info(DEBUGL_CHECKS,1,"Host is currently DOWN/UNREACHABLE.\n");
01513 
01514                         /* we're using aggressive host checking, so really do recheck the host... */
01515                         if(use_aggressive_host_checking==TRUE){
01516                                 log_debug_info(DEBUGL_CHECKS,1,"Agressive host checking is enabled, so we'll recheck the host state...\n");
01517                                 perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);
01518                         }
01519 
01520                         /* the service wobbled between non-OK states, so check the host... */
01521                         else if((state_change==TRUE && state_changes_use_cached_state==FALSE) && temp_service->last_hard_state!=STATE_OK){
01522                                 log_debug_info(DEBUGL_CHECKS,1,"Service wobbled between non-OK states, so we'll recheck the host state...\n");
01523                                 /* 08/04/07 EG launch an async (parallel) host check unless aggressive host checking is enabled */
01524                                 /* previous logic was to simply run a sync (serial) host check */
01525                                 /* use current host state as route result */
01526                                 route_result=temp_host->current_state;
01527                                 run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL);
01528                                 /*perform_on_demand_host_check(temp_host,&route_result,CHECK_OPTION_NONE,TRUE,cached_host_check_horizon);*/
01529                         }
01530 
01531                         /* else fake the host check, but (possibly) resend host notifications to contacts... */
01532                         else{
01533 
01534                                 log_debug_info(DEBUGL_CHECKS,1,"Assuming host is in same state as before...\n");
01535 
01536                                 /* if the host has never been checked before, set the checked flag and last check time */
01537                                 /* 03/11/06 EG Note: This probably never evaluates to FALSE, present for historical reasons only, can probably be removed in the future */
01538                                 if(temp_host->has_been_checked==FALSE){
01539                                         temp_host->has_been_checked=TRUE;
01540                                         temp_host->last_check=temp_service->last_check;
01541                                 }
01542 
01543                                 /* fake the route check result */
01544                                 route_result=temp_host->current_state;
01545 
01546                                 /* possibly re-send host notifications... */
01547                                 host_notification(temp_host,NOTIFICATION_NORMAL,NULL,NULL,NOTIFICATION_OPTION_NONE);
01548                         }
01549                 }
01550 
01551                 /* if the host is down or unreachable ... */
01552                 /* 05/29/2007 NOTE: The host might be in a SOFT problem state due to host check retries/caching.  Not sure if we should take that into account and do something different or not... */
01553                 if(route_result!=HOST_UP){
01554 
01555                         log_debug_info(DEBUGL_CHECKS,2,"Host is not UP, so we mark state changes if appropriate\n");
01556 
01557                         /* "fake" a hard state change for the service - well, its not really fake, but it didn't get caught earlier... */
01558                         if(temp_service->last_hard_state!=temp_service->current_state)
01559                                 hard_state_change=TRUE;
01560 
01561                         /* update last state change times */
01562                         if(state_change==TRUE || hard_state_change==TRUE)
01563                                 temp_service->last_state_change=temp_service->last_check;
01564                         if(hard_state_change==TRUE) {
01565                                 temp_service->last_hard_state_change=temp_service->last_check;
01566                                 temp_service->state_type=HARD_STATE;
01567                                 temp_service->last_hard_state=temp_service->current_state;
01568                         }
01569 
01570                         /* put service into a hard state without attempting check retries and don't send out notifications about it */
01571                         temp_service->host_problem_at_last_check=TRUE;
01572                         temp_service->state_type=HARD_STATE;
01573                         temp_service->last_hard_state=temp_service->current_state;
01574                         temp_service->current_attempt=1;
01575                 }
01576 
01577                 /* the host is up - it recovered since the last time the service was checked... */
01578                 else if(temp_service->host_problem_at_last_check==TRUE){
01579 
01580                         /* next time the service is checked we shouldn't get into this same case... */
01581                         temp_service->host_problem_at_last_check=FALSE;
01582 
01583                         /* reset the current check counter, so we give the service a chance */
01584                         /* this helps prevent the case where service has N max check attempts, N-1 of which have already occurred. */
01585                         /* if we didn't do this, the next check might fail and result in a hard problem - we should really give it more time */
01586                         /* ADDED IF STATEMENT 01-17-05 EG */
01587                         /* 01-17-05: Services in hard problem states before hosts went down would sometimes come back as soft problem states after */
01588                         /* the hosts recovered.  This caused problems, so hopefully this will fix it */
01589                         if(temp_service->state_type==SOFT_STATE)
01590                                 temp_service->current_attempt=1;
01591                 }
01592 
01593                 log_debug_info(DEBUGL_CHECKS,1,"Current/Max Attempt(s): %d/%d\n",temp_service->current_attempt,temp_service->max_attempts);
01594 
01595                 /* if we should retry the service check, do so (except if the host is down or unreachable!) */
01596                 if(temp_service->current_attempt < temp_service->max_attempts){
01597 
01598                         /* the host is down or unreachable, so don't attempt to retry the service check */
01599                         if(route_result!=HOST_UP){
01600 
01601                                 log_debug_info(DEBUGL_CHECKS,1,"Host isn't UP, so we won't retry the service check...\n");
01602 
01603                                 /* the host is not up, so reschedule the next service check at regular interval */
01604                                 if(reschedule_check==TRUE)
01605                                         next_service_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length));
01606 
01607                                 /* log the problem as a hard state if the host just went down */
01608                                 if(hard_state_change==TRUE){
01609                                         log_service_event(temp_service);
01610                                         state_was_logged=TRUE;
01611 
01612                                         /* run the service event handler to handle the hard state */
01613                                         handle_service_event(temp_service);
01614                                 }
01615                         }
01616 
01617                         /* the host is up, so continue to retry the service check */
01618                         else{
01619 
01620                                 log_debug_info(DEBUGL_CHECKS,1,"Host is UP, so we'll retry the service check...\n");
01621 
01622                                 /* this is a soft state */
01623                                 if (temp_service->current_attempt < temp_service->max_attempts) {
01624                                         temp_service->state_type=SOFT_STATE;
01625                                 }
01626 
01627                                 /* log the service check retry */
01628                                 log_service_event(temp_service);
01629                                 state_was_logged=TRUE;
01630 
01631                                 /* run the service event handler to handle the soft state */
01632                                 handle_service_event(temp_service);
01633 
01634                                 if(reschedule_check==TRUE)
01635                                         next_service_check=(time_t)(temp_service->last_check+(temp_service->retry_interval*interval_length));
01636                         }
01637 
01638                         /* perform dependency checks on the second to last check of the service */
01639                         if(enable_predictive_service_dependency_checks==TRUE && temp_service->current_attempt==(temp_service->max_attempts-1)){
01640 
01641                                 log_debug_info(DEBUGL_CHECKS,1,"Looking for services to check for predictive dependency checks...\n");
01642 
01643                                 /* check services that THIS ONE depends on for notification AND execution */
01644                                 /* we do this because we might be sending out a notification soon and we want the dependency logic to be accurate */
01645                                 for(temp_dependency=get_first_servicedependency_by_dependent_service(temp_service->host_name,temp_service->description,&ptr);temp_dependency!=NULL;temp_dependency=get_next_servicedependency_by_dependent_service(temp_service->host_name,temp_service->description,&ptr)){
01646                                         if(temp_dependency->dependent_service_ptr==temp_service && temp_dependency->master_service_ptr!=NULL){
01647                                                 master_service=(service *)temp_dependency->master_service_ptr;
01648                                                 log_debug_info(DEBUGL_CHECKS,2,"Predictive check of service '%s' on host '%s' queued.\n",master_service->description,master_service->host_name);
01649                                                 add_object_to_objectlist(&check_servicelist,(void *)master_service);
01650                                         }
01651                                 }
01652                         }
01653                 }
01654 
01655 
01656                 /* we've reached the maximum number of service rechecks, so handle the error */
01657                 else{
01658 
01659                         log_debug_info(DEBUGL_CHECKS,1,"Service has reached max number of rechecks, so we'll handle the error...\n");
01660 
01661                         /* this is a hard state */
01662                         temp_service->state_type=HARD_STATE;
01663 
01664                         /* if we've had a hard state change... */
01665                         if(hard_state_change==TRUE){
01666 
01667                                 /* log the service problem (even if host is not up, which is new in 0.0.5) */
01668                                 log_service_event(temp_service);
01669                                 state_was_logged=TRUE;
01670                         }
01671 
01672                         /* else log the problem (again) if this service is flagged as being volatile */
01673                         else if(temp_service->is_volatile!=FALSE){
01674                                 log_service_event(temp_service);
01675                                 state_was_logged=TRUE;
01676                         }
01677 
01678                         /* check for start of flexible (non-fixed) scheduled downtime if we just had a hard/soft error */
01679                         /* 2011-02-21 MF: we need to check for both, state_change (SOFT) and hard_state_change (HARD) values */
01680                         if((hard_state_change==TRUE || state_change==TRUE) && temp_service->pending_flex_downtime>0)
01681                                 check_pending_flex_service_downtime(temp_service);
01682 
01683                         /* 10/04/07 check to see if the service and/or associate host is flapping */
01684                         /* this should be done before a notification is sent out to ensure the host didn't just start flapping */
01685                         check_for_service_flapping(temp_service,TRUE,TRUE);
01686                         check_for_host_flapping(temp_host,TRUE,FALSE,TRUE);
01687                         flapping_check_done=TRUE;
01688 
01689 #ifdef USE_ST_BASED_ESCAL_RANGES
01690                         if (hard_state_change==TRUE){
01691                                 temp_service->current_warning_notification_number=0;
01692                                 temp_service->current_critical_notification_number=0;
01693                                 temp_service->current_unknown_notification_number=0;
01694                         }
01695 #endif
01696                         /* (re)send notifications out about this service problem if the host is up (and was at last check also) and the dependencies were okay... */
01697                         service_notification(temp_service,NOTIFICATION_NORMAL,NULL,NULL,NOTIFICATION_OPTION_NONE);
01698 
01699                         /* run the service event handler if we changed state from the last hard state or if this service is flagged as being volatile */
01700                         if(hard_state_change==TRUE || temp_service->is_volatile!=FALSE)
01701                                 handle_service_event(temp_service);
01702 
01703                         /* save the last hard state */
01704                         temp_service->last_hard_state=temp_service->current_state;
01705 
01706                         /* reschedule the next check at the regular interval */
01707                         if(reschedule_check==TRUE)
01708                                 next_service_check=(time_t)(temp_service->last_check+(temp_service->check_interval*interval_length));
01709                 }
01710 
01711 
01712                 /* should we obsess over service checks? */
01713                 if(obsess_over_services==TRUE)
01714                         obsessive_compulsive_service_check_processor(temp_service);
01715         }
01716 
01717         /* reschedule the next service check ONLY for active, scheduled checks */
01718         if(reschedule_check==TRUE){
01719 
01720                 log_debug_info(DEBUGL_CHECKS,1,"Rescheduling next check of service at %s",ctime(&next_service_check));
01721 
01722                 /* default is to reschedule service check unless a test below fails... */
01723                 temp_service->should_be_scheduled=TRUE;
01724 
01725                 /* next check time was calculated above */
01726                 temp_service->next_check=next_service_check;
01727 
01728                 /* make sure we don't get ourselves into too much trouble... */
01729                 if(current_time>temp_service->next_check)
01730                         temp_service->next_check=current_time;
01731 
01732                 /* make sure we rescheduled the next service check at a valid time */
01733                 preferred_time=temp_service->next_check;
01734                 get_next_valid_time(preferred_time,&next_valid_time,temp_service->check_period_ptr);
01735                 temp_service->next_check=next_valid_time;
01736 
01737                 /* services with non-recurring intervals do not get rescheduled */
01738                 if(temp_service->check_interval==0)
01739                         temp_service->should_be_scheduled=FALSE;
01740 
01741                 /* services with active checks disabled do not get rescheduled */
01742                 if(temp_service->checks_enabled==FALSE)
01743                         temp_service->should_be_scheduled=FALSE;
01744 
01745                 /* schedule a non-forced check if we can */
01746                 if(temp_service->should_be_scheduled==TRUE)
01747                         schedule_service_check(temp_service,temp_service->next_check,CHECK_OPTION_NONE);
01748         }
01749 
01750         /* if we're stalking this state type and state was not already logged AND the plugin output changed since last check, log it now.. */
01751         if(temp_service->state_type==HARD_STATE && state_change==FALSE && state_was_logged==FALSE && compare_strings(old_plugin_output,temp_service->plugin_output)){
01752 
01753                 if((temp_service->current_state==STATE_OK && temp_service->stalk_on_ok==TRUE)) {
01754 
01755                         log_service_event(temp_service);
01756 
01757                         /* should we run event handlers ? */
01758                         if (stalking_event_handlers_for_services==TRUE)
01759                                 handle_service_event(temp_service);
01760 
01761                 } else if((temp_service->current_state==STATE_WARNING && temp_service->stalk_on_warning==TRUE)) {
01762 
01763                         log_service_event(temp_service);
01764 
01765                         /* should we run event handlers ? */
01766                         if (stalking_event_handlers_for_services==TRUE)
01767                                 handle_service_event(temp_service);
01768 
01769                 } else if((temp_service->current_state==STATE_UNKNOWN && temp_service->stalk_on_unknown==TRUE)) {
01770 
01771                         log_service_event(temp_service);
01772 
01773                         /* should we run event handlers ? */
01774                         if (stalking_event_handlers_for_services==TRUE)
01775                                 handle_service_event(temp_service);
01776 
01777                 } else if((temp_service->current_state==STATE_CRITICAL && temp_service->stalk_on_critical==TRUE)) {
01778 
01779                         log_service_event(temp_service);
01780 
01781                         /* should we run event handlers ? */
01782                         if (stalking_event_handlers_for_services==TRUE)
01783                                 handle_service_event(temp_service);
01784 
01785                 }
01786         }
01787 
01788 #ifdef USE_EVENT_BROKER
01789         /* send data to event broker */
01790         broker_service_check(NEBTYPE_SERVICECHECK_PROCESSED,NEBFLAG_NONE,NEBATTR_NONE,temp_service,temp_service->check_type,queued_check_result->start_time,queued_check_result->finish_time,temp_service->service_check_command,temp_service->latency,temp_service->execution_time,service_check_timeout,queued_check_result->early_timeout,queued_check_result->return_code,temp_service->processed_command,NULL);
01791 #endif
01792 
01793         /* set the checked flag */
01794         temp_service->has_been_checked=TRUE;
01795 
01796         /* update the current service status log */
01797         update_service_status(temp_service,FALSE);
01798 
01799         /* check to see if the service and/or associate host is flapping */
01800         if(flapping_check_done==FALSE){
01801                 check_for_service_flapping(temp_service,TRUE,TRUE);
01802                 check_for_host_flapping(temp_host,TRUE,FALSE,TRUE);
01803         }
01804 
01805         /* update service performance info */
01806         update_service_performance_data(temp_service);
01807 
01808         /* free allocated memory */
01809         my_free(temp_plugin_output);
01810         my_free(old_plugin_output);
01811 
01812 
01813         /* run async checks of all services we added above */
01814         /* don't run a check if one is already executing or we can get by with a cached state */
01815         for(servicelist_item=check_servicelist;servicelist_item!=NULL;servicelist_item=servicelist_item->next){
01816                 run_async_check=TRUE;
01817                 temp_service=(service *)servicelist_item->object_ptr;
01818 
01819                 /* we can get by with a cached state, so don't check the service */
01820                 if((current_time-temp_service->last_check)<=cached_service_check_horizon){
01821                         run_async_check=FALSE;
01822 
01823                         /* update check statistics */
01824                         update_check_stats(ACTIVE_CACHED_SERVICE_CHECK_STATS,current_time);
01825                 }
01826 
01827                 if(temp_service->is_executing==TRUE)
01828                         run_async_check=FALSE;
01829 
01830                 if(run_async_check==TRUE)
01831                         run_async_service_check(temp_service,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL);
01832         }
01833         free_objectlist(&check_servicelist);
01834 
01835         return OK;
01836 }
01837 
01838 
01839 
01840 /* schedules an immediate or delayed service check */
01841 void schedule_service_check(service *svc, time_t check_time, int options){
01842         timed_event *temp_event=NULL;
01843         timed_event *new_event=NULL;
01844         int found=FALSE;
01845         int use_original_event=TRUE;
01846 
01847         log_debug_info(DEBUGL_FUNCTIONS,0,"schedule_service_check()\n");
01848 
01849         if(svc==NULL)
01850                 return;
01851 
01852         log_debug_info(DEBUGL_CHECKS,0,"Scheduling a %s, active check of service '%s' on host '%s' @ %s",(options & CHECK_OPTION_FORCE_EXECUTION)?"forced":"non-forced",svc->description,svc->host_name,ctime(&check_time));
01853 
01854         /* don't schedule a check if active checks of this service are disabled */
01855         if(svc->checks_enabled==FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)){
01856                 log_debug_info(DEBUGL_CHECKS,0,"Active checks of this service are disabled.\n");
01857                 return;
01858         }
01859 
01860         /* allocate memory for a new event item */
01861         new_event=(timed_event *)malloc(sizeof(timed_event));
01862         if(new_event==NULL){
01863 
01864                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Could not reschedule check of service '%s' on host '%s'!\n",svc->description,svc->host_name);
01865 
01866                 return;
01867         }
01868 
01869         /* default is to use the new event */
01870         use_original_event=FALSE;
01871         found=FALSE;
01872 
01873 #ifdef PERFORMANCE_INCREASE_BUT_VERY_BAD_IDEA_INDEED
01874         /* WARNING! 1/19/07 on-demand async service checks will end up causing mutliple scheduled checks of a service to appear in the queue if the code below is skipped */
01875         /* if(use_large_installation_tweaks==FALSE)... skip code below */
01876 #endif
01877 
01878         /* see if there are any other scheduled checks of this service in the queue */
01879         for(temp_event=event_list_low;temp_event!=NULL;temp_event=temp_event->next){
01880 
01881                 if(temp_event->event_type==EVENT_SERVICE_CHECK && svc==(service *)temp_event->event_data){
01882                         found=TRUE;
01883                         break;
01884                 }
01885         }
01886 
01887         /* we found another service check event for this service in the queue - what should we do? */
01888         if(found==TRUE && temp_event!=NULL){
01889 
01890                 log_debug_info(DEBUGL_CHECKS,2,"Found another service check event for this service @ %s",ctime(&temp_event->run_time));
01891 
01892                 /* use the originally scheduled check unless we decide otherwise */
01893                 use_original_event=TRUE;
01894 
01895                 /* the original event is a forced check... */
01896                 if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)){
01897 
01898                         /* the new event is also forced and its execution time is earlier than the original, so use it instead */
01899                         if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)){
01900                                 use_original_event=FALSE;
01901                                 log_debug_info(DEBUGL_CHECKS,2,"New service check event is forced and occurs before the existing event, so the new event will be used instead.\n");
01902                         }
01903                 }
01904 
01905                 /* the original event is not a forced check... */
01906                 else{
01907 
01908                         /* the new event is a forced check, so use it instead */
01909                         if((options & CHECK_OPTION_FORCE_EXECUTION)){
01910                                 use_original_event=FALSE;
01911                                 log_debug_info(DEBUGL_CHECKS,2,"New service check event is forced, so it will be used instead of the existing event.\n");
01912                         }
01913 
01914                         /* the new event is not forced either and its execution time is earlier than the original, so use it instead */
01915                         else if(check_time < temp_event->run_time){
01916                                 use_original_event=FALSE;
01917                                 log_debug_info(DEBUGL_CHECKS,2,"New service check event occurs before the existing (older) event, so it will be used instead.\n");
01918                         }
01919 
01920                         /* the new event is older, so override the existing one */
01921                         else{
01922                                 log_debug_info(DEBUGL_CHECKS,2,"New service check event occurs after the existing event, so we'll ignore it.\n");
01923                         }
01924                 }
01925 
01926                 /* the originally queued event won the battle, so keep it */
01927                 if(use_original_event==TRUE){
01928                         my_free(new_event);
01929                 }
01930 
01931                 /* else we're using the new event, so remove the old one */
01932                 else{
01933                         remove_event(temp_event,&event_list_low,&event_list_low_tail);
01934                         my_free(temp_event);
01935                 }
01936         }
01937 
01938         /* save check options for retention purposes */
01939         svc->check_options=options;
01940 
01941         /* schedule a new event */
01942         if(use_original_event==FALSE){
01943 
01944                 log_debug_info(DEBUGL_CHECKS,2,"Scheduling new service check event.\n");
01945 
01946                 /* set the next service check time */
01947                 svc->next_check=check_time;
01948 
01949                 /* place the new event in the event queue */
01950                 new_event->event_type=EVENT_SERVICE_CHECK;
01951                 new_event->event_data=(void *)svc;
01952                 new_event->event_args=(void *)NULL;
01953                 new_event->event_options=options;
01954                 new_event->run_time=svc->next_check;
01955                 new_event->recurring=FALSE;
01956                 new_event->event_interval=0L;
01957                 new_event->timing_func=NULL;
01958                 new_event->compensate_for_time_change=TRUE;
01959                 reschedule_event(new_event,&event_list_low,&event_list_low_tail);
01960         }
01961 
01962         else{
01963                 /* reset the next check time (it may be out of sync) */
01964                 if(temp_event!=NULL)
01965                         svc->next_check=temp_event->run_time;
01966 
01967                 log_debug_info(DEBUGL_CHECKS,2,"Keeping original service check event (ignoring the new one).\n");
01968         }
01969 
01970         return;
01971 }
01972 
01973 
01974 
01975 /* checks viability of performing a service check */
01976 int check_service_check_viability(service *svc, int check_options, int *time_is_valid, time_t *new_time){
01977         int result=OK;
01978         int perform_check=TRUE;
01979         time_t current_time=0L;
01980         time_t preferred_time=0L;
01981         int check_interval=0;
01982 
01983         log_debug_info(DEBUGL_FUNCTIONS,0,"check_service_check_viability()\n");
01984 
01985         /* make sure we have a service */
01986         if(svc==NULL)
01987                 return ERROR;
01988 
01989         /* get the check interval to use if we need to reschedule the check */
01990         if(svc->state_type==SOFT_STATE && svc->current_state!=STATE_OK)
01991                 check_interval=(svc->retry_interval*interval_length);
01992         else
01993                 check_interval=(svc->check_interval*interval_length);
01994 
01995         /* get the current time */
01996         time(&current_time);
01997 
01998         /* initialize the next preferred check time */
01999         preferred_time=current_time;
02000 
02001         /* can we check the host right now? */
02002         if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)){
02003 
02004                 /* if checks of the service are currently disabled... */
02005                 if(svc->checks_enabled==FALSE){
02006                         preferred_time=current_time+check_interval;
02007                         perform_check=FALSE;
02008 
02009                         log_debug_info(DEBUGL_CHECKS,2,"Active checks of the service are currently disabled.\n");
02010                 }
02011 
02012                 /* make sure this is a valid time to check the service */
02013                 if(check_time_against_period((unsigned long)current_time,svc->check_period_ptr)==ERROR){
02014                         preferred_time=current_time;
02015                         if(time_is_valid)
02016                                 *time_is_valid=FALSE;
02017                         perform_check=FALSE;
02018 
02019                         log_debug_info(DEBUGL_CHECKS,2,"This is not a valid time for this service to be actively checked.\n");
02020                 }
02021 
02022                 /* check service dependencies for execution */
02023                 if(check_service_dependencies(svc,EXECUTION_DEPENDENCY)==DEPENDENCIES_FAILED){
02024                         preferred_time=current_time+check_interval;
02025                         perform_check=FALSE;
02026 
02027                         log_debug_info(DEBUGL_CHECKS,2,"Execution dependencies for this service failed, so it will not be actively checked.\n");
02028                 }
02029         }
02030 
02031         /* pass back the next viable check time */
02032         if(new_time)
02033                 *new_time=preferred_time;
02034 
02035         result=(perform_check==TRUE)?OK:ERROR;
02036 
02037         return result;
02038 }
02039 
02040 
02041 
02042 /* checks service dependencies */
02043 int check_service_dependencies(service *svc,int dependency_type){
02044         servicedependency *temp_dependency=NULL;
02045         service *temp_service=NULL;
02046         int state=STATE_OK;
02047         time_t current_time=0L;
02048         void *ptr=NULL;
02049 
02050 
02051         log_debug_info(DEBUGL_FUNCTIONS,0,"check_service_dependencies()\n");
02052 
02053         /* check all dependencies... */
02054         for(temp_dependency=get_first_servicedependency_by_dependent_service(svc->host_name,svc->description,&ptr);temp_dependency!=NULL;temp_dependency=get_next_servicedependency_by_dependent_service(svc->host_name,svc->description,&ptr)){
02055 
02056                 /* only check dependencies of the desired type (notification or execution) */
02057                 if(temp_dependency->dependency_type!=dependency_type)
02058                         continue;
02059 
02060                 /* find the service we depend on... */
02061                 if((temp_service=temp_dependency->master_service_ptr)==NULL)
02062                         continue;
02063 
02064                 /* skip this dependency if it has a timeperiod and the current time isn't valid */
02065                 time(&current_time);
02066                 if(temp_dependency->dependency_period!=NULL && check_time_against_period(current_time,temp_dependency->dependency_period_ptr)==ERROR)
02067                         return FALSE;
02068 
02069                 /* get the status to use (use last hard state if its currently in a soft state) */
02070                 if(temp_service->state_type==SOFT_STATE && soft_state_dependencies==FALSE)
02071                         state=temp_service->last_hard_state;
02072                 else
02073                         state=temp_service->current_state;
02074 
02075                 /* is the service we depend on in state that fails the dependency tests? */
02076                 if(state==STATE_OK && temp_dependency->fail_on_ok==TRUE)
02077                         return DEPENDENCIES_FAILED;
02078                 if(state==STATE_WARNING && temp_dependency->fail_on_warning==TRUE)
02079                         return DEPENDENCIES_FAILED;
02080                 if(state==STATE_UNKNOWN && temp_dependency->fail_on_unknown==TRUE)
02081                         return DEPENDENCIES_FAILED;
02082                 if(state==STATE_CRITICAL && temp_dependency->fail_on_critical==TRUE)
02083                         return DEPENDENCIES_FAILED;
02084                 if((state==STATE_OK && temp_service->has_been_checked==FALSE) && temp_dependency->fail_on_pending==TRUE)
02085                         return DEPENDENCIES_FAILED;
02086 
02087                 /* immediate dependencies ok at this point - check parent dependencies if necessary */
02088                 if(temp_dependency->inherits_parent==TRUE){
02089                         if(check_service_dependencies(temp_service,dependency_type)!=DEPENDENCIES_OK)
02090                                 return DEPENDENCIES_FAILED;
02091                 }
02092         }
02093 
02094         return DEPENDENCIES_OK;
02095 }
02096 
02097 
02098 
02099 /* check for services that never returned from a check... */
02100 void check_for_orphaned_services(void){
02101         service *temp_service=NULL;
02102         time_t current_time=0L;
02103         time_t expected_time=0L;
02104 
02105 
02106         log_debug_info(DEBUGL_FUNCTIONS,0,"check_for_orphaned_services()\n");
02107 
02108         /* get the current time */
02109         time(&current_time);
02110 
02111         /* check all services... */
02112         for(temp_service=service_list;temp_service!=NULL;temp_service=temp_service->next){
02113 
02114                 /* skip services that are not currently executing */
02115                 if(temp_service->is_executing==FALSE)
02116                         continue;
02117 
02118                 /* determine the time at which the check results should have come in (allow 10 minutes slack time) */
02119                 expected_time=(time_t)(temp_service->next_check+temp_service->latency+service_check_timeout+check_reaper_interval+600);
02120 
02121                 /* this service was supposed to have executed a while ago, but for some reason the results haven't come back in... */
02122                 if(expected_time<current_time){
02123 
02124                         /* log a warning */
02125                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of service '%s' on host '%s' looks like it was orphaned (results never came back).  I'm scheduling an immediate check of the service...\n",temp_service->description,temp_service->host_name);
02126 
02127                         log_debug_info(DEBUGL_CHECKS,1,"Service '%s' on host '%s' was orphaned, so we're scheduling an immediate check...\n",temp_service->description,temp_service->host_name);
02128 
02129                         /* decrement the number of running service checks */
02130                         if(currently_running_service_checks>0)
02131                                 currently_running_service_checks--;
02132 
02133                         /* disable the executing flag */
02134                         temp_service->is_executing=FALSE;
02135 
02136                         /* schedule an immediate check of the service */
02137                         schedule_service_check(temp_service,current_time,CHECK_OPTION_ORPHAN_CHECK);
02138                 }
02139 
02140         }
02141 
02142         return;
02143 }
02144 
02145 
02146 
02147 /* check freshness of service results */
02148 void check_service_result_freshness(void){
02149         service *temp_service=NULL;
02150         time_t current_time=0L;
02151 
02152 
02153         log_debug_info(DEBUGL_FUNCTIONS,0,"check_service_result_freshness()\n");
02154         log_debug_info(DEBUGL_CHECKS,1,"Checking the freshness of service check results...\n");
02155 
02156         /* bail out if we're not supposed to be checking freshness */
02157         if(check_service_freshness==FALSE){
02158                 log_debug_info(DEBUGL_CHECKS,1,"Service freshness checking is disabled.\n");
02159                 return;
02160         }
02161 
02162         /* get the current time */
02163         time(&current_time);
02164 
02165         /* check all services... */
02166         for(temp_service=service_list;temp_service!=NULL;temp_service=temp_service->next){
02167 
02168                 /* skip services we shouldn't be checking for freshness */
02169                 if(temp_service->check_freshness==FALSE)
02170                         continue;
02171 
02172                 /* skip services that are currently executing (problems here will be caught by orphaned service check) */
02173                 if(temp_service->is_executing==TRUE)
02174                         continue;
02175 
02176                 /* skip services that have both active and passive checks disabled */
02177                 if(temp_service->checks_enabled==FALSE && temp_service->accept_passive_service_checks==FALSE)
02178                         continue;
02179 
02180                 /* skip services that are already being freshened */
02181                 if(temp_service->is_being_freshened==TRUE)
02182                         continue;
02183 
02184                 /* see if the time is right... */
02185                 if(check_time_against_period(current_time,temp_service->check_period_ptr)==ERROR)
02186                         continue;
02187 
02188                 /* EXCEPTION */
02189                 /* don't check freshness of services without regular check intervals if we're using auto-freshness threshold */
02190                 if(temp_service->check_interval==0 && temp_service->freshness_threshold==0)
02191                         continue;
02192 
02193                 /* the results for the last check of this service are stale! */
02194                 if(is_service_result_fresh(temp_service,current_time,TRUE)==FALSE){
02195 
02196                         /* set the freshen flag */
02197                         temp_service->is_being_freshened=TRUE;
02198 
02199                         /* schedule an immediate forced check of the service */
02200                         schedule_service_check(temp_service,current_time,CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK);
02201                 }
02202 
02203         }
02204 
02205         return;
02206 }
02207 
02208 
02209 
02210 /* tests whether or not a service's check results are fresh */
02211 int is_service_result_fresh(service *temp_service, time_t current_time, int log_this){
02212         int freshness_threshold=0;
02213         time_t expiration_time=0L;
02214         int days=0;
02215         int hours=0;
02216         int minutes=0;
02217         int seconds=0;
02218         int tdays=0;
02219         int thours=0;
02220         int tminutes=0;
02221         int tseconds=0;
02222 
02223         log_debug_info(DEBUGL_CHECKS,2,"Checking freshness of service '%s' on host '%s'...\n",temp_service->description,temp_service->host_name);
02224 
02225         /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? */
02226         if(temp_service->freshness_threshold==0){
02227                 if(temp_service->state_type==HARD_STATE || temp_service->current_state==STATE_OK)
02228                         freshness_threshold=(temp_service->check_interval*interval_length)+temp_service->latency+additional_freshness_latency;
02229                 else
02230                         freshness_threshold=(temp_service->retry_interval*interval_length)+temp_service->latency+additional_freshness_latency;
02231         }
02232         else
02233                 freshness_threshold=temp_service->freshness_threshold;
02234 
02235         log_debug_info(DEBUGL_CHECKS,2,"Freshness thresholds: service=%d, use=%d\n",temp_service->freshness_threshold,freshness_threshold);
02236 
02237         /* calculate expiration time */
02238         /* CHANGED 11/10/05 EG - program start is only used in expiration time calculation if > last check AND active checks are enabled, so active checks can become stale immediately upon program startup */
02239         /* CHANGED 02/25/06 SG - passive checks also become stale, so remove dependence on active check logic */
02240         if(temp_service->has_been_checked==FALSE)
02241                 expiration_time=(time_t)(event_start+freshness_threshold);
02242         /* CHANGED 06/19/07 EG - Per Ton's suggestion (and user requests), only use program start time over last check if no specific threshold has been set by user.  Otheriwse use it.  Problems can occur if Icinga is restarted more frequently that freshness threshold intervals (services never go stale). */
02243         /* CHANGED 10/07/07 EG - Only match next condition for services that have active checks enabled... */
02244         /* CHANGED 10/07/07 EG - Added max_service_check_spread to expiration time as suggested by Altinity */
02245         else if(temp_service->checks_enabled==TRUE && event_start>temp_service->last_check && temp_service->freshness_threshold==0)
02246                 expiration_time=(time_t)(event_start+freshness_threshold+(max_service_check_spread*interval_length));
02247         else
02248                 expiration_time=(time_t)(temp_service->last_check+freshness_threshold);
02249 
02250         log_debug_info(DEBUGL_CHECKS,2,"HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n",temp_service->has_been_checked,(unsigned long)program_start,(unsigned long)event_start,(unsigned long)temp_service->last_check,(unsigned long)current_time,(unsigned long)expiration_time);
02251 
02252         /* the results for the last check of this service are stale */
02253         if(expiration_time<current_time){
02254 
02255                 get_time_breakdown((current_time-expiration_time),&days,&hours,&minutes,&seconds);
02256                 get_time_breakdown(freshness_threshold,&tdays,&thours,&tminutes,&tseconds);
02257 
02258                 /* log a warning */
02259                 if(log_this==TRUE)
02260                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The results of service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  I'm forcing an immediate check of the service.\n",temp_service->description,temp_service->host_name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds);
02261 
02262                 log_debug_info(DEBUGL_CHECKS,1,"Check results for service '%s' on host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  Forcing an immediate check of the service...\n",temp_service->description,temp_service->host_name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds);
02263 
02264                 return FALSE;
02265         }
02266 
02267         log_debug_info(DEBUGL_CHECKS,1,"Check results for service '%s' on host '%s' are fresh.\n",temp_service->description,temp_service->host_name);
02268 
02269         return TRUE;
02270 }
02271 
02272 
02273 
02274 
02275 /******************************************************************/
02276 /*************** COMMON ROUTE/HOST CHECK FUNCTIONS ****************/
02277 /******************************************************************/
02278 
02279 /* execute an on-demand check  */
02280 int perform_on_demand_host_check(host *hst, int *check_return_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon){
02281 
02282         log_debug_info(DEBUGL_FUNCTIONS,0,"perform_on_demand_host_check()\n");
02283 
02284         perform_on_demand_host_check_3x(hst,check_return_code,check_options,use_cached_result,check_timestamp_horizon);
02285 
02286         return OK;
02287 }
02288 
02289 
02290 
02291 /* execute a scheduled host check using either the 2.x or 3.x logic */
02292 int perform_scheduled_host_check(host *hst, int check_options, double latency){
02293 
02294         log_debug_info(DEBUGL_FUNCTIONS,0,"perform_scheduled_host_check()\n");
02295 
02296         run_scheduled_host_check_3x(hst,check_options,latency);
02297 
02298         return OK;
02299 }
02300 
02301 
02302 
02303 /* schedules an immediate or delayed host check */
02304 void schedule_host_check(host *hst, time_t check_time, int options){
02305         timed_event *temp_event=NULL;
02306         timed_event *new_event=NULL;
02307         int found=FALSE;
02308         int use_original_event=TRUE;
02309 
02310 
02311         log_debug_info(DEBUGL_FUNCTIONS,0,"schedule_host_check()\n");
02312 
02313         if(hst==NULL)
02314                 return;
02315 
02316         log_debug_info(DEBUGL_CHECKS,0,"Scheduling a %s, active check of host '%s' @ %s",(options & CHECK_OPTION_FORCE_EXECUTION)?"forced":"non-forced",hst->name,ctime(&check_time));
02317 
02318         /* don't schedule a check if active checks of this host are disabled */
02319         if(hst->checks_enabled==FALSE && !(options & CHECK_OPTION_FORCE_EXECUTION)){
02320                 log_debug_info(DEBUGL_CHECKS,0,"Active checks are disabled for this host.\n");
02321                 return;
02322         }
02323 
02324         /* allocate memory for a new event item */
02325         if((new_event=(timed_event *)malloc(sizeof(timed_event)))==NULL){
02326 
02327                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Could not reschedule check of host '%s'!\n",hst->name);
02328 
02329                 return;
02330         }
02331 
02332         /* default is to use the new event */
02333         use_original_event=FALSE;
02334         found=FALSE;
02335 
02336 #ifdef PERFORMANCE_INCREASE_BUT_VERY_BAD_IDEA_INDEED
02337         /* WARNING! 1/19/07 on-demand async host checks will end up causing multiple scheduled checks of a host to appear in the queue if the code below is skipped */
02338         /* if(use_large_installation_tweaks==FALSE)... skip code below */
02339 #endif
02340 
02341         /* see if there are any other scheduled checks of this host in the queue */
02342         for(temp_event=event_list_low;temp_event!=NULL;temp_event=temp_event->next){
02343                 if(temp_event->event_type==EVENT_HOST_CHECK && hst==(host *)temp_event->event_data){
02344                         found=TRUE;
02345                         break;
02346                 }
02347         }
02348 
02349         /* we found another host check event for this host in the queue - what should we do? */
02350         if(found==TRUE && temp_event!=NULL){
02351 
02352                 log_debug_info(DEBUGL_CHECKS,2,"Found another host check event for this host @ %s",ctime(&temp_event->run_time));
02353 
02354                 /* use the originally scheduled check unless we decide otherwise */
02355                 use_original_event=TRUE;
02356 
02357                 /* the original event is a forced check... */
02358                 if((temp_event->event_options & CHECK_OPTION_FORCE_EXECUTION)){
02359 
02360                         /* the new event is also forced and its execution time is earlier than the original, so use it instead */
02361                         if((options & CHECK_OPTION_FORCE_EXECUTION) && (check_time < temp_event->run_time)){
02362                                 log_debug_info(DEBUGL_CHECKS,2,"New host check event is forced and occurs before the existing event, so the new event be used instead.\n");
02363                                 use_original_event=FALSE;
02364                         }
02365                 }
02366 
02367                 /* the original event is not a forced check... */
02368                 else{
02369 
02370                         /* the new event is a forced check, so use it instead */
02371                         if((options & CHECK_OPTION_FORCE_EXECUTION)){
02372                                 use_original_event=FALSE;
02373                                 log_debug_info(DEBUGL_CHECKS,2,"New host check event is forced, so it will be used instead of the existing event.\n");
02374                         }
02375 
02376                         /* the new event is not forced either and its execution time is earlier than the original, so use it instead */
02377                         else if(check_time < temp_event->run_time){
02378                                 use_original_event=FALSE;
02379                                 log_debug_info(DEBUGL_CHECKS,2,"New host check event occurs before the existing (older) event, so it will be used instead.\n");
02380                         }
02381 
02382                         /* the new event is older, so override the existing one */
02383                         else{
02384                                 log_debug_info(DEBUGL_CHECKS,2,"New host check event occurs after the existing event, so we'll ignore it.\n");
02385                         }
02386                 }
02387 
02388                 /* the originally queued event won the battle, so keep it */
02389                 if(use_original_event==TRUE){
02390                         my_free(new_event);
02391                 }
02392 
02393                 /* else use the new event, so remove the old */
02394                 else{
02395                         remove_event(temp_event,&event_list_low,&event_list_low_tail);
02396                         my_free(temp_event);
02397                 }
02398         }
02399 
02400         /* save check options for retention purposes */
02401         hst->check_options=options;
02402 
02403         /* use the new event */
02404         if(use_original_event==FALSE){
02405 
02406                 log_debug_info(DEBUGL_CHECKS,2,"Scheduling new host check event.\n");
02407 
02408                 /* set the next host check time */
02409                 hst->next_check=check_time;
02410 
02411                 /* place the new event in the event queue */
02412                 new_event->event_type=EVENT_HOST_CHECK;
02413                 new_event->event_data=(void *)hst;
02414                 new_event->event_args=(void *)NULL;
02415                 new_event->event_options=options;
02416                 new_event->run_time=hst->next_check;
02417                 new_event->recurring=FALSE;
02418                 new_event->event_interval=0L;
02419                 new_event->timing_func=NULL;
02420                 new_event->compensate_for_time_change=TRUE;
02421                 reschedule_event(new_event,&event_list_low,&event_list_low_tail);
02422         }
02423 
02424         else{
02425                 /* reset the next check time (it may be out of sync) */
02426                 if(temp_event!=NULL)
02427                         hst->next_check=temp_event->run_time;
02428 
02429                 log_debug_info(DEBUGL_CHECKS,2,"Keeping original host check event (ignoring the new one).\n");
02430         }
02431 
02432         return;
02433 }
02434 
02435 
02436 
02437 /* checks host dependencies */
02438 int check_host_dependencies(host *hst,int dependency_type){
02439         hostdependency *temp_dependency=NULL;
02440         host *temp_host=NULL;
02441         int state=HOST_UP;
02442         time_t current_time=0L;
02443         void *ptr=NULL;
02444 
02445 
02446         log_debug_info(DEBUGL_FUNCTIONS,0,"check_host_dependencies()\n");
02447 
02448         /* check all dependencies... */
02449         for(temp_dependency=get_first_hostdependency_by_dependent_host(hst->name,&ptr);temp_dependency!=NULL;temp_dependency=get_next_hostdependency_by_dependent_host(hst->name,&ptr)){
02450 
02451                 /* only check dependencies of the desired type (notification or execution) */
02452                 if(temp_dependency->dependency_type!=dependency_type)
02453                         continue;
02454 
02455                 /* find the host we depend on... */
02456                 if((temp_host=temp_dependency->master_host_ptr)==NULL)
02457                         continue;
02458 
02459                 /* skip this dependency if it has a timeperiod and the current time isn't valid */
02460                 time(&current_time);
02461                 if(temp_dependency->dependency_period!=NULL && check_time_against_period(current_time,temp_dependency->dependency_period_ptr)==ERROR)
02462                         return FALSE;
02463 
02464                 /* get the status to use (use last hard state if its currently in a soft state) */
02465                 if(temp_host->state_type==SOFT_STATE && soft_state_dependencies==FALSE)
02466                         state=temp_host->last_hard_state;
02467                 else
02468                         state=temp_host->current_state;
02469 
02470                 /* is the host we depend on in state that fails the dependency tests? */
02471                 if(state==HOST_UP && temp_dependency->fail_on_up==TRUE)
02472                         return DEPENDENCIES_FAILED;
02473                 if(state==HOST_DOWN && temp_dependency->fail_on_down==TRUE)
02474                         return DEPENDENCIES_FAILED;
02475                 if(state==HOST_UNREACHABLE && temp_dependency->fail_on_unreachable==TRUE)
02476                         return DEPENDENCIES_FAILED;
02477                 if((state==HOST_UP && temp_host->has_been_checked==FALSE) && temp_dependency->fail_on_pending==TRUE)
02478                         return DEPENDENCIES_FAILED;
02479 
02480                 /* immediate dependencies ok at this point - check parent dependencies if necessary */
02481                 if(temp_dependency->inherits_parent==TRUE){
02482                         if(check_host_dependencies(temp_host,dependency_type)!=DEPENDENCIES_OK)
02483                                 return DEPENDENCIES_FAILED;
02484                 }
02485         }
02486 
02487         return DEPENDENCIES_OK;
02488 }
02489 
02490 
02491 
02492 /* check for hosts that never returned from a check... */
02493 void check_for_orphaned_hosts(void){
02494         host *temp_host=NULL;
02495         time_t current_time=0L;
02496         time_t expected_time=0L;
02497 
02498 
02499         log_debug_info(DEBUGL_FUNCTIONS,0,"check_for_orphaned_hosts()\n");
02500 
02501         /* get the current time */
02502         time(&current_time);
02503 
02504         /* check all hosts... */
02505         for(temp_host=host_list;temp_host!=NULL;temp_host=temp_host->next){
02506 
02507                 /* skip hosts that don't have a set check interval (on-demand checks are missed by the orphan logic) */
02508                 if(temp_host->next_check==(time_t)0L)
02509                         continue;
02510 
02511                 /* skip hosts that are not currently executing */
02512                 if(temp_host->is_executing==FALSE)
02513                         continue;
02514 
02515                 /* determine the time at which the check results should have come in (allow 10 minutes slack time) */
02516                 expected_time=(time_t)(temp_host->next_check+temp_host->latency+host_check_timeout+check_reaper_interval+600);
02517 
02518                 /* this host was supposed to have executed a while ago, but for some reason the results haven't come back in... */
02519                 if(expected_time<current_time){
02520 
02521                         /* log a warning */
02522                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of host '%s' looks like it was orphaned (results never came back).  I'm scheduling an immediate check of the host...\n",temp_host->name);
02523 
02524                         log_debug_info(DEBUGL_CHECKS,1,"Host '%s' was orphaned, so we're scheduling an immediate check...\n",temp_host->name);
02525 
02526                         /* decrement the number of running host checks */
02527                         if(currently_running_host_checks>0)
02528                                 currently_running_host_checks--;
02529 
02530                         /* disable the executing flag */
02531                         temp_host->is_executing=FALSE;
02532 
02533                         /* schedule an immediate check of the host */
02534                         schedule_host_check(temp_host,current_time,CHECK_OPTION_ORPHAN_CHECK);
02535                 }
02536 
02537         }
02538 
02539         return;
02540 }
02541 
02542 
02543 
02544 /* check freshness of host results */
02545 void check_host_result_freshness(void){
02546         host *temp_host=NULL;
02547         time_t current_time=0L;
02548 
02549 
02550         log_debug_info(DEBUGL_FUNCTIONS,0,"check_host_result_freshness()\n");
02551         log_debug_info(DEBUGL_CHECKS,2,"Attempting to check the freshness of host check results...\n");
02552 
02553         /* bail out if we're not supposed to be checking freshness */
02554         if(check_host_freshness==FALSE){
02555                 log_debug_info(DEBUGL_CHECKS,2,"Host freshness checking is disabled.\n");
02556                 return;
02557         }
02558 
02559         /* get the current time */
02560         time(&current_time);
02561 
02562         /* check all hosts... */
02563         for(temp_host=host_list;temp_host!=NULL;temp_host=temp_host->next){
02564 
02565                 /* skip hosts we shouldn't be checking for freshness */
02566                 if(temp_host->check_freshness==FALSE)
02567                         continue;
02568 
02569                 /* skip hosts that have both active and passive checks disabled */
02570                 if(temp_host->checks_enabled==FALSE && temp_host->accept_passive_host_checks==FALSE)
02571                         continue;
02572 
02573                 /* skip hosts that are currently executing (problems here will be caught by orphaned host check) */
02574                 if(temp_host->is_executing==TRUE)
02575                         continue;
02576 
02577                 /* skip hosts that are already being freshened */
02578                 if(temp_host->is_being_freshened==TRUE)
02579                         continue;
02580 
02581                 /* see if the time is right... */
02582                 if(check_time_against_period(current_time,temp_host->check_period_ptr)==ERROR)
02583                         continue;
02584 
02585                 /* the results for the last check of this host are stale */
02586                 if(is_host_result_fresh(temp_host,current_time,TRUE)==FALSE){
02587 
02588                         /* set the freshen flag */
02589                         temp_host->is_being_freshened=TRUE;
02590 
02591                         /* schedule an immediate forced check of the host */
02592                         schedule_host_check(temp_host,current_time,CHECK_OPTION_FORCE_EXECUTION | CHECK_OPTION_FRESHNESS_CHECK);
02593                 }
02594         }
02595 
02596         return;
02597 }
02598 
02599 
02600 
02601 /* checks to see if a hosts's check results are fresh */
02602 int is_host_result_fresh(host *temp_host, time_t current_time, int log_this){
02603         time_t expiration_time=0L;
02604         int freshness_threshold=0;
02605         int days=0;
02606         int hours=0;
02607         int minutes=0;
02608         int seconds=0;
02609         int tdays=0;
02610         int thours=0;
02611         int tminutes=0;
02612         int tseconds=0;
02613 
02614         log_debug_info(DEBUGL_CHECKS,2,"Checking freshness of host '%s'...\n",temp_host->name);
02615 
02616         /* use user-supplied freshness threshold or auto-calculate a freshness threshold to use? */
02617         if(temp_host->freshness_threshold==0)
02618                 freshness_threshold=(temp_host->check_interval*interval_length)+temp_host->latency+additional_freshness_latency;
02619         else
02620                 freshness_threshold=temp_host->freshness_threshold;
02621 
02622         log_debug_info(DEBUGL_CHECKS,2,"Freshness thresholds: host=%d, use=%d\n",temp_host->freshness_threshold,freshness_threshold);
02623 
02624         /* calculate expiration time */
02625         /* CHANGED 11/10/05 EG - program start is only used in expiration time calculation if > last check AND active checks are enabled, so active checks can become stale immediately upon program startup */
02626         if(temp_host->has_been_checked==FALSE)
02627                 expiration_time=(time_t)(event_start+freshness_threshold);
02628         /* CHANGED 06/19/07 EG - Per Ton's suggestion (and user requests), only use program start time over last check if no specific threshold has been set by user.  Otheriwse use it.  Problems can occur if Icinga is restarted more frequently that freshness threshold intervals (hosts never go stale). */
02629         /* CHANGED 10/07/07 EG - Added max_host_check_spread to expiration time as suggested by Altinity */
02630         else if(temp_host->checks_enabled==TRUE && event_start>temp_host->last_check && temp_host->freshness_threshold==0)
02631                 expiration_time=(time_t)(event_start+freshness_threshold+(max_host_check_spread*interval_length));
02632         else
02633                 expiration_time=(time_t)(temp_host->last_check+freshness_threshold);
02634 
02635         log_debug_info(DEBUGL_CHECKS,2,"HBC: %d, PS: %lu, ES: %lu, LC: %lu, CT: %lu, ET: %lu\n",temp_host->has_been_checked,(unsigned long)program_start,(unsigned long)event_start,(unsigned long)temp_host->last_check,(unsigned long)current_time,(unsigned long)expiration_time);
02636 
02637         /* the results for the last check of this host are stale */
02638         if(expiration_time<current_time){
02639 
02640                 get_time_breakdown((current_time-expiration_time),&days,&hours,&minutes,&seconds);
02641                 get_time_breakdown(freshness_threshold,&tdays,&thours,&tminutes,&tseconds);
02642 
02643                 /* log a warning */
02644                 if(log_this==TRUE)
02645                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The results of host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  I'm forcing an immediate check of the host.\n",temp_host->name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds);
02646 
02647                 log_debug_info(DEBUGL_CHECKS,1,"Check results for host '%s' are stale by %dd %dh %dm %ds (threshold=%dd %dh %dm %ds).  Forcing an immediate check of the host...\n",temp_host->name,days,hours,minutes,seconds,tdays,thours,tminutes,tseconds);
02648 
02649                 return FALSE;
02650         }
02651         else
02652                 log_debug_info(DEBUGL_CHECKS,1,"Check results for host '%s' are fresh.\n",temp_host->name);
02653 
02654         return TRUE;
02655 }
02656 
02657 
02658 
02659 /******************************************************************/
02660 /************* Icinga 3.X ROUTE/HOST CHECK FUNCTIONS **************/
02661 /******************************************************************/
02662 
02663 
02664 /*** ON-DEMAND HOST CHECKS USE THIS FUNCTION ***/
02665 /* check to see if we can reach the host */
02666 int perform_on_demand_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon){
02667         int result=OK;
02668 
02669         log_debug_info(DEBUGL_FUNCTIONS,0,"perform_on_demand_host_check_3x()\n");
02670 
02671         /* make sure we have a host */
02672         if(hst==NULL)
02673                 return ERROR;
02674 
02675         log_debug_info(DEBUGL_CHECKS,0,"** On-demand check for host '%s'...\n",hst->name);
02676 
02677         /* check the status of the host */
02678         result=run_sync_host_check_3x(hst,check_result_code,check_options,use_cached_result,check_timestamp_horizon);
02679 
02680         return result;
02681 }
02682 
02683 
02684 
02685 /* perform a synchronous check of a host */
02686 /* on-demand host checks will use this... */
02687 int run_sync_host_check_3x(host *hst, int *check_result_code, int check_options, int use_cached_result, unsigned long check_timestamp_horizon){
02688         int result=OK;
02689         time_t current_time=0L;
02690         int host_result=HOST_UP;
02691         char *old_plugin_output=NULL;
02692         struct timeval start_time;
02693         struct timeval end_time;
02694 
02695 
02696         log_debug_info(DEBUGL_FUNCTIONS,0,"run_sync_host_check_3x()\n");
02697 
02698         /* make sure we have a host */
02699         if(hst==NULL)
02700                 return ERROR;
02701 
02702         log_debug_info(DEBUGL_CHECKS,0,"** Run sync check of host '%s'...\n",hst->name);
02703 
02704         /* is the host check viable at this time? */
02705         /* if not, return current state and bail out */
02706         if(check_host_check_viability_3x(hst,check_options,NULL,NULL)==ERROR){
02707                 if(check_result_code)
02708                         *check_result_code=hst->current_state;
02709                 log_debug_info(DEBUGL_CHECKS,0,"Host check is not viable at this time.\n");
02710                 return OK;
02711         }
02712 
02713         /* get the current time */
02714         time(&current_time);
02715 
02716         /* high resolution start time for event broker */
02717         gettimeofday(&start_time,NULL);
02718 
02719         /* can we use the last cached host state? */
02720         if(use_cached_result==TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)){
02721 
02722                 /* we can used the cached result, so return it and get out of here... */
02723                 if(hst->has_been_checked==TRUE && ((current_time-hst->last_check) <= check_timestamp_horizon)){
02724                         if(check_result_code)
02725                                 *check_result_code=hst->current_state;
02726 
02727                         log_debug_info(DEBUGL_CHECKS,1,"* Using cached host state: %d\n",hst->current_state);
02728 
02729                         /* update check statistics */
02730                         update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time);
02731                         update_check_stats(ACTIVE_CACHED_HOST_CHECK_STATS,current_time);
02732 
02733                         return OK;
02734                 }
02735         }
02736 
02737 
02738         log_debug_info(DEBUGL_CHECKS,1,"* Running actual host check: old state=%d\n",hst->current_state);
02739 
02740 
02741         /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/
02742 
02743         /* update check statistics */
02744         update_check_stats(ACTIVE_ONDEMAND_HOST_CHECK_STATS,current_time);
02745         update_check_stats(SERIAL_HOST_CHECK_STATS,start_time.tv_sec);
02746 
02747         /* reset host check latency, since on-demand checks have none */
02748         hst->latency=0.0;
02749 
02750         /* adjust host check attempt */
02751         adjust_host_check_attempt_3x(hst,TRUE);
02752 
02753         /* save old host state */
02754         hst->last_state=hst->current_state;
02755         if(hst->state_type==HARD_STATE)
02756                 hst->last_hard_state=hst->current_state;
02757 
02758         /* save old plugin output for state stalking */
02759         if(hst->plugin_output)
02760                 old_plugin_output=(char *)strdup(hst->plugin_output);
02761 
02762         /* set the checked flag */
02763         hst->has_been_checked=TRUE;
02764 
02765         /* clear the freshness flag */
02766         hst->is_being_freshened=FALSE;
02767 
02768         /* clear check options - we don't want old check options retained */
02769         hst->check_options=CHECK_OPTION_NONE;
02770 
02771         /* set the check type */
02772         hst->check_type=HOST_CHECK_ACTIVE;
02773 
02774 
02775         /*********** EXECUTE THE CHECK AND PROCESS THE RESULTS **********/
02776 
02777 #ifdef USE_EVENT_BROKER
02778         /* send data to event broker */
02779         end_time.tv_sec=0L;
02780         end_time.tv_usec=0L;
02781         broker_host_check(NEBTYPE_HOSTCHECK_INITIATE,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,NULL,NULL,NULL,NULL,NULL);
02782 #endif
02783 
02784         /* execute the host check */
02785         host_result=execute_sync_host_check_3x(hst);
02786 
02787         /* process the host check result */
02788         process_host_check_result_3x(hst,host_result,old_plugin_output,check_options,FALSE,use_cached_result,check_timestamp_horizon);
02789 
02790         /* free memory */
02791         my_free(old_plugin_output);
02792 
02793         log_debug_info(DEBUGL_CHECKS,1,"* Sync host check done: new state=%d\n",hst->current_state);
02794 
02795         /* high resolution end time for event broker */
02796         gettimeofday(&end_time,NULL);
02797 
02798 #ifdef USE_EVENT_BROKER
02799         /* send data to event broker */
02800         broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,hst->execution_time,host_check_timeout,FALSE,hst->current_state,hst->processed_command,hst->plugin_output,hst->long_plugin_output,hst->perf_data,NULL);
02801 #endif
02802 
02803         return result;
02804 }
02805 
02806 
02807 
02808 /* run an "alive" check on a host */
02809 /* on-demand host checks will use this... */
02810 int execute_sync_host_check_3x(host *hst){
02811         icinga_macros mac;
02812         int result=STATE_OK;
02813         int return_result=HOST_UP;
02814         char *processed_command=NULL;
02815         char *raw_command=NULL;
02816         struct timeval start_time;
02817         struct timeval end_time;
02818         char *temp_ptr;
02819         int early_timeout=FALSE;
02820         double exectime;
02821         char *temp_plugin_output=NULL;
02822 #ifdef USE_EVENT_BROKER
02823         int neb_result=OK;
02824 #endif
02825 
02826 
02827         log_debug_info(DEBUGL_FUNCTIONS,0,"execute_sync_host_check_3x()\n");
02828 
02829         if(hst==NULL)
02830                 return HOST_DOWN;
02831 
02832         log_debug_info(DEBUGL_CHECKS,0,"** Executing sync check of host '%s'...\n",hst->name);
02833 
02834 #ifdef USE_EVENT_BROKER
02835         /* initialize start/end times */
02836         start_time.tv_sec=0L;
02837         start_time.tv_usec=0L;
02838         end_time.tv_sec=0L;
02839         end_time.tv_usec=0L;
02840 
02841         /* send data to event broker */
02842         neb_result=broker_host_check(NEBTYPE_HOSTCHECK_SYNC_PRECHECK,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,NULL,NULL,NULL,NULL,NULL);
02843 
02844         /* neb module wants to cancel the host check - return the current state of the host */
02845         if(neb_result==NEBERROR_CALLBACKCANCEL)
02846                 return hst->current_state;
02847 
02848         /* neb module wants to override the host check - perhaps it will check the host itself */
02849         /* NOTE: if a module does this, it must check the status of the host and populate the data structures BEFORE it returns from the callback! */
02850         if(neb_result==NEBERROR_CALLBACKOVERRIDE)
02851                 return hst->current_state;
02852 #endif
02853 
02854         /* grab the host macros */
02855         memset(&mac, 0, sizeof(mac));
02856         grab_host_macros_r(&mac, hst);
02857 
02858         /* high resolution start time for event broker */
02859         gettimeofday(&start_time,NULL);
02860 
02861         /* get the last host check time */
02862         time(&hst->last_check);
02863 
02864         /* get the raw command line */
02865         get_raw_command_line_r(&mac, hst->check_command_ptr,hst->host_check_command,&raw_command,0);
02866         if(raw_command==NULL) {
02867                 clear_volatile_macros_r(&mac);
02868                 return ERROR;
02869         }
02870 
02871         /* process any macros contained in the argument */
02872         process_macros_r(&mac, raw_command,&processed_command,0);
02873         if(processed_command==NULL) {
02874                 clear_volatile_macros_r(&mac);
02875                 return ERROR;
02876         }
02877 
02878         my_free(hst->processed_command);
02879         hst->processed_command=strdup(processed_command);
02880 
02881 #ifdef USE_EVENT_BROKER
02882         /* send data to event broker */
02883         end_time.tv_sec=0L;
02884         end_time.tv_usec=0L;
02885         broker_host_check(NEBTYPE_HOSTCHECK_RAW_START,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,return_result,hst->state_type,start_time,end_time,hst->host_check_command,0.0,0.0,host_check_timeout,early_timeout,result,processed_command,hst->plugin_output,hst->long_plugin_output,hst->perf_data,NULL);
02886 #endif
02887 
02888         log_debug_info(DEBUGL_COMMANDS,1,"Raw host check command: %s\n",raw_command);
02889         log_debug_info(DEBUGL_COMMANDS,0,"Processed host check ommand: %s\n",processed_command);
02890 
02891         /* clear plugin output and performance data buffers */
02892         my_free(hst->plugin_output);
02893         my_free(hst->long_plugin_output);
02894         my_free(hst->perf_data);
02895 
02896         /* run the host check command */
02897         result=my_system_r(&mac, processed_command,host_check_timeout,&early_timeout,&exectime,&temp_plugin_output,MAX_PLUGIN_OUTPUT_LENGTH);
02898         clear_volatile_macros_r(&mac);
02899 
02900         /* if the check timed out, report an error */
02901         if(early_timeout==TRUE){
02902 
02903                 my_free(temp_plugin_output);
02904                 dummy=asprintf(&temp_plugin_output,"Host check timed out after %d seconds\n",host_check_timeout);
02905 
02906                 /* log the timeout */
02907                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Host check command '%s' for host '%s' timed out after %d seconds\n",processed_command,hst->name,host_check_timeout);
02908         }
02909 
02910         /* calculate total execution time */
02911         hst->execution_time=exectime;
02912 
02913         /* record check type */
02914         hst->check_type=HOST_CHECK_ACTIVE;
02915 
02916         /* parse the output: short and long output, and perf data */
02917         parse_check_output(temp_plugin_output,&hst->plugin_output,&hst->long_plugin_output,&hst->perf_data,TRUE,TRUE);
02918 
02919         /* free memory */
02920         my_free(temp_plugin_output);
02921         my_free(raw_command);
02922         my_free(processed_command);
02923 
02924         /* a NULL host check command means we should assume the host is UP */
02925         if(hst->host_check_command==NULL){
02926                 my_free(hst->plugin_output);
02927                 hst->plugin_output=(char *)strdup("(Host assumed to be UP)");
02928                 result=STATE_OK;
02929         }
02930 
02931         /* make sure we have some data */
02932         if(hst->plugin_output==NULL || !strcmp(hst->plugin_output,"")){
02933                 my_free(hst->plugin_output);
02934                 hst->plugin_output=(char *)strdup("(No output returned from host check)");
02935         }
02936 
02937         /* replace semicolons in plugin output (but not performance data) with colons */
02938         if((temp_ptr=hst->plugin_output)){
02939                 while((temp_ptr=strchr(temp_ptr,';')))
02940                         *temp_ptr=':';
02941         }
02942 
02943         /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
02944         if(use_aggressive_host_checking==FALSE && result==STATE_WARNING)
02945                 result=STATE_OK;
02946 
02947 
02948         if(result==STATE_OK)
02949                 return_result=HOST_UP;
02950         else
02951                 return_result=HOST_DOWN;
02952 
02953         /* high resolution end time for event broker */
02954         gettimeofday(&end_time,NULL);
02955 
02956 #ifdef USE_EVENT_BROKER
02957         /* send data to event broker */
02958         broker_host_check(NEBTYPE_HOSTCHECK_RAW_END,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,return_result,hst->state_type,start_time,end_time,hst->host_check_command,0.0,exectime,host_check_timeout,early_timeout,result,processed_command,hst->plugin_output,hst->long_plugin_output,hst->perf_data,NULL);
02959 #endif
02960 
02961         log_debug_info(DEBUGL_CHECKS,0,"** Sync host check done: state=%d\n",return_result);
02962 
02963         return return_result;
02964 }
02965 
02966 
02967 
02968 /* run a scheduled host check asynchronously */
02969 int run_scheduled_host_check_3x(host *hst, int check_options, double latency){
02970         int result=OK;
02971         time_t current_time=0L;
02972         time_t preferred_time=0L;
02973         time_t next_valid_time=0L;
02974         int time_is_valid=TRUE;
02975 
02976 
02977         log_debug_info(DEBUGL_FUNCTIONS,0,"run_scheduled_host_check_3x()\n");
02978 
02979         if(hst==NULL)
02980                 return ERROR;
02981 
02982         log_debug_info(DEBUGL_CHECKS,0,"Attempting to run scheduled check of host '%s': check options=%d, latency=%lf\n",hst->name,check_options,latency);
02983 
02984         /* attempt to run the check */
02985         result=run_async_host_check_3x(hst,check_options,latency,TRUE,TRUE,&time_is_valid,&preferred_time);
02986 
02987         /* an error occurred, so reschedule the check */
02988         if(result==ERROR){
02989 
02990                 log_debug_info(DEBUGL_CHECKS,1,"Unable to run scheduled host check at this time\n");
02991 
02992                 /* only attempt to (re)schedule checks that should get checked... */
02993                 if(hst->should_be_scheduled==TRUE){
02994 
02995                         /* get current time */
02996                         time(&current_time);
02997 
02998                         /* determine next time we should check the host if needed */
02999                         /* if host has no check interval, schedule it again for 5 minutes from now */
03000                         if(current_time>=preferred_time)
03001                                 preferred_time=current_time+((hst->check_interval<=0)?300:(hst->check_interval*interval_length));
03002 
03003                         /* make sure we rescheduled the next host check at a valid time */
03004                         get_next_valid_time(preferred_time,&next_valid_time,hst->check_period_ptr);
03005 
03006                         /* the host could not be rescheduled properly - set the next check time for next week */
03007                         if(time_is_valid==FALSE && next_valid_time==preferred_time){
03008 
03009                                 /*
03010                                 hst->next_check=(time_t)(next_valid_time+(60*60*24*365));
03011                                 hst->should_be_scheduled=FALSE;
03012                                  */
03013 
03014                                 hst->next_check=(time_t)(next_valid_time+(60*60*24*7));
03015 
03016                                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Check of host '%s' could not be rescheduled properly.  Scheduling check for next week...\n",hst->name);
03017 
03018                                 log_debug_info(DEBUGL_CHECKS,1,"Unable to find any valid times to reschedule the next host check!\n");
03019                         }
03020 
03021                         /* this service could be rescheduled... */
03022                         else{
03023                                 hst->next_check=next_valid_time;
03024                                 hst->should_be_scheduled=TRUE;
03025 
03026                                 log_debug_info(DEBUGL_CHECKS,1,"Rescheduled next host check for %s",ctime(&next_valid_time));
03027                         }
03028                 }
03029 
03030                 /* update the status log */
03031                 update_host_status(hst,FALSE);
03032 
03033                 /* reschedule the next host check - unless we couldn't find a valid next check time */
03034                 /* 10/19/07 EG - keep original check options */
03035                 if(hst->should_be_scheduled==TRUE)
03036                         schedule_host_check(hst,hst->next_check,check_options);
03037 
03038                 return ERROR;
03039         }
03040 
03041         return OK;
03042 }
03043 
03044 
03045 
03046 /* perform an asynchronous check of a host */
03047 /* scheduled host checks will use this, as will some checks that result from on-demand checks... */
03048 int run_async_host_check_3x(host *hst, int check_options, double latency, int scheduled_check, int reschedule_check, int *time_is_valid, time_t *preferred_time){
03049         icinga_macros mac;
03050         char *raw_command=NULL;
03051         char *processed_command=NULL;
03052         struct timeval start_time,end_time;
03053         pid_t pid=0;
03054         int fork_error=FALSE;
03055         int wait_result=0;
03056         int pclose_result=0;
03057         mode_t new_umask=077;
03058         mode_t old_umask;
03059         char *output_file=NULL;
03060         double old_latency=0.0;
03061         dbuf checkresult_dbuf;
03062         int dbuf_chunk=1024;
03063 #ifdef USE_EVENT_BROKER
03064         int neb_result=OK;
03065 #endif
03066 
03067         log_debug_info(DEBUGL_FUNCTIONS,0,"run_async_host_check_3x()\n");
03068 
03069         /* make sure we have a host */
03070         if(hst==NULL)
03071                 return ERROR;
03072 
03073         log_debug_info(DEBUGL_CHECKS,0,"** Running async check of host '%s'...\n",hst->name);
03074 
03075         /* is the host check viable at this time? */
03076         if(check_host_check_viability_3x(hst,check_options,time_is_valid,preferred_time)==ERROR)
03077                 return ERROR;
03078 
03079         /* 08/04/07 EG don't execute a new host check if one is already running */
03080         if(hst->is_executing==TRUE && !(check_options & CHECK_OPTION_FORCE_EXECUTION)){
03081                 log_debug_info(DEBUGL_CHECKS,1,"A check of this host is already being executed, so we'll pass for the moment...\n");
03082                 return ERROR;
03083         }
03084 
03085         /******** GOOD TO GO FOR A REAL HOST CHECK AT THIS POINT ********/
03086 
03087 #ifdef USE_EVENT_BROKER
03088         /* initialize start/end times */
03089         start_time.tv_sec=0L;
03090         start_time.tv_usec=0L;
03091         end_time.tv_sec=0L;
03092         end_time.tv_usec=0L;
03093 
03094         /* send data to event broker */
03095         neb_result=broker_host_check(NEBTYPE_HOSTCHECK_ASYNC_PRECHECK,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,NULL,NULL,NULL,NULL,NULL);
03096 
03097         /* neb module wants to cancel the host check - the check will be rescheduled for a later time by the scheduling logic */
03098         if(neb_result==NEBERROR_CALLBACKCANCEL)
03099                 return ERROR;
03100 
03101         /* neb module wants to override the host check - perhaps it will check the host itself */
03102         /* NOTE: if a module does this, it has to do a lot of the stuff found below to make sure things don't get whacked out of shape! */
03103         if(neb_result==NEBERROR_CALLBACKOVERRIDE)
03104                 return OK;
03105 #endif
03106 
03107         log_debug_info(DEBUGL_CHECKS,0,"Checking host '%s'...\n",hst->name);
03108 
03109         /* clear check options - we don't want old check options retained */
03110         /* only clear options if this was a scheduled check - on demand check options shouldn't affect retained info */
03111         if(scheduled_check==TRUE)
03112                 hst->check_options=CHECK_OPTION_NONE;
03113 
03114         /* adjust host check attempt */
03115         adjust_host_check_attempt_3x(hst,TRUE);
03116 
03117         /* set latency (temporarily) for macros and event broker */
03118         old_latency=hst->latency;
03119         hst->latency=latency;
03120 
03121         /* grab the host macro variables */
03122         memset(&mac, 0, sizeof(mac));
03123         grab_host_macros_r(&mac, hst);
03124 
03125         /* get the raw command line */
03126         get_raw_command_line_r(&mac, hst->check_command_ptr,hst->host_check_command,&raw_command,0);
03127         if(raw_command==NULL){
03128                 clear_volatile_macros_r(&mac);
03129                 log_debug_info(DEBUGL_CHECKS,0,"Raw check command for host '%s' was NULL - aborting.\n",hst->name);
03130                 return ERROR;
03131         }
03132 
03133         /* process any macros contained in the argument */
03134         process_macros_r(&mac, raw_command,&processed_command,0);
03135         if(processed_command==NULL){
03136                 clear_volatile_macros_r(&mac);
03137                 log_debug_info(DEBUGL_CHECKS,0,"Processed check command for host '%s' was NULL - aborting.\n",hst->name);
03138                 return ERROR;
03139         }
03140 
03141         my_free(hst->processed_command);
03142         hst->processed_command=strdup(processed_command);
03143 
03144         /* get the command start time */
03145         gettimeofday(&start_time,NULL);
03146 
03147         /* set check time for on-demand checks, so they're not incorrectly detected as being orphaned - Luke Ross 5/16/08 */
03148         /* NOTE: 06/23/08 EG not sure if there will be side effects to this or not.... */
03149         if(scheduled_check==FALSE)
03150                 hst->next_check=start_time.tv_sec;
03151 
03152         /* increment number of host checks that are currently running... */
03153         currently_running_host_checks++;
03154 
03155         /* set the execution flag */
03156         hst->is_executing=TRUE;
03157 
03158         /* open a temp file for storing check output */
03159         old_umask=umask(new_umask);
03160         dummy=asprintf(&output_file,"%s/checkXXXXXX",temp_path);
03161         check_result_info.output_file_fd=mkstemp(output_file);
03162         if(check_result_info.output_file_fd>=0)
03163                 check_result_info.output_file_fp=fdopen(check_result_info.output_file_fd,"w");
03164         else{
03165                 check_result_info.output_file_fp=NULL;
03166                 check_result_info.output_file_fd=-1;
03167         }
03168         umask(old_umask);
03169 
03170         log_debug_info(DEBUGL_CHECKS|DEBUGL_IPC,1,"Check result output will be written to '%s' (fd=%d)\n",output_file,check_result_info.output_file_fd);
03171 
03172         /* save check info */
03173         check_result_info.object_check_type=HOST_CHECK;
03174         check_result_info.host_name=(char *)strdup(hst->name);
03175         check_result_info.service_description=NULL;
03176         check_result_info.check_type=HOST_CHECK_ACTIVE;
03177         check_result_info.check_options=check_options;
03178         check_result_info.scheduled_check=scheduled_check;
03179         check_result_info.reschedule_check=reschedule_check;
03180         check_result_info.output_file=(check_result_info.output_file_fd<0 || output_file==NULL)?NULL:strdup(output_file);
03181         check_result_info.latency=latency;
03182         check_result_info.start_time=start_time;
03183         check_result_info.finish_time=start_time;
03184         check_result_info.early_timeout=FALSE;
03185         check_result_info.exited_ok=TRUE;
03186         check_result_info.return_code=STATE_OK;
03187         check_result_info.output=NULL;
03188 
03189         /* free memory */
03190         my_free(output_file);
03191 
03192         /* write initial check info to file */
03193         /* if things go bad later on, the user will at least have something to go on when debugging... */
03194         if(check_result_info.output_file_fp){
03195 
03196                 fprintf(check_result_info.output_file_fp,"### Active Check Result File ###\n");
03197                 fprintf(check_result_info.output_file_fp,"file_time=%lu\n",(unsigned long)check_result_info.start_time.tv_sec);
03198                 fprintf(check_result_info.output_file_fp,"\n");
03199 
03200                 fprintf(check_result_info.output_file_fp,"### Icinga Host Check Result ###\n");
03201                 fprintf(check_result_info.output_file_fp,"# Time: %s",ctime(&check_result_info.start_time.tv_sec));
03202                 fprintf(check_result_info.output_file_fp,"host_name=%s\n",check_result_info.host_name);
03203                 fprintf(check_result_info.output_file_fp,"check_type=%d\n",check_result_info.check_type);
03204                 fprintf(check_result_info.output_file_fp,"check_options=%d\n",check_result_info.check_options);
03205                 fprintf(check_result_info.output_file_fp,"scheduled_check=%d\n",check_result_info.scheduled_check);
03206                 fprintf(check_result_info.output_file_fp,"reschedule_check=%d\n",check_result_info.reschedule_check);
03207                 fprintf(check_result_info.output_file_fp,"latency=%f\n",hst->latency);
03208                 fprintf(check_result_info.output_file_fp,"start_time=%lu.%lu\n",check_result_info.start_time.tv_sec,check_result_info.start_time.tv_usec);
03209 
03210                 /* flush buffer or we'll end up writing twice when we fork() */
03211                 fflush(check_result_info.output_file_fp);
03212         }
03213 
03214         /* initialize dynamic buffer for storing plugin output */
03215         dbuf_init(&checkresult_dbuf,dbuf_chunk);
03216 
03217 #ifdef USE_EVENT_BROKER
03218         /* send data to event broker */
03219         broker_host_check(NEBTYPE_HOSTCHECK_INITIATE,NEBFLAG_NONE,NEBATTR_NONE,hst,HOST_CHECK_ACTIVE,hst->current_state,hst->state_type,start_time,end_time,hst->host_check_command,hst->latency,0.0,host_check_timeout,FALSE,0,processed_command,NULL,NULL,NULL,NULL);
03220 #endif
03221 
03222         /* reset latency (permanent value for this check will get set later) */
03223         hst->latency=old_latency;
03224 
03225         /* update check statistics */
03226         update_check_stats((scheduled_check==TRUE)?ACTIVE_SCHEDULED_HOST_CHECK_STATS:ACTIVE_ONDEMAND_HOST_CHECK_STATS,start_time.tv_sec);
03227         update_check_stats(PARALLEL_HOST_CHECK_STATS,start_time.tv_sec);
03228 
03229         /* fork a child process */
03230         pid=fork();
03231 
03232         /* an error occurred while trying to fork */
03233         if(pid==-1){
03234 
03235                 fork_error=TRUE;
03236 
03237                 /* log an error */
03238                 logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: The check of host '%s' could not be performed due to a fork() error: '%s'.\n",hst->name,strerror(errno));
03239 
03240                 log_debug_info(DEBUGL_CHECKS,0,"Check of host '%s' could not be performed due to a fork() error: '%s'!\n",hst->name,strerror(errno));
03241         }
03242 
03243         /* if we are in the child process... */
03244         else if(pid==0){
03245 
03246                 /* set environment variables */
03247                 set_all_macro_environment_vars_r(&mac, TRUE);
03248 
03249                 /* ADDED 11/12/07 EG */
03250                 /* close external command file and shut down worker thread */
03251                 close_command_file();
03252 
03253                 /* fork again if we're not in a large installation */
03254                 if(child_processes_fork_twice==TRUE){
03255 
03256                         /* fork again... */
03257                         pid=fork();
03258 
03259                         /* an error occurred while trying to fork again */
03260                         if(pid==-1)
03261                                 exit(STATE_UNKNOWN);
03262                 }
03263 
03264                 /* the grandchild (or child if large install tweaks are enabled) process should run the host check... */
03265                 if(pid==0 || child_processes_fork_twice==FALSE){
03266 
03267                         /* reset signal handling */
03268                         reset_sighandler();
03269 
03270                         /* become the process group leader */
03271                         setpgid(0,0);
03272 
03273                         /* catch term signals at this process level */
03274                         signal(SIGTERM,host_check_sighandler);
03275 
03276                         /* catch plugins that don't finish in a timely manner */
03277                         signal(SIGALRM,host_check_sighandler);
03278                         alarm(host_check_timeout);
03279 
03280                         /* disable rotation of the debug file */
03281                         max_debug_file_size=0L;
03282 
03283                         /* run the plugin check command */
03284                         pclose_result=run_check(processed_command,&checkresult_dbuf);
03285 
03286                         /* reset the alarm */
03287                         alarm(0);
03288 
03289                         /* get the check finish time */
03290                         gettimeofday(&end_time,NULL);
03291 
03292                         /* record check result info */
03293                         check_result_info.finish_time=end_time;
03294                         check_result_info.early_timeout=FALSE;
03295 
03296                         /* test for execution error */
03297                         if(pclose_result==-1){
03298                                 pclose_result=STATE_UNKNOWN;
03299                                 check_result_info.return_code=STATE_CRITICAL;
03300                                 check_result_info.exited_ok=FALSE;
03301                         }
03302                         else{
03303                                 if(WEXITSTATUS(pclose_result)==0 && WIFSIGNALED(pclose_result))
03304                                         check_result_info.return_code=128+WTERMSIG(pclose_result);
03305                                 else
03306                                         check_result_info.return_code=WEXITSTATUS(pclose_result);
03307                         }
03308 
03309                         /* write check result to file */
03310                         if(check_result_info.output_file_fp){
03311 
03312                                 fprintf(check_result_info.output_file_fp,"finish_time=%lu.%lu\n",check_result_info.finish_time.tv_sec,check_result_info.finish_time.tv_usec);
03313                                 fprintf(check_result_info.output_file_fp,"early_timeout=%d\n",check_result_info.early_timeout);
03314                                 fprintf(check_result_info.output_file_fp,"exited_ok=%d\n",check_result_info.exited_ok);
03315                                 fprintf(check_result_info.output_file_fp,"return_code=%d\n",check_result_info.return_code);
03316                                 fprintf(check_result_info.output_file_fp,"output=%s\n",(checkresult_dbuf.buf==NULL)?"(null)":checkresult_dbuf.buf);
03317 
03318                                 /* close the temp file */
03319                                 fclose(check_result_info.output_file_fp);
03320 
03321                                 /* move check result to queue directory */
03322                                 move_check_result_to_queue(check_result_info.output_file);
03323                         }
03324 
03325                         /* free memory */
03326                         dbuf_free(&checkresult_dbuf);
03327                         my_free(raw_command);
03328                         my_free(processed_command);
03329 
03330                         /* free check result memory */
03331                         free_check_result(&check_result_info);
03332 
03333                         /* return with plugin exit status - not really necessary... */
03334                         _exit(pclose_result);
03335                 }
03336 
03337                 /* NOTE: this code is never reached if large install tweaks are enabled... */
03338 
03339                 /* unset environment variables */
03340                 set_all_macro_environment_vars_r(&mac, FALSE);
03341 
03342                 /* free allocated memory */
03343                 /* this needs to be done last, so we don't free memory for variables before they're used above */
03344                 if(free_child_process_memory==TRUE)
03345                         free_memory(&mac);
03346 
03347                 /* parent exits immediately - grandchild process is inherited by the INIT process, so we have no zombie problem... */
03348                 _exit(STATE_OK);
03349         }
03350 
03351         /* else the parent should wait for the first child to return... */
03352         else if(pid>0){
03353                 clear_volatile_macros_r(&mac);
03354 
03355                 log_debug_info(DEBUGL_CHECKS,2,"Host check is executing in child process (pid=%lu)\n",(unsigned long)pid);
03356 
03357                 /* parent should close output file */
03358                 if(check_result_info.output_file_fp)
03359                         fclose(check_result_info.output_file_fp);
03360 
03361                 /* should this be done in first child process (after spawning grandchild) as well? */
03362                 /* free memory allocated for IPC functionality */
03363                 free_check_result(&check_result_info);
03364 
03365                 /* free memory */
03366                 my_free(raw_command);
03367                 my_free(processed_command);
03368 
03369                 /* wait for the first child to return */
03370                 /* if large install tweaks are enabled, we'll clean up the zombie process later */
03371                 if(child_processes_fork_twice==TRUE)
03372                         wait_result=waitpid(pid,NULL,0);
03373         }
03374 
03375         /* see if we were able to run the check... */
03376         if(fork_error==TRUE)
03377                 return ERROR;
03378 
03379         return OK;
03380 }
03381 
03382 
03383 
03384 /* process results of an asynchronous host check */
03385 int handle_async_host_check_result_3x(host *temp_host, check_result *queued_check_result){
03386         time_t current_time;
03387         int result=STATE_OK;
03388         int reschedule_check=FALSE;
03389         char *old_plugin_output=NULL;
03390         char *temp_ptr=NULL;
03391         struct timeval start_time_hires;
03392         struct timeval end_time_hires;
03393 
03394         log_debug_info(DEBUGL_FUNCTIONS,0,"handle_async_host_check_result_3x()\n");
03395 
03396         /* make sure we have what we need */
03397         if(temp_host==NULL || queued_check_result==NULL)
03398                 return ERROR;
03399 
03400         time(&current_time);
03401 
03402         log_debug_info(DEBUGL_CHECKS,1,"** Handling async check result for host '%s'...\n",temp_host->name);
03403 
03404         log_debug_info(DEBUGL_CHECKS,2,"\tCheck Type:         %s\n",(queued_check_result->check_type==HOST_CHECK_ACTIVE)?"Active":"Passive");
03405         log_debug_info(DEBUGL_CHECKS,2,"\tCheck Options:      %d\n",queued_check_result->check_options);
03406         log_debug_info(DEBUGL_CHECKS,2,"\tScheduled Check?:   %s\n",(queued_check_result->scheduled_check==TRUE)?"Yes":"No");
03407         log_debug_info(DEBUGL_CHECKS,2,"\tReschedule Check?:  %s\n",(queued_check_result->reschedule_check==TRUE)?"Yes":"No");
03408         log_debug_info(DEBUGL_CHECKS,2,"\tExited OK?:         %s\n",(queued_check_result->exited_ok==TRUE)?"Yes":"No");
03409         log_debug_info(DEBUGL_CHECKS,2,"\tExec Time:          %.3f\n",temp_host->execution_time);
03410         log_debug_info(DEBUGL_CHECKS,2,"\tLatency:            %.3f\n",temp_host->latency);
03411         log_debug_info(DEBUGL_CHECKS,2,"\tReturn Status:      %d\n",queued_check_result->return_code);
03412         log_debug_info(DEBUGL_CHECKS,2,"\tOutput:             %s\n",(queued_check_result==NULL)?"NULL":queued_check_result->output);
03413 
03414         /* decrement the number of host checks still out there... */
03415         if(queued_check_result->check_type==HOST_CHECK_ACTIVE && currently_running_host_checks>0)
03416                 currently_running_host_checks--;
03417 
03418         /* skip this host check results if its passive and we aren't accepting passive check results */
03419         if(queued_check_result->check_type==HOST_CHECK_PASSIVE){
03420                 if(accept_passive_host_checks==FALSE){
03421                         log_debug_info(DEBUGL_CHECKS,0,"Discarding passive host check result because passive host checks are disabled globally.\n");
03422                         return ERROR;
03423                 }
03424                 if(temp_host->accept_passive_host_checks==FALSE){
03425                         log_debug_info(DEBUGL_CHECKS,0,"Discarding passive host check result because passive checks are disabled for this host.\n");
03426                         return ERROR;
03427                 }
03428         }
03429 
03430         /* clear the freshening flag (it would have been set if this host was determined to be stale) */
03431         if(queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK)
03432                 temp_host->is_being_freshened=FALSE;
03433 
03434         /* DISCARD INVALID FRESHNESS CHECK RESULTS */
03435         /* If a host goes stale, Icinga will initiate a forced check in order to freshen it.  There is a race condition whereby a passive check
03436            could arrive between the 1) initiation of the forced check and 2) the time when the forced check result is processed here.  This would
03437            make the host fresh again, so we do a quick check to make sure the host is still stale before we accept the check result. */
03438         if((queued_check_result->check_options & CHECK_OPTION_FRESHNESS_CHECK) && is_host_result_fresh(temp_host,current_time,FALSE)==TRUE){
03439                 log_debug_info(DEBUGL_CHECKS,0,"Discarding host freshness check result because the host is currently fresh (race condition avoided).\n");
03440                 return OK;
03441         }
03442 
03443         /* was this check passive or active? */
03444         temp_host->check_type=(queued_check_result->check_type==HOST_CHECK_ACTIVE)?HOST_CHECK_ACTIVE:HOST_CHECK_PASSIVE;
03445 
03446         /* update check statistics for passive results */
03447         if(queued_check_result->check_type==HOST_CHECK_PASSIVE)
03448                 update_check_stats(PASSIVE_HOST_CHECK_STATS,queued_check_result->start_time.tv_sec);
03449 
03450         /* should we reschedule the next check of the host? NOTE: this might be overridden later... */
03451         reschedule_check=queued_check_result->reschedule_check;
03452 
03453         /* check latency is passed to us for both active and passive checks */
03454         temp_host->latency=queued_check_result->latency;
03455 
03456         /* update the execution time for this check (millisecond resolution) */
03457         temp_host->execution_time=(double)((double)(queued_check_result->finish_time.tv_sec-queued_check_result->start_time.tv_sec)+(double)((queued_check_result->finish_time.tv_usec-queued_check_result->start_time.tv_usec)/1000.0)/1000.0);
03458         if(temp_host->execution_time<0.0)
03459                 temp_host->execution_time=0.0;
03460 
03461         /* set the checked flag */
03462         temp_host->has_been_checked=TRUE;
03463 
03464         /* clear the execution flag if this was an active check */
03465         if(queued_check_result->check_type==HOST_CHECK_ACTIVE)
03466                 temp_host->is_executing=FALSE;
03467 
03468         /* get the last check time */
03469         temp_host->last_check=queued_check_result->start_time.tv_sec;
03470 
03471         /* was this check passive or active? */
03472         temp_host->check_type=(queued_check_result->check_type==HOST_CHECK_ACTIVE)?HOST_CHECK_ACTIVE:HOST_CHECK_PASSIVE;
03473 
03474         /* save the old host state */
03475         temp_host->last_state=temp_host->current_state;
03476         if(temp_host->state_type==HARD_STATE)
03477                 temp_host->last_hard_state=temp_host->current_state;
03478 
03479         /* save old plugin output */
03480         if(temp_host->plugin_output)
03481                 old_plugin_output=(char *)strdup(temp_host->plugin_output);
03482 
03483         /* clear the old plugin output and perf data buffers */
03484         my_free(temp_host->plugin_output);
03485         my_free(temp_host->long_plugin_output);
03486         my_free(temp_host->perf_data);
03487 
03488         /* parse check output to get: (1) short output, (2) long output, (3) perf data */
03489         parse_check_output(queued_check_result->output,&temp_host->plugin_output,&temp_host->long_plugin_output,&temp_host->perf_data,TRUE,TRUE);
03490 
03491         /* make sure we have some data */
03492         if(temp_host->plugin_output==NULL || !strcmp(temp_host->plugin_output,"")){
03493                 my_free(temp_host->plugin_output);
03494                 temp_host->plugin_output=(char *)strdup("(No output returned from host check)");
03495         }
03496 
03497         /* replace semicolons in plugin output (but not performance data) with colons */
03498         if((temp_ptr=temp_host->plugin_output)){
03499                 while((temp_ptr=strchr(temp_ptr,';')))
03500                         *temp_ptr=':';
03501         }
03502 
03503         log_debug_info(DEBUGL_CHECKS,2,"Parsing check output...\n");
03504         log_debug_info(DEBUGL_CHECKS,2,"Short Output: %s\n",(temp_host->plugin_output==NULL)?"NULL":temp_host->plugin_output);
03505         log_debug_info(DEBUGL_CHECKS,2,"Long Output:  %s\n",(temp_host->long_plugin_output==NULL)?"NULL":temp_host->long_plugin_output);
03506         log_debug_info(DEBUGL_CHECKS,2,"Perf Data:    %s\n",(temp_host->perf_data==NULL)?"NULL":temp_host->perf_data);
03507 
03508         /* get the unprocessed return code */
03509         /* NOTE: for passive checks, this is the final/processed state */
03510         result=queued_check_result->return_code;
03511 
03512         /* adjust return code (active checks only) */
03513         if(queued_check_result->check_type==HOST_CHECK_ACTIVE){
03514 
03515                 /* if there was some error running the command, just skip it (this shouldn't be happening) */
03516                 if(queued_check_result->exited_ok==FALSE){
03517 
03518                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning:  Check of host '%s' did not exit properly!\n",temp_host->name);
03519 
03520                         my_free(temp_host->plugin_output);
03521                         my_free(temp_host->long_plugin_output);
03522                         my_free(temp_host->perf_data);
03523 
03524                         temp_host->plugin_output=(char *)strdup("(Host check did not exit properly)");
03525 
03526                         result=STATE_CRITICAL;
03527                 }
03528 
03529                 /* make sure the return code is within bounds */
03530                 else if(queued_check_result->return_code<0 || queued_check_result->return_code>3){
03531 
03532                         logit(NSLOG_RUNTIME_WARNING,TRUE,"Warning: Return code of %d for check of host '%s' was out of bounds.%s\n",queued_check_result->return_code,temp_host->name,(queued_check_result->return_code==126 || queued_check_result->return_code==127)?" Make sure the plugin you're trying to run actually exists.":"");
03533 
03534                         my_free(temp_host->plugin_output);
03535                         my_free(temp_host->long_plugin_output);
03536                         my_free(temp_host->perf_data);
03537 
03538                         dummy=asprintf(&temp_host->plugin_output,"(Return code of %d is out of bounds%s)",queued_check_result->return_code,(queued_check_result->return_code==126 || queued_check_result->return_code==127)?" - plugin may be missing":"");
03539 
03540                         result=STATE_CRITICAL;
03541                 }
03542 
03543                 /* a NULL host check command means we should assume the host is UP */
03544                 if(temp_host->host_check_command==NULL){
03545                         my_free(temp_host->plugin_output);
03546                         temp_host->plugin_output=(char *)strdup("(Host assumed to be UP)");
03547                         result=STATE_OK;
03548                 }
03549         }
03550 
03551         /* translate return code to basic UP/DOWN state - the DOWN/UNREACHABLE state determination is made later */
03552         /* NOTE: only do this for active checks - passive check results already have the final state */
03553         if(queued_check_result->check_type==HOST_CHECK_ACTIVE){
03554 
03555                 /* if we're not doing aggressive host checking, let WARNING states indicate the host is up (fake the result to be STATE_OK) */
03556                 if(use_aggressive_host_checking==FALSE && result==STATE_WARNING)
03557                         result=STATE_OK;
03558 
03559                 /* OK states means the host is UP */
03560                 if(result==STATE_OK)
03561                         result=HOST_UP;
03562 
03563                 /* any problem state indicates the host is not UP */
03564                 else
03565                         result=HOST_DOWN;
03566         }
03567 
03568 
03569         /******************* PROCESS THE CHECK RESULTS ******************/
03570 
03571         /* process the host check result */
03572         process_host_check_result_3x(temp_host,result,old_plugin_output,CHECK_OPTION_NONE,reschedule_check,TRUE,cached_host_check_horizon);
03573 
03574         /* free memory */
03575         my_free(old_plugin_output);
03576 
03577         log_debug_info(DEBUGL_CHECKS,1,"** Async check result for host '%s' handled: new state=%d\n",temp_host->name,temp_host->current_state);
03578 
03579         /* high resolution start time for event broker */
03580         start_time_hires=queued_check_result->start_time;
03581 
03582         /* high resolution end time for event broker */
03583         gettimeofday(&end_time_hires,NULL);
03584 
03585 #ifdef USE_EVENT_BROKER
03586         /* send data to event broker */
03587         broker_host_check(NEBTYPE_HOSTCHECK_PROCESSED,NEBFLAG_NONE,NEBATTR_NONE,temp_host,temp_host->check_type,temp_host->current_state,temp_host->state_type,start_time_hires,end_time_hires,temp_host->host_check_command,temp_host->latency,temp_host->execution_time,host_check_timeout,queued_check_result->early_timeout,queued_check_result->return_code,temp_host->processed_command,temp_host->plugin_output,temp_host->long_plugin_output,temp_host->perf_data,NULL);
03588 #endif
03589 
03590         return OK;
03591 }
03592 
03593 
03594 
03595 /* processes the result of a synchronous or asynchronous host check
 *
 * Updates hst->current_state and hst->state_type from the reported state,
 * decides when (and whether) the next check should run, and collects related
 * hosts (parents, children, dependency masters) that should be re-checked.
 * Afterwards it calls handle_host_state(), performs the state-stalking and
 * flap-detection steps, reschedules the host, calls update_host_status() and
 * finally launches asynchronous checks of the collected hosts.
 *
 * hst                     - host the result belongs to (dereferenced without a NULL check)
 * new_state               - reported state; compared against HOST_UP and, on the problem
 *                           paths, generally copied into hst->current_state (the active
 *                           max_attempts==1 path derives the state from parent checks instead)
 * old_plugin_output       - plugin output of the previous check, compared with
 *                           hst->plugin_output for state stalking
 * check_options           - only forwarded to run_sync_host_check_3x() for serial parent checks
 * reschedule_check        - TRUE if the next check should be scheduled; several branches
 *                           below force it to TRUE
 * use_cached_result       - forwarded to run_sync_host_check_3x(); when TRUE, collected hosts
 *                           whose last_check lies within check_timestamp_horizon of the
 *                           current time are not re-checked
 * check_timestamp_horizon - horizon used together with use_cached_result
 *
 * Always returns OK.
 */
03596 int process_host_check_result_3x(host *hst, int new_state, char *old_plugin_output, int check_options, int reschedule_check, int use_cached_result, unsigned long check_timestamp_horizon){
03597         hostsmember *temp_hostsmember=NULL;
03598         host *child_host=NULL;
03599         host *parent_host=NULL;
03600         host *master_host=NULL;
03601         host *temp_host=NULL;
03602         hostdependency *temp_dependency=NULL;
03603         objectlist *check_hostlist=NULL; /* hosts collected below that get an async check at the end */
03604         objectlist *hostlist_item=NULL;
03605         int parent_state=HOST_UP;
03606         time_t current_time=0L;
03607         time_t next_check=0L;
03608         time_t preferred_time=0L;
03609         time_t next_valid_time=0L;
03610         int run_async_check=TRUE;
03611         void *ptr=NULL; /* cursor handed to the get_first/get_next hostdependency lookups */
03612
03613
03614         log_debug_info(DEBUGL_FUNCTIONS,0,"process_host_check_result_3x()\n");
03615
03616         log_debug_info(DEBUGL_CHECKS,1,"HOST: %s, ATTEMPT=%d/%d, CHECK TYPE=%s, STATE TYPE=%s, OLD STATE=%d, NEW STATE=%d\n",hst->name,hst->current_attempt,hst->max_attempts,(hst->check_type==HOST_CHECK_ACTIVE)?"ACTIVE":"PASSIVE",(hst->state_type==HARD_STATE)?"HARD":"SOFT",hst->current_state,new_state);
03617
03618         /* get the current time */
03619         time(&current_time);
03620
03621         /* default next check time */
03622         next_check=(unsigned long)(current_time+(hst->check_interval*interval_length));
03623
03624         /* we have to adjust current attempt # for passive checks, as it isn't done elsewhere */
03625         if(hst->check_type==HOST_CHECK_PASSIVE && passive_host_checks_are_soft==TRUE)
03626                 adjust_host_check_attempt_3x(hst,FALSE);
03627
03628         /* log passive checks - we need to do this here, as some may bypass external commands by getting dropped in checkresults dir */
03629         if(hst->check_type==HOST_CHECK_PASSIVE){
03630                 if(log_passive_checks==TRUE)
03631                         logit(NSLOG_PASSIVE_CHECK,FALSE,"PASSIVE HOST CHECK: %s;%d;%s\n",hst->name,new_state,hst->plugin_output);
03632         }
03633
03634
03635         /******* HOST WAS DOWN/UNREACHABLE INITIALLY *******/
03636         if(hst->current_state!=HOST_UP){
03637
03638                 log_debug_info(DEBUGL_CHECKS,1,"Host was DOWN/UNREACHABLE.\n");
03639
03640                 /***** HOST IS NOW UP *****/
03641                 /* the host just recovered! */
03642                 if(new_state==HOST_UP){
03643
03644                         /* set the current state */
03645                         hst->current_state=HOST_UP;
03646
03647                         /* set the state type */
03648                         /* set state type to HARD for passive checks and active checks that were previously in a HARD STATE */
03649                         if(hst->state_type==HARD_STATE || (hst->check_type==HOST_CHECK_PASSIVE && passive_host_checks_are_soft==FALSE))
03650                                 hst->state_type=HARD_STATE;
03651                         else
03652                                 hst->state_type=SOFT_STATE;
03653
03654                         log_debug_info(DEBUGL_CHECKS,1,"Host experienced a %s recovery (it's now UP).\n",(hst->state_type==HARD_STATE)?"HARD":"SOFT");
03655
03656                         /* reschedule the next check of the host at the normal interval */
03657                         reschedule_check=TRUE;
03658                         next_check=(unsigned long)(current_time+(hst->check_interval*interval_length));
03659
03660                         /* propagate checks to immediate parents if they are not already UP */
03661                         /* we do this because a parent host (or grandparent) may have recovered somewhere and we should catch the recovery as soon as possible */
03662                         log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to parent host(s)...\n");
03663
03664                         for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
03665                                 if((parent_host=temp_hostsmember->host_ptr)==NULL)
03666                                         continue;
03667                                 if(parent_host->current_state!=HOST_UP){
03668                                         log_debug_info(DEBUGL_CHECKS,1,"Check of parent host '%s' queued.\n",parent_host->name);
03669                                         add_object_to_objectlist(&check_hostlist,(void *)parent_host);
03670                                 }
03671                         }
03672
03673                         /* propagate checks to immediate children if they are not already UP */
03674                         /* we do this because children may currently be UNREACHABLE, but may (as a result of this recovery) switch to UP or DOWN states */
03675                         log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to child host(s)...\n");
03676
03677                         for(temp_hostsmember=hst->child_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
03678                                 if((child_host=temp_hostsmember->host_ptr)==NULL)
03679                                         continue;
03680                                 if(child_host->current_state!=HOST_UP){
03681                                         log_debug_info(DEBUGL_CHECKS,1,"Check of child host '%s' queued.\n",child_host->name);
03682                                         add_object_to_objectlist(&check_hostlist,(void *)child_host);
03683                                 }
03684                         }
03685                 }
03686
03687                 /***** HOST IS STILL DOWN/UNREACHABLE *****/
03688                 /* we're still in a problem state... */
03689                 else{
03690
03691                         log_debug_info(DEBUGL_CHECKS,1,"Host is still DOWN/UNREACHABLE.\n");
03692
03693                         /* passive checks are treated as HARD states by default... */
03694                         if(hst->check_type==HOST_CHECK_PASSIVE && passive_host_checks_are_soft==FALSE){
03695
03696                                 /* set the state type */
03697                                 hst->state_type=HARD_STATE;
03698
03699                                 /* reset the current attempt */
03700                                 hst->current_attempt=1;
03701                         }
03702
03703                         /* active checks and passive checks (treated as SOFT states) */
03704                         else{
03705
03706                                 /* set the state type */
03707                                 /* we've maxed out on the retries */
03708                                 if(hst->current_attempt==hst->max_attempts)
03709                                         hst->state_type=HARD_STATE;
03710                                 /* the host was in a hard problem state before, so it still is now (NOTE(review): relies on current_attempt having been reset to 1 for HARD states, presumably by adjust_host_check_attempt_3x() - confirm against callers) */
03711                                 else if(hst->current_attempt==1)
03712                                         hst->state_type=HARD_STATE;
03713                                 /* the host is in a soft state and the check will be retried */
03714                                 else
03715                                         hst->state_type=SOFT_STATE;
03716                         }
03717
03718                         /* make a determination of the host's state */
03719                         /* translate host state between DOWN/UNREACHABLE (only for passive checks if enabled) */
03720                         hst->current_state=new_state;
03721                         if(hst->check_type==HOST_CHECK_ACTIVE || translate_passive_host_checks==TRUE)
03722                                 hst->current_state=determine_host_reachability(hst);
03723
03724                         /* reschedule the next check if the host state changed */
03725                         if(hst->last_state!=hst->current_state || hst->last_hard_state!=hst->current_state){
03726
03727                                 reschedule_check=TRUE;
03728
03729                                 /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... */
03730                                 if(hst->state_type==SOFT_STATE)
03731                                         next_check=(unsigned long)(current_time+(hst->retry_interval*interval_length));
03732
03733                                 /* host has maxed out on retries (or was previously in a hard problem state), so reschedule the next check at the normal interval */
03734                                 else
03735                                         next_check=(unsigned long)(current_time+(hst->check_interval*interval_length));
03736                         }
03737
03738                 }
03739
03740         }
03741
03742         /******* HOST WAS UP INITIALLY *******/
03743         else{
03744
03745                 log_debug_info(DEBUGL_CHECKS,1,"Host was UP.\n");
03746
03747                 /***** HOST IS STILL UP *****/
03748                 /* the host never went down since the last check */
03749                 if(new_state==HOST_UP){
03750
03751                         log_debug_info(DEBUGL_CHECKS,1,"Host is still UP.\n");
03752
03753                         /* set the current state */
03754                         hst->current_state=HOST_UP;
03755
03756                         /* set the state type */
03757                         hst->state_type=HARD_STATE;
03758
03759                         /* reschedule the next check at the normal interval */
03760                         if(reschedule_check==TRUE)
03761                                 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length));
03762                 }
03763
03764                 /***** HOST IS NOW DOWN/UNREACHABLE *****/
03765                 else{
03766
03767                         log_debug_info(DEBUGL_CHECKS,1,"Host is now DOWN/UNREACHABLE.\n");
03768
03769                         /***** SPECIAL CASE FOR HOSTS WITH MAX_ATTEMPTS==1 *****/
03770                         if(hst->max_attempts==1){
03771
03772                                 log_debug_info(DEBUGL_CHECKS,1,"Max attempts = 1!.\n");
03773
03774                                 /* set the state type */
03775                                 hst->state_type=HARD_STATE;
03776
03777                                 /* host has maxed out on retries, so reschedule the next check at the normal interval */
03778                                 reschedule_check=TRUE;
03779                                 next_check=(unsigned long)(current_time+(hst->check_interval*interval_length));
03780
03781                                 /* we need to run SYNCHRONOUS checks of all parent hosts to accurately determine the state of this host */
03782                                 /* this is extremely inefficient (reminiscent of Icinga 2.x logic), but there's no other good way around it */
03783                                 /* check all parent hosts to see if we're DOWN or UNREACHABLE */
03784                                 /* only do this for ACTIVE checks, as PASSIVE checks contain a pre-determined state */
03785                                 if(hst->check_type==HOST_CHECK_ACTIVE){
03786
03787                                         log_debug_info(DEBUGL_CHECKS,1,"** WARNING: Max attempts = 1, so we have to run serial checks of all parent hosts!\n");
03788
03789                                         for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
03790
03791                                                 if((parent_host=temp_hostsmember->host_ptr)==NULL)
03792                                                         continue;
03793
03794                                                 log_debug_info(DEBUGL_CHECKS,1,"Running serial check parent host '%s'...\n",parent_host->name);
03795
03796                                                 /* run an immediate check of the parent host */
03797                                                 run_sync_host_check_3x(parent_host,&parent_state,check_options,use_cached_result,check_timestamp_horizon);
03798
03799                                                 /* bail out as soon as we find one parent host that is UP */
03800                                                 if(parent_state==HOST_UP){
03801
03802                                                         log_debug_info(DEBUGL_CHECKS,1,"Parent host is UP, so this one is DOWN.\n");
03803
03804                                                         /* set the current state */
03805                                                         hst->current_state=HOST_DOWN;
03806                                                         break;
03807                                                 }
03808                                         }
03809
03810                                         if(temp_hostsmember==NULL){ /* the loop ran to completion, i.e. no parent was found UP */
03811                                                 /* host has no parents, so it's DOWN */
03812                                                 if(hst->parent_hosts==NULL){
03813                                                         log_debug_info(DEBUGL_CHECKS,1,"Host has no parents, so it's DOWN.\n");
03814                                                         hst->current_state=HOST_DOWN;
03815                                                 }
03816                                                 else{
03817                                                         /* no parents were up, so this host is UNREACHABLE */
03818                                                         log_debug_info(DEBUGL_CHECKS,1,"No parents were UP, so this host is UNREACHABLE.\n");
03819                                                         hst->current_state=HOST_UNREACHABLE;
03820                                                 }
03821                                         }
03822                                 }
03823
03824                                 /* set the host state for passive checks */
03825                                 else{
03826                                         /* set the state */
03827                                         hst->current_state=new_state;
03828
03829                                         /* translate host state between DOWN/UNREACHABLE for passive checks (if enabled) */
03830                                         /* make a determination of the host's state */
03831                                         if(translate_passive_host_checks==TRUE)
03832                                                 hst->current_state=determine_host_reachability(hst);
03833
03834                                 }
03835
03836                                 /* propagate checks to immediate children if they are not UNREACHABLE */
03837                                 /* we do this because we may now be blocking the route to child hosts */
03838                                 log_debug_info(DEBUGL_CHECKS,1,"Propagating check to immediate non-UNREACHABLE child hosts...\n");
03839
03840                                 for(temp_hostsmember=hst->child_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
03841                                         if((child_host=temp_hostsmember->host_ptr)==NULL)
03842                                                 continue;
03843                                         if(child_host->current_state!=HOST_UNREACHABLE){
03844                                                 log_debug_info(DEBUGL_CHECKS,1,"Check of child host '%s' queued.\n",child_host->name);
03845                                                 add_object_to_objectlist(&check_hostlist,(void *)child_host);
03846                                         }
03847                                 }
03848                         }
03849
03850                         /***** MAX ATTEMPTS > 1 *****/
03851                         else{
03852
03853                                 /* active and (in some cases) passive check results are treated as SOFT states */
03854                                 if(hst->check_type==HOST_CHECK_ACTIVE || passive_host_checks_are_soft==TRUE){
03855
03856                                         /* set the state type */
03857                                         hst->state_type=SOFT_STATE;
03858                                 }
03859
03860                                 /* by default, passive check results are treated as HARD states */
03861                                 else{
03862
03863                                         /* set the state type */
03864                                         hst->state_type=HARD_STATE;
03865
03866                                         /* reset the current attempt */
03867                                         hst->current_attempt=1;
03868                                 }
03869
03870                                 /* make a (in some cases) preliminary determination of the host's state */
03871                                 /* translate host state between DOWN/UNREACHABLE (for passive checks only if enabled) */
03872                                 hst->current_state=new_state;
03873                                 if(hst->check_type==HOST_CHECK_ACTIVE || translate_passive_host_checks==TRUE)
03874                                         hst->current_state=determine_host_reachability(hst);
03875
03876                                 /* reschedule a check of the host */
03877                                 reschedule_check=TRUE;
03878
03879                                 /* schedule a re-check of the host at the retry interval because we can't determine its final state yet... */
03880                                 if(hst->check_type==HOST_CHECK_ACTIVE || passive_host_checks_are_soft==TRUE)
03881                                         next_check=(unsigned long)(current_time+(hst->retry_interval*interval_length));
03882
03883                                 /* schedule a re-check of the host at the normal interval */
03884                                 else
03885                                         next_check=(unsigned long)(current_time+(hst->check_interval*interval_length));
03886
03887                                 /* propagate checks to immediate parents if they are UP */
03888                                 /* we do this because a parent host (or grandparent) may have gone down and blocked our route */
03889                                 /* checking the parents ASAP will allow us to better determine the final state (DOWN/UNREACHABLE) of this host later */
03890                                 log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to immediate parent hosts that are UP...\n");
03891
03892                                 for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
03893                                         if((parent_host=temp_hostsmember->host_ptr)==NULL)
03894                                                 continue;
03895                                         if(parent_host->current_state==HOST_UP){
03896                                                 add_object_to_objectlist(&check_hostlist,(void *)parent_host);
03897                                                 log_debug_info(DEBUGL_CHECKS,1,"Check of host '%s' queued.\n",parent_host->name);
03898                                         }
03899                                 }
03900
03901                                 /* propagate checks to immediate children if they are not UNREACHABLE */
03902                                 /* we do this because we may now be blocking the route to child hosts */
03903                                 log_debug_info(DEBUGL_CHECKS,1,"Propagating checks to immediate non-UNREACHABLE child hosts...\n");
03904
03905                                 for(temp_hostsmember=hst->child_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
03906                                         if((child_host=temp_hostsmember->host_ptr)==NULL)
03907                                                 continue;
03908                                         if(child_host->current_state!=HOST_UNREACHABLE){
03909                                                 log_debug_info(DEBUGL_CHECKS,1,"Check of child host '%s' queued.\n",child_host->name);
03910                                                 add_object_to_objectlist(&check_hostlist,(void *)child_host);
03911                                         }
03912                                 }
03913
03914                                 /* check dependencies on second to last host check */
03915                                 if(enable_predictive_host_dependency_checks==TRUE && hst->current_attempt==(hst->max_attempts-1)){
03916
03917                                         /* propagate checks to hosts that THIS ONE depends on for notifications AND execution */
03918                                         /* we do this to help ensure that the dependency checks are accurate before it comes time to notify */
03919                                         log_debug_info(DEBUGL_CHECKS,1,"Propagating predictive dependency checks to hosts this one depends on...\n");
03920
03921                                         for(temp_dependency=get_first_hostdependency_by_dependent_host(hst->name,&ptr);temp_dependency!=NULL;temp_dependency=get_next_hostdependency_by_dependent_host(hst->name,&ptr)){
03922                                                 if(temp_dependency->dependent_host_ptr==hst && temp_dependency->master_host_ptr!=NULL){
03923                                                         master_host=(host *)temp_dependency->master_host_ptr;
03924                                                         log_debug_info(DEBUGL_CHECKS,1,"Check of host '%s' queued.\n",master_host->name);
03925                                                         add_object_to_objectlist(&check_hostlist,(void *)master_host);
03926                                                 }
03927                                         }
03928                                 }
03929                         }
03930                 }
03931         }
03932
03933         log_debug_info(DEBUGL_CHECKS,1,"Pre-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n",hst->name,hst->current_attempt,hst->max_attempts,(hst->state_type==HARD_STATE)?"HARD":"SOFT",hst->current_state);
03934
03935         /* handle the host state */
03936         handle_host_state(hst);
03937
03938         log_debug_info(DEBUGL_CHECKS,1,"Post-handle_host_state() Host: %s, Attempt=%d/%d, Type=%s, Final State=%d\n",hst->name,hst->current_attempt,hst->max_attempts,(hst->state_type==HARD_STATE)?"HARD":"SOFT",hst->current_state);
03939
03940
03941         /******************** POST-PROCESSING STUFF *********************/
03942
03943         /* if the plugin output differs from previous check and no state change, log the current state/output if state stalking is enabled (NOTE(review): presumably compare_strings() returns non-zero when the strings differ - confirm) */
03944         if(hst->last_state==hst->current_state && compare_strings(old_plugin_output,hst->plugin_output)){
03945
03946                 if(hst->current_state==HOST_UP && hst->stalk_on_up==TRUE) {
03947
03948                         log_host_event(hst);
03949
03950                         /* should we run event handlers ? */
03951                         if (stalking_event_handlers_for_hosts==TRUE)
03952                                 handle_host_event(hst);
03953
03954                 } else if(hst->current_state==HOST_DOWN && hst->stalk_on_down==TRUE) {
03955
03956                         log_host_event(hst);
03957
03958                         /* should we run event handlers ? */
03959                         if (stalking_event_handlers_for_hosts==TRUE)
03960                                 handle_host_event(hst);
03961
03962                 } else if(hst->current_state==HOST_UNREACHABLE && hst->stalk_on_unreachable==TRUE) {
03963
03964                         log_host_event(hst);
03965
03966                         /* should we run event handlers ? */
03967                         if (stalking_event_handlers_for_hosts==TRUE)
03968                                 handle_host_event(hst);
03969                 }
03970         }
03971
03972         /* check to see if the associated host is flapping */
03973         check_for_host_flapping(hst,TRUE,TRUE,TRUE);
03974
03975         /* reschedule the next check of the host (usually ONLY for scheduled, active checks, unless overridden above) */
03976         if(reschedule_check==TRUE){
03977
03978                 log_debug_info(DEBUGL_CHECKS,1,"Rescheduling next check of host at %s",ctime(&next_check)); /* no trailing newline in the format: ctime() output already ends with one */
03979
03980                 /* default is to reschedule host check unless a test below fails... */
03981                 hst->should_be_scheduled=TRUE;
03982
03983                 /* get the new current time */
03984                 time(&current_time);
03985
03986                 /* make sure we don't get ourselves into too much trouble... (never schedule the next check in the past) */
03987                 if(current_time>next_check)
03988                         hst->next_check=current_time;
03989                 else
03990                         hst->next_check=next_check;
03991
03992                 /* make sure we rescheduled the next host check at a valid time */
03993                 preferred_time=hst->next_check;
03994                 get_next_valid_time(preferred_time,&next_valid_time,hst->check_period_ptr);
03995                 hst->next_check=next_valid_time;
03996
03997                 /* hosts with non-recurring intervals do not get rescheduled if we're in a HARD or UP state */
03998                 if(hst->check_interval==0 && (hst->state_type==HARD_STATE || hst->current_state==HOST_UP))
03999                         hst->should_be_scheduled=FALSE;
04000
04001                 /* host with active checks disabled do not get rescheduled */
04002                 if(hst->checks_enabled==FALSE)
04003                         hst->should_be_scheduled=FALSE;
04004
04005                 /* schedule a non-forced check if we can */
04006                 if(hst->should_be_scheduled==TRUE){
04007                         schedule_host_check(hst,hst->next_check,CHECK_OPTION_NONE);
04008                 }
04009
04010         }
04011
04012         /* update host status - for both active (scheduled) and passive (non-scheduled) hosts */
04013         update_host_status(hst,FALSE);
04014
04015         /* run async checks of all hosts we added above */
04016         /* don't run a check if one is already executing or we can get by with a cached state */
04017         for(hostlist_item=check_hostlist;hostlist_item!=NULL;hostlist_item=hostlist_item->next){
04018                 run_async_check=TRUE;
04019                 temp_host=(host *)hostlist_item->object_ptr;
04020
04021                 log_debug_info(DEBUGL_CHECKS,2,"ASYNC CHECK OF HOST: %s, CURRENTTIME: %lu, LASTHOSTCHECK: %lu, CACHEDTIMEHORIZON: %lu, USECACHEDRESULT: %d, ISEXECUTING: %d\n",temp_host->name,current_time,temp_host->last_check,check_timestamp_horizon,use_cached_result,temp_host->is_executing);
04022
04023                 if(use_cached_result==TRUE && ((current_time-temp_host->last_check)<=check_timestamp_horizon))
04024                         run_async_check=FALSE;
04025                 if(temp_host->is_executing==TRUE)
04026                         run_async_check=FALSE;
04027                 if(run_async_check==TRUE)
04028                         run_async_host_check_3x(temp_host,CHECK_OPTION_NONE,0.0,FALSE,FALSE,NULL,NULL);
04029         }
04030         free_objectlist(&check_hostlist); /* the collected-host list is not needed past this point */
04031
04032         return OK;
04033 }
04034 
04035 
04036 
04037 /* checks viability of performing a host check */
04038 int check_host_check_viability_3x(host *hst, int check_options, int *time_is_valid, time_t *new_time){
04039         int result=OK;
04040         int perform_check=TRUE;
04041         time_t current_time=0L;
04042         time_t preferred_time=0L;
04043         int check_interval=0;
04044 
04045         log_debug_info(DEBUGL_FUNCTIONS,0,"check_host_check_viability_3x()\n");
04046 
04047         /* make sure we have a host */
04048         if(hst==NULL)
04049                 return ERROR;
04050 
04051         /* get the check interval to use if we need to reschedule the check */
04052         if(hst->state_type==SOFT_STATE && hst->current_state!=HOST_UP)
04053                 check_interval=(hst->retry_interval*interval_length);
04054         else
04055                 check_interval=(hst->check_interval*interval_length);
04056 
04057         /* make sure check interval is positive - otherwise use 5 minutes out for next check */
04058         if(check_interval<=0)
04059                 check_interval=300;
04060 
04061         /* get the current time */
04062         time(&current_time);
04063 
04064         /* initialize the next preferred check time */
04065         preferred_time=current_time;
04066 
04067         /* can we check the host right now? */
04068         if(!(check_options & CHECK_OPTION_FORCE_EXECUTION)){
04069 
04070                 /* if checks of the host are currently disabled... */
04071                 if(hst->checks_enabled==FALSE){
04072                         preferred_time=current_time+check_interval;
04073                         perform_check=FALSE;
04074                 }
04075 
04076                 /* make sure this is a valid time to check the host */
04077                 if(check_time_against_period((unsigned long)current_time,hst->check_period_ptr)==ERROR){
04078                         preferred_time=current_time;
04079                         if(time_is_valid)
04080                                 *time_is_valid=FALSE;
04081                         perform_check=FALSE;
04082                 }
04083 
04084                 /* check host dependencies for execution */
04085                 if(check_host_dependencies(hst,EXECUTION_DEPENDENCY)==DEPENDENCIES_FAILED){
04086                         preferred_time=current_time+check_interval;
04087                         perform_check=FALSE;
04088                 }
04089         }
04090 
04091         /* pass back the next viable check time */
04092         if(new_time)
04093                 *new_time=preferred_time;
04094 
04095         result=(perform_check==TRUE)?OK:ERROR;
04096 
04097         return result;
04098 }
04099 
04100 
04101 
04102 /* adjusts current host check attempt before a new check is performed */
04103 int adjust_host_check_attempt_3x(host *hst, int is_active){
04104 
04105         log_debug_info(DEBUGL_FUNCTIONS,0,"adjust_host_check_attempt_3x()\n");
04106 
04107         if(hst==NULL)
04108                 return ERROR;
04109 
04110         log_debug_info(DEBUGL_CHECKS,2,"Adjusting check attempt number for host '%s': current attempt=%d/%d, state=%d, state type=%d\n",hst->name,hst->current_attempt,hst->max_attempts,hst->current_state,hst->state_type);
04111 
04112         /* if host is in a hard state, reset current attempt number */
04113         if(hst->state_type==HARD_STATE)
04114                 hst->current_attempt=1;
04115 
04116         /* if host is in a soft UP state, reset current attempt number (active checks only) */
04117         else if(is_active==TRUE && hst->state_type==SOFT_STATE && hst->current_state==HOST_UP)
04118                 hst->current_attempt=1;
04119 
04120         /* increment current attempt number */
04121         else if(hst->current_attempt < hst->max_attempts)
04122                 hst->current_attempt++;
04123 
04124         log_debug_info(DEBUGL_CHECKS,2,"New check attempt number = %d\n",hst->current_attempt);
04125 
04126         return OK;
04127 }
04128 
04129 
04130 
04131 /* determination of the host's state based on route availability*/
04132 /* used only to determine difference between DOWN and UNREACHABLE states */
04133 int determine_host_reachability(host *hst){
04134         int state=HOST_DOWN;
04135         host *parent_host=NULL;
04136         hostsmember *temp_hostsmember=NULL;
04137 
04138         log_debug_info(DEBUGL_FUNCTIONS,0,"determine_host_reachability()\n");
04139 
04140         if(hst==NULL)
04141                 return HOST_DOWN;
04142 
04143         log_debug_info(DEBUGL_CHECKS,2,"Determining state of host '%s': current state=%d\n",hst->name,hst->current_state);
04144 
04145         /* host is UP - no translation needed */
04146         if(hst->current_state==HOST_UP){
04147                 state=HOST_UP;
04148                 log_debug_info(DEBUGL_CHECKS,2,"Host is UP, no state translation needed.\n");
04149         }
04150 
04151         /* host has no parents, so it is DOWN */
04152         else if(hst->parent_hosts==NULL){
04153                 state=HOST_DOWN;
04154                 log_debug_info(DEBUGL_CHECKS,2,"Host has no parents, so it is DOWN.\n");
04155         }
04156 
04157         /* check all parent hosts to see if we're DOWN or UNREACHABLE */
04158         else{
04159 
04160                 for(temp_hostsmember=hst->parent_hosts;temp_hostsmember!=NULL;temp_hostsmember=temp_hostsmember->next){
04161 
04162                         if((parent_host=temp_hostsmember->host_ptr)==NULL)
04163                                 continue;
04164 
04165                         /* bail out as soon as we find one parent host that is UP */
04166                         if(parent_host->current_state==HOST_UP){
04167                                 /* set the current state */
04168                                 state=HOST_DOWN;
04169                                 log_debug_info(DEBUGL_CHECKS,2,"At least one parent (%s) is up, so host is DOWN.\n",parent_host->name);
04170                                 break;
04171                         }
04172                 }
04173                 /* no parents were up, so this host is UNREACHABLE */
04174                 if(temp_hostsmember==NULL){
04175                         state=HOST_UNREACHABLE;
04176                         log_debug_info(DEBUGL_CHECKS,2,"No parents were up, so host is UNREACHABLE.\n");
04177                 }
04178         }
04179 
04180         return state;
04181 }
 All Data Structures Files Functions Variables Typedefs Defines