1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 
  22 /*
  23  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  24  * Use is subject to license terms.
  25  * Copyright 2012 Milan Jurik. All rights reserved.
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/socket.h>
  30 #include <sys/list.h>
  31 #include <sys/stropts.h>
  32 #include <sys/siginfo.h>
  33 #include <sys/wait.h>
  34 #include <arpa/inet.h>
  35 #include <netinet/in.h>
  36 #include <stdlib.h>
  37 #include <stdio.h>
  38 #include <strings.h>
  39 #include <stddef.h>
  40 #include <unistd.h>
  41 #include <libilb.h>
  42 #include <port.h>
  43 #include <time.h>
  44 #include <signal.h>
  45 #include <assert.h>
  46 #include <errno.h>
  47 #include <spawn.h>
  48 #include <fcntl.h>
  49 #include <limits.h>
  50 #include "libilb_impl.h"
  51 #include "ilbd.h"
  52 
/*
 * Global list of HC objects.  It is initialized by i_ilbd_setup_hc_list()
 * and searched by name in ilbd_get_hc().
 */
list_t ilbd_hc_list;

/* Timer queue for all hc related timers. */
static iu_tq_t *ilbd_hc_timer_q;

/*
 * Indicate whether the timer needs to be updated.  It is set whenever a hc
 * timer is scheduled, cancelled or expired, and is cleared again by
 * ilbd_hc_timer_update().
 */
static boolean_t hc_timer_restarted;

/* Forward declarations of routines that are used before their definition. */
static void ilbd_hc_probe_timer(iu_tq_t *, void *);
static ilb_status_t ilbd_hc_restart_timer(ilbd_hc_t *, ilbd_hc_srv_t *);
static boolean_t ilbd_run_probe(ilbd_hc_srv_t *);

/* Larger of two values.  Note that an argument may be evaluated twice. */
#define MAX(a, b)       ((a) > (b) ? (a) : (b))

/*
 * Number of arguments passed to a probe.  argv[0] is the path name of
 * the probe.
 */
#define HC_PROBE_ARGC   8

/*
 * Max number of characters to be read from the output of a probe.  It
 * is long enough to read in a 64 bit integer.
 */
#define HC_MAX_PROBE_OUTPUT     24
  79 
  80 void
  81 i_ilbd_setup_hc_list(void)
  82 {
  83         list_create(&ilbd_hc_list, sizeof (ilbd_hc_t),
  84             offsetof(ilbd_hc_t, ihc_link));
  85 }
  86 
  87 /*
  88  * Given a hc object name, return a pointer to hc object if found.
  89  */
  90 ilbd_hc_t *
  91 ilbd_get_hc(const char *name)
  92 {
  93         ilbd_hc_t *hc;
  94 
  95         for (hc = list_head(&ilbd_hc_list); hc != NULL;
  96             hc = list_next(&ilbd_hc_list, hc)) {
  97                 if (strcasecmp(hc->ihc_name, name) == 0)
  98                         return (hc);
  99         }
 100         return (NULL);
 101 }
 102 
 103 /*
 104  * Generates an audit record for create-healthcheck,
 105  * delete-healtcheck subcommands.
 106  */
 107 static void
 108 ilbd_audit_hc_event(const char *audit_hcname,
 109     const ilb_hc_info_t *audit_hcinfo, ilbd_cmd_t cmd,
 110     ilb_status_t rc, ucred_t *ucredp)
 111 {
 112         adt_session_data_t      *ah;
 113         adt_event_data_t        *event;
 114         au_event_t      flag;
 115         int     audit_error;
 116 
 117         if ((ucredp == NULL) && (cmd == ILBD_CREATE_HC))  {
 118                 /*
 119                  * we came here from the path where ilbd incorporates
 120                  * the configuration that is listed in SCF:
 121                  * i_ilbd_read_config->ilbd_walk_hc_pgs->
 122                  *   ->ilbd_scf_instance_walk_pg->ilbd_create_hc
 123                  * We skip auditing in that case
 124                  */
 125                 logdebug("ilbd_audit_hc_event: skipping auditing");
 126                 return;
 127         }
 128 
 129         if (adt_start_session(&ah, NULL, 0) != 0) {
 130                 logerr("ilbd_audit_hc_event: adt_start_session failed");
 131                 exit(EXIT_FAILURE);
 132         }
 133         if (adt_set_from_ucred(ah, ucredp, ADT_NEW) != 0) {
 134                 (void) adt_end_session(ah);
 135                 logerr("ilbd_audit_rule_event: adt_set_from_ucred failed");
 136                 exit(EXIT_FAILURE);
 137         }
 138         if (cmd == ILBD_CREATE_HC)
 139                 flag = ADT_ilb_create_healthcheck;
 140         else if (cmd == ILBD_DESTROY_HC)
 141                 flag = ADT_ilb_delete_healthcheck;
 142 
 143         if ((event = adt_alloc_event(ah, flag)) == NULL) {
 144                 logerr("ilbd_audit_hc_event: adt_alloc_event failed");
 145                 exit(EXIT_FAILURE);
 146         }
 147         (void) memset((char *)event, 0, sizeof (adt_event_data_t));
 148 
 149         switch (cmd) {
 150         case ILBD_CREATE_HC:
 151                 event->adt_ilb_create_healthcheck.auth_used =
 152                     NET_ILB_CONFIG_AUTH;
 153                 event->adt_ilb_create_healthcheck.hc_test =
 154                     (char *)audit_hcinfo->hci_test;
 155                 event->adt_ilb_create_healthcheck.hc_name =
 156                     (char *)audit_hcinfo->hci_name;
 157 
 158                 /*
 159                  * If the value 0 is stored, the default values are
 160                  * set in the kernel. User land does not know about them
 161                  * So if the user does not specify them, audit record
 162                  * will show them as 0
 163                  */
 164                 event->adt_ilb_create_healthcheck.hc_timeout =
 165                     audit_hcinfo->hci_timeout;
 166                 event->adt_ilb_create_healthcheck.hc_count =
 167                     audit_hcinfo->hci_count;
 168                 event->adt_ilb_create_healthcheck.hc_interval =
 169                     audit_hcinfo->hci_interval;
 170                 break;
 171         case ILBD_DESTROY_HC:
 172                 event->adt_ilb_delete_healthcheck.auth_used =
 173                     NET_ILB_CONFIG_AUTH;
 174                 event->adt_ilb_delete_healthcheck.hc_name =
 175                     (char *)audit_hcname;
 176                 break;
 177         }
 178 
 179         /* Fill in success/failure */
 180         if (rc == ILB_STATUS_OK) {
 181                 if (adt_put_event(event, ADT_SUCCESS, ADT_SUCCESS) != 0) {
 182                         logerr("ilbd_audit_hc_event: adt_put_event failed");
 183                         exit(EXIT_FAILURE);
 184                 }
 185         } else {
 186                 audit_error = ilberror2auditerror(rc);
 187                 if (adt_put_event(event, ADT_FAILURE, audit_error) != 0) {
 188                         logerr("ilbd_audit_hc_event: adt_put_event failed");
 189                         exit(EXIT_FAILURE);
 190                 }
 191         }
 192         adt_free_event(event);
 193         (void) adt_end_session(ah);
 194 }
 195 
 196 /*
 197  * Given the ilb_hc_info_t passed in (from the libilb), create a hc object
 198  * in ilbd.  The parameter ev_port is not used, refer to comments of
 199  * ilbd_create_sg() in ilbd_sg.c
 200  */
 201 /* ARGSUSED */
 202 ilb_status_t
 203 ilbd_create_hc(const ilb_hc_info_t *hc_info, int ev_port,
 204     const struct passwd *ps, ucred_t *ucredp)
 205 {
 206         ilbd_hc_t *hc;
 207         ilb_status_t ret = ILB_STATUS_OK;
 208 
 209         /*
 210          * ps == NULL is from the daemon when it starts and load configuration
 211          * ps != NULL is from client.
 212          */
 213         if (ps != NULL) {
 214                 ret = ilbd_check_client_config_auth(ps);
 215                 if (ret != ILB_STATUS_OK) {
 216                         ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 217                             ret, ucredp);
 218                         return (ret);
 219                 }
 220         }
 221 
 222         if (hc_info->hci_name[0] == '\0') {
 223                 logdebug("ilbd_create_hc: missing healthcheck info");
 224                 ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 225                     ILB_STATUS_ENOHCINFO, ucredp);
 226                 return (ILB_STATUS_ENOHCINFO);
 227         }
 228 
 229         hc = ilbd_get_hc(hc_info->hci_name);
 230         if (hc != NULL) {
 231                 logdebug("ilbd_create_hc: healthcheck name %s already"
 232                     " exists", hc_info->hci_name);
 233                 ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 234                     ILB_STATUS_EEXIST, ucredp);
 235                 return (ILB_STATUS_EEXIST);
 236         }
 237 
 238         /*
 239          * Sanity check on user supplied probe.  The given path name
 240          * must be a full path name (starts with '/') and is
 241          * executable.
 242          */
 243         if (strcasecmp(hc_info->hci_test, ILB_HC_STR_TCP) != 0 &&
 244             strcasecmp(hc_info->hci_test, ILB_HC_STR_UDP) != 0 &&
 245             strcasecmp(hc_info->hci_test, ILB_HC_STR_PING) != 0 &&
 246             (hc_info->hci_test[0] != '/' ||
 247             access(hc_info->hci_test, X_OK) == -1)) {
 248                 if (errno == ENOENT) {
 249                         logdebug("ilbd_create_hc: user script %s doesn't "
 250                             "exist", hc_info->hci_test);
 251                         ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 252                             ILB_STATUS_ENOENT, ucredp);
 253                         return (ILB_STATUS_ENOENT);
 254                 } else {
 255                         logdebug("ilbd_create_hc: user script %s is "
 256                             "invalid", hc_info->hci_test);
 257                         ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 258                             ILB_STATUS_EINVAL, ucredp);
 259                         return (ILB_STATUS_EINVAL);
 260                 }
 261         }
 262 
 263         /* Create and add the hc object */
 264         hc = calloc(1, sizeof (ilbd_hc_t));
 265         if (hc == NULL) {
 266                 ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 267                     ILB_STATUS_ENOMEM, ucredp);
 268                 return (ILB_STATUS_ENOMEM);
 269         }
 270         (void) memcpy(&hc->ihc_info, hc_info, sizeof (ilb_hc_info_t));
 271         if (strcasecmp(hc->ihc_test, ILB_HC_STR_TCP) == 0)
 272                 hc->ihc_test_type = ILBD_HC_TCP;
 273         else if (strcasecmp(hc->ihc_test, ILB_HC_STR_UDP) == 0)
 274                 hc->ihc_test_type = ILBD_HC_UDP;
 275         else if (strcasecmp(hc->ihc_test, ILB_HC_STR_PING) == 0)
 276                 hc->ihc_test_type = ILBD_HC_PING;
 277         else
 278                 hc->ihc_test_type = ILBD_HC_USER;
 279         list_create(&hc->ihc_rules, sizeof (ilbd_hc_rule_t),
 280             offsetof(ilbd_hc_rule_t, hcr_link));
 281 
 282         /* Update SCF */
 283         if (ps != NULL) {
 284                 if ((ret = ilbd_create_pg(ILBD_SCF_HC, (void *)hc)) !=
 285                     ILB_STATUS_OK) {
 286                         ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC,
 287                             ret, ucredp);
 288                         list_destroy(&hc->ihc_rules);
 289                         free(hc);
 290                         return (ret);
 291                 }
 292         }
 293 
 294         /* Everything is fine, now add it to the global list. */
 295         list_insert_tail(&ilbd_hc_list, hc);
 296         ilbd_audit_hc_event(NULL, hc_info, ILBD_CREATE_HC, ret, ucredp);
 297         return (ret);
 298 }
 299 
 300 /*
 301  * Given a name of a hc object, destroy it.
 302  */
 303 ilb_status_t
 304 ilbd_destroy_hc(const char *hc_name, const struct passwd *ps,
 305     ucred_t *ucredp)
 306 {
 307         ilb_status_t ret;
 308         ilbd_hc_t *hc;
 309 
 310         /*
 311          * No need to check ps == NULL, daemon won't call any destroy func
 312          * at start up.
 313          */
 314         ret = ilbd_check_client_config_auth(ps);
 315         if (ret != ILB_STATUS_OK) {
 316                 ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
 317                     ret, ucredp);
 318                 return (ret);
 319         }
 320 
 321         hc = ilbd_get_hc(hc_name);
 322         if (hc == NULL) {
 323                 logdebug("ilbd_destroy_hc: healthcheck %s does not exist",
 324                     hc_name);
 325                 ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
 326                     ILB_STATUS_ENOENT, ucredp);
 327                 return (ILB_STATUS_ENOENT);
 328         }
 329 
 330         /* If hc is in use, cannot delete it */
 331         if (hc->ihc_rule_cnt > 0) {
 332                 logdebug("ilbd_destroy_hc: healthcheck %s is associated"
 333                     " with a rule - cannot remove", hc_name);
 334                 ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
 335                     ILB_STATUS_INUSE, ucredp);
 336                 return (ILB_STATUS_INUSE);
 337         }
 338 
 339         if ((ret = ilbd_destroy_pg(ILBD_SCF_HC, hc_name)) !=
 340             ILB_STATUS_OK) {
 341                 logdebug("ilbd_destroy_hc: cannot destroy healthcheck %s "
 342                     "property group", hc_name);
 343                 ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC,
 344                     ret, ucredp);
 345                 return (ret);
 346         }
 347 
 348         list_remove(&ilbd_hc_list, hc);
 349         list_destroy(&hc->ihc_rules);
 350         free(hc);
 351         ilbd_audit_hc_event(hc_name, NULL, ILBD_DESTROY_HC, ret, ucredp);
 352         return (ret);
 353 }
 354 
 355 /*
 356  * Given a hc object name, return its information.  Used by libilb to
 357  * get hc info.
 358  */
 359 ilb_status_t
 360 ilbd_get_hc_info(const char *hc_name, uint32_t *rbuf, size_t *rbufsz)
 361 {
 362         ilbd_hc_t       *hc;
 363         ilb_hc_info_t   *hc_info;
 364         ilb_comm_t      *ic = (ilb_comm_t *)rbuf;
 365 
 366         hc = ilbd_get_hc(hc_name);
 367         if (hc == NULL) {
 368                 logdebug("%s: healthcheck %s does not exist", __func__,
 369                     hc_name);
 370                 return (ILB_STATUS_ENOENT);
 371         }
 372         ilbd_reply_ok(rbuf, rbufsz);
 373         hc_info = (ilb_hc_info_t *)&ic->ic_data;
 374 
 375         (void) strlcpy(hc_info->hci_name, hc->ihc_name, sizeof (hc->ihc_name));
 376         (void) strlcpy(hc_info->hci_test, hc->ihc_test, sizeof (hc->ihc_test));
 377         hc_info->hci_timeout = hc->ihc_timeout;
 378         hc_info->hci_count = hc->ihc_count;
 379         hc_info->hci_interval = hc->ihc_interval;
 380         hc_info->hci_def_ping = hc->ihc_def_ping;
 381 
 382         *rbufsz += sizeof (ilb_hc_info_t);
 383 
 384         return (ILB_STATUS_OK);
 385 }
 386 
 387 static void
 388 ilbd_hc_copy_srvs(uint32_t *rbuf, size_t *rbufsz, ilbd_hc_rule_t *hc_rule,
 389     const char *rulename)
 390 {
 391         ilbd_hc_srv_t           *tmp_srv;
 392         ilb_hc_srv_t            *dst_srv;
 393         ilb_hc_rule_srv_t       *srvs;
 394         size_t                  tmp_rbufsz;
 395         int                     i;
 396 
 397         tmp_rbufsz = *rbufsz;
 398         /* Set up the reply buffer.  rbufsz will be set to the new size. */
 399         ilbd_reply_ok(rbuf, rbufsz);
 400 
 401         /* Calculate how much space is left for holding server info. */
 402         *rbufsz += sizeof (ilb_hc_rule_srv_t);
 403         tmp_rbufsz -= *rbufsz;
 404 
 405         srvs = (ilb_hc_rule_srv_t *)&((ilb_comm_t *)rbuf)->ic_data;
 406 
 407         tmp_srv = list_head(&hc_rule->hcr_servers);
 408         for (i = 0; tmp_srv != NULL && tmp_rbufsz >= sizeof (*dst_srv); i++) {
 409                 dst_srv = &srvs->rs_srvs[i];
 410 
 411                 (void) strlcpy(dst_srv->hcs_rule_name, rulename, ILB_NAMESZ);
 412                 (void) strlcpy(dst_srv->hcs_ID, tmp_srv->shc_sg_srv->sgs_srvID,
 413                     ILB_NAMESZ);
 414                 (void) strlcpy(dst_srv->hcs_hc_name,
 415                     tmp_srv->shc_hc->ihc_name, ILB_NAMESZ);
 416                 dst_srv->hcs_IP = tmp_srv->shc_sg_srv->sgs_addr;
 417                 dst_srv->hcs_fail_cnt = tmp_srv->shc_fail_cnt;
 418                 dst_srv->hcs_status = tmp_srv->shc_status;
 419                 dst_srv->hcs_rtt = tmp_srv->shc_rtt;
 420                 dst_srv->hcs_lasttime = tmp_srv->shc_lasttime;
 421                 dst_srv->hcs_nexttime = tmp_srv->shc_nexttime;
 422 
 423                 tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv);
 424                 tmp_rbufsz -= sizeof (*dst_srv);
 425         }
 426         srvs->rs_num_srvs = i;
 427         *rbufsz += i * sizeof (*dst_srv);
 428 }
 429 
 430 /*
 431  * Given a rule name, return the hc status of its servers.
 432  */
 433 ilb_status_t
 434 ilbd_get_hc_srvs(const char *rulename, uint32_t *rbuf, size_t *rbufsz)
 435 {
 436         ilbd_hc_t       *hc;
 437         ilbd_hc_rule_t  *hc_rule;
 438 
 439         for (hc = list_head(&ilbd_hc_list); hc != NULL;
 440             hc = list_next(&ilbd_hc_list, hc)) {
 441                 for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
 442                     hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
 443                         if (strcasecmp(hc_rule->hcr_rule->irl_name,
 444                             rulename) != 0) {
 445                                 continue;
 446                         }
 447                         ilbd_hc_copy_srvs(rbuf, rbufsz, hc_rule, rulename);
 448                         return (ILB_STATUS_OK);
 449                 }
 450         }
 451         return (ILB_STATUS_RULE_NO_HC);
 452 }
 453 
 454 /*
 455  * Initialize the hc timer and associate the notification of timeout to
 456  * the given event port.
 457  */
 458 void
 459 ilbd_hc_timer_init(int ev_port, ilbd_timer_event_obj_t *ev_obj)
 460 {
 461         struct sigevent sigev;
 462         port_notify_t notify;
 463 
 464         if ((ilbd_hc_timer_q = iu_tq_create()) == NULL) {
 465                 logerr("%s: cannot create hc timer queue", __func__);
 466                 exit(EXIT_FAILURE);
 467         }
 468         hc_timer_restarted = B_FALSE;
 469 
 470         ev_obj->ev = ILBD_EVENT_TIMER;
 471         ev_obj->timerid = -1;
 472 
 473         notify.portnfy_port = ev_port;
 474         notify.portnfy_user = ev_obj;
 475         sigev.sigev_notify = SIGEV_PORT;
 476         sigev.sigev_value.sival_ptr = &notify;
 477         if (timer_create(CLOCK_REALTIME, &sigev, &ev_obj->timerid) == -1) {
 478                 logerr("%s: cannot create timer", __func__);
 479                 exit(EXIT_FAILURE);
 480         }
 481 }
 482 
/*
 * HC timeout handler.  Expires the timers on the hc timer queue with
 * iu_expire_timers() and then marks the timer as needing an update, which
 * is what makes ilbd_hc_timer_update() set it up again.
 */
void
ilbd_hc_timeout(void)
{
        (void) iu_expire_timers(ilbd_hc_timer_q);
        hc_timer_restarted = B_TRUE;
}
 492 
 493 /*
 494  * Set up the timer to fire at the earliest timeout.
 495  */
 496 void
 497 ilbd_hc_timer_update(ilbd_timer_event_obj_t *ev_obj)
 498 {
 499         itimerspec_t itimeout;
 500         int timeout;
 501 
 502         /*
 503          * There is no change on the timer list, so no need to set up the
 504          * timer again.
 505          */
 506         if (!hc_timer_restarted)
 507                 return;
 508 
 509 restart:
 510         if ((timeout = iu_earliest_timer(ilbd_hc_timer_q)) == INFTIM) {
 511                 hc_timer_restarted = B_FALSE;
 512                 return;
 513         } else if (timeout == 0) {
 514                 /*
 515                  * Handle the timeout immediately.  After that (clearing all
 516                  * the expired timers), check to  see if there are still
 517                  * timers running.  If yes, start them.
 518                  */
 519                 (void) iu_expire_timers(ilbd_hc_timer_q);
 520                 goto restart;
 521         }
 522 
 523         itimeout.it_value.tv_sec = timeout / MILLISEC + 1;
 524         itimeout.it_value.tv_nsec = 0;
 525         itimeout.it_interval.tv_sec = 0;
 526         itimeout.it_interval.tv_nsec = 0;
 527 
 528         /*
 529          * Failure to set a timeout is "OK" since hopefully there will be
 530          * other events and timer_settime() will be called again.  So
 531          * we will only miss some timeouts.  But in the worst case, no event
 532          * will happen and ilbd will get stuck...
 533          */
 534         if (timer_settime(ev_obj->timerid, 0, &itimeout, NULL) == -1)
 535                 logerr("%s: cannot set timer", __func__);
 536         hc_timer_restarted = B_FALSE;
 537 }
 538 
/*
 * Kill the probe process of a server.
 *
 * The fd of the probe (shc_child_fd) is dissociated from the server's
 * event port and closed, and the event object shc_ev is freed.  The
 * probe process shc_child_pid is then sent SIGKILL and waited for, and
 * shc_child_pid is reset to 0.  Failures along the way are only logged.
 */
static void
ilbd_hc_kill_probe(ilbd_hc_srv_t *srv)
{
        /*
         * First dissociate the fd from the event port.  It should not
         * fail.
         */
        if (port_dissociate(srv->shc_ev_port, PORT_SOURCE_FD,
            srv->shc_child_fd) != 0) {
                logdebug("%s: port_dissociate: %s", __func__, strerror(errno));
        }
        (void) close(srv->shc_child_fd);
        /* Clear the pointer so that the freed event is not used again. */
        free(srv->shc_ev);
        srv->shc_ev = NULL;

        /* Then kill the probe process. */
        if (kill(srv->shc_child_pid, SIGKILL) != 0) {
                logerr("%s: rule %s server %s: %s", __func__,
                    srv->shc_hc_rule->hcr_rule->irl_name,
                    srv->shc_sg_srv->sgs_srvID, strerror(errno));
        }
        /* Should not fail... */
        if (waitpid(srv->shc_child_pid, NULL, 0) != srv->shc_child_pid) {
                logdebug("%s: waitpid: rule %s server %s", __func__,
                    srv->shc_hc_rule->hcr_rule->irl_name,
                    srv->shc_sg_srv->sgs_srvID);
        }
        /* A pid of 0 tells the callers that no probe is running. */
        srv->shc_child_pid = 0;
}
 571 
 572 /*
 573  * Disable the server, either because the server is dead or because a timer
 574  * cannot be started for this server.  Note that this only affects the
 575  * transient configuration, meaning only in memory.  The persistent
 576  * configuration is not affected.
 577  */
 578 static void
 579 ilbd_mark_server_disabled(ilbd_hc_srv_t *srv)
 580 {
 581         srv->shc_status = ILB_HCS_DISABLED;
 582 
 583         /* Disable the server in kernel. */
 584         if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
 585             srv->shc_hc_rule->hcr_rule->irl_name,
 586             stat_declare_srv_dead) != ILB_STATUS_OK) {
 587                 logerr("%s: cannot disable server in kernel: rule %s "
 588                     "server %s", __func__,
 589                     srv->shc_hc_rule->hcr_rule->irl_name,
 590                     srv->shc_sg_srv->sgs_srvID);
 591         }
 592 }
 593 
 594 /*
 595  * A probe fails, set the state of the server.
 596  */
 597 static void
 598 ilbd_set_fail_state(ilbd_hc_srv_t *srv)
 599 {
 600         if (++srv->shc_fail_cnt < srv->shc_hc->ihc_count) {
 601                 /* Probe again */
 602                 ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
 603                 return;
 604         }
 605 
 606         logdebug("%s: rule %s server %s fails %u", __func__,
 607             srv->shc_hc_rule->hcr_rule->irl_name, srv->shc_sg_srv->sgs_srvID,
 608             srv->shc_fail_cnt);
 609 
 610         /*
 611          * If this is a ping test, mark the server as
 612          * unreachable instead of dead.
 613          */
 614         if (srv->shc_hc->ihc_test_type == ILBD_HC_PING ||
 615             srv->shc_state == ilbd_hc_def_pinging) {
 616                 srv->shc_status = ILB_HCS_UNREACH;
 617         } else {
 618                 srv->shc_status = ILB_HCS_DEAD;
 619         }
 620 
 621         /* Disable the server in kernel. */
 622         if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
 623             srv->shc_hc_rule->hcr_rule->irl_name, stat_declare_srv_dead) !=
 624             ILB_STATUS_OK) {
 625                 logerr("%s: cannot disable server in kernel: rule %s "
 626                     "server %s", __func__,
 627                     srv->shc_hc_rule->hcr_rule->irl_name,
 628                     srv->shc_sg_srv->sgs_srvID);
 629         }
 630 
 631         /* Still keep probing in case the server is alive again. */
 632         if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
 633                 /* Only thing to do is to disable the server... */
 634                 logerr("%s: cannot restart timer: rule %s server %s", __func__,
 635                     srv->shc_hc_rule->hcr_rule->irl_name,
 636                     srv->shc_sg_srv->sgs_srvID);
 637                 srv->shc_status = ILB_HCS_DISABLED;
 638         }
 639 }
 640 
 641 /*
 642  * A probe process has not returned for the ihc_timeout period, we should
 643  * kill it.  This function is the handler of this.
 644  */
 645 /* ARGSUSED */
 646 static void
 647 ilbd_hc_kill_timer(iu_tq_t *tq, void *arg)
 648 {
 649         ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
 650 
 651         ilbd_hc_kill_probe(srv);
 652         ilbd_set_fail_state(srv);
 653 }
 654 
 655 /*
 656  * Probe timeout handler.  Send out the appropriate probe.
 657  */
 658 /* ARGSUSED */
 659 static void
 660 ilbd_hc_probe_timer(iu_tq_t *tq, void *arg)
 661 {
 662         ilbd_hc_srv_t *srv = (ilbd_hc_srv_t *)arg;
 663 
 664         /*
 665          * If starting the probe fails, just pretend that the timeout has
 666          * extended.
 667          */
 668         if (!ilbd_run_probe(srv)) {
 669                 /*
 670                  * If we cannot restart the timer, the only thing we can do
 671                  * is to disable this server.  Hopefully the sys admin will
 672                  * notice this and enable this server again later.
 673                  */
 674                 if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
 675                         logerr("%s: cannot restart timer: rule %s server %s, "
 676                             "disabling it", __func__,
 677                             srv->shc_hc_rule->hcr_rule->irl_name,
 678                             srv->shc_sg_srv->sgs_srvID);
 679                         ilbd_mark_server_disabled(srv);
 680                 }
 681                 return;
 682         }
 683 
 684         /*
 685          * Similar to above, if kill timer cannot be started, disable the
 686          * server.
 687          */
 688         if ((srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q,
 689             srv->shc_hc->ihc_timeout, ilbd_hc_kill_timer, srv)) == -1) {
 690                 logerr("%s: cannot start kill timer: rule %s server %s, "
 691                     "disabling it", __func__,
 692                     srv->shc_hc_rule->hcr_rule->irl_name,
 693                     srv->shc_sg_srv->sgs_srvID);
 694                 ilbd_mark_server_disabled(srv);
 695         }
 696         hc_timer_restarted = B_TRUE;
 697 }
 698 
 699 /* Restart the periodic timer for a given server. */
 700 static ilb_status_t
 701 ilbd_hc_restart_timer(ilbd_hc_t *hc, ilbd_hc_srv_t *srv)
 702 {
 703         int timeout;
 704 
 705         /* Don't allow the timeout interval to be less than 1s */
 706         timeout = MAX((hc->ihc_interval >> 1) + (gethrtime() %
 707             (hc->ihc_interval + 1)), 1);
 708 
 709         /*
 710          * If the probe is actually a ping probe, there is no need to
 711          * do default pinging.  Just skip the step.
 712          */
 713         if (hc->ihc_def_ping && hc->ihc_test_type != ILBD_HC_PING)
 714                 srv->shc_state = ilbd_hc_def_pinging;
 715         else
 716                 srv->shc_state = ilbd_hc_probing;
 717         srv->shc_tid = iu_schedule_timer(ilbd_hc_timer_q, timeout,
 718             ilbd_hc_probe_timer, srv);
 719 
 720         if (srv->shc_tid == -1)
 721                 return (ILB_STATUS_TIMER);
 722         srv->shc_lasttime = time(NULL);
 723         srv->shc_nexttime = time(NULL) + timeout;
 724 
 725         hc_timer_restarted = B_TRUE;
 726         return (ILB_STATUS_OK);
 727 }
 728 
 729 /* Helper routine to associate a server with its hc object. */
 730 static ilb_status_t
 731 ilbd_hc_srv_add(ilbd_hc_t *hc, ilbd_hc_rule_t *hc_rule,
 732     const ilb_sg_srv_t *srv, int ev_port)
 733 {
 734         ilbd_hc_srv_t *new_srv;
 735         ilb_status_t ret;
 736 
 737         if ((new_srv = calloc(1, sizeof (ilbd_hc_srv_t))) == NULL)
 738                 return (ILB_STATUS_ENOMEM);
 739         new_srv->shc_hc = hc;
 740         new_srv->shc_hc_rule = hc_rule;
 741         new_srv->shc_sg_srv = srv;
 742         new_srv->shc_ev_port = ev_port;
 743         new_srv->shc_tid = -1;
 744         new_srv->shc_nexttime = time(NULL);
 745         new_srv->shc_lasttime = new_srv->shc_nexttime;
 746 
 747         if ((hc_rule->hcr_rule->irl_flags & ILB_FLAGS_RULE_ENABLED) &&
 748             ILB_IS_SRV_ENABLED(srv->sgs_flags)) {
 749                 new_srv->shc_status = ILB_HCS_UNINIT;
 750                 ret = ilbd_hc_restart_timer(hc, new_srv);
 751                 if (ret != ILB_STATUS_OK) {
 752                         free(new_srv);
 753                         return (ret);
 754                 }
 755         } else {
 756                 new_srv->shc_status = ILB_HCS_DISABLED;
 757         }
 758 
 759         list_insert_tail(&hc_rule->hcr_servers, new_srv);
 760         return (ILB_STATUS_OK);
 761 }
 762 
 763 /* Handy macro to cancel a server's timer. */
 764 #define HC_CANCEL_TIMER(srv)                                            \
 765 {                                                                       \
 766         void *arg;                                                      \
 767         int ret;                                                        \
 768         if ((srv)->shc_tid != -1) {                                  \
 769                 ret = iu_cancel_timer(ilbd_hc_timer_q, (srv)->shc_tid, &arg); \
 770                 (srv)->shc_tid = -1;                                 \
 771                 assert(ret == 1);                                       \
 772                 assert(arg == (srv));                                   \
 773         }                                                               \
 774         hc_timer_restarted = B_TRUE;                                    \
 775 }
 776 
 777 /* Helper routine to dissociate a server from its hc object. */
 778 static ilb_status_t
 779 ilbd_hc_srv_rem(ilbd_hc_rule_t *hc_rule, const ilb_sg_srv_t *srv)
 780 {
 781         ilbd_hc_srv_t *tmp_srv;
 782 
 783         for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
 784             tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
 785                 if (tmp_srv->shc_sg_srv == srv) {
 786                         list_remove(&hc_rule->hcr_servers, tmp_srv);
 787                         HC_CANCEL_TIMER(tmp_srv);
 788                         if (tmp_srv->shc_child_pid != 0)
 789                                 ilbd_hc_kill_probe(tmp_srv);
 790                         free(tmp_srv);
 791                         return (ILB_STATUS_OK);
 792                 }
 793         }
 794         return (ILB_STATUS_ENOENT);
 795 }
 796 
 797 /* Helper routine to dissociate all servers of a rule from its hc object. */
 798 static void
 799 ilbd_hc_srv_rem_all(ilbd_hc_rule_t *hc_rule)
 800 {
 801         ilbd_hc_srv_t *srv;
 802 
 803         while ((srv = list_remove_head(&hc_rule->hcr_servers)) != NULL) {
 804                 HC_CANCEL_TIMER(srv);
 805                 if (srv->shc_child_pid != 0)
 806                         ilbd_hc_kill_probe(srv);
 807                 free(srv);
 808         }
 809 }
 810 
 811 /* Associate a rule with its hc object. */
 812 ilb_status_t
 813 ilbd_hc_associate_rule(const ilbd_rule_t *rule, int ev_port)
 814 {
 815         ilbd_hc_t       *hc;
 816         ilbd_hc_rule_t  *hc_rule;
 817         ilb_status_t    ret;
 818         ilbd_sg_t       *sg;
 819         ilbd_srv_t      *ilbd_srv;
 820 
 821         /* The rule is assumed to be initialized appropriately. */
 822         if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
 823                 logdebug("ilbd_hc_associate_rule: healthcheck %s does not "
 824                     "exist", rule->irl_hcname);
 825                 return (ILB_STATUS_ENOHCINFO);
 826         }
 827         if ((hc->ihc_test_type == ILBD_HC_TCP &&
 828             rule->irl_proto != IPPROTO_TCP) ||
 829             (hc->ihc_test_type == ILBD_HC_UDP &&
 830             rule->irl_proto != IPPROTO_UDP)) {
 831                 return (ILB_STATUS_RULE_HC_MISMATCH);
 832         }
 833         if ((hc_rule = calloc(1, sizeof (ilbd_hc_rule_t))) == NULL) {
 834                 logdebug("ilbd_hc_associate_rule: out of memory");
 835                 return (ILB_STATUS_ENOMEM);
 836         }
 837 
 838         hc_rule->hcr_rule = rule;
 839         list_create(&hc_rule->hcr_servers, sizeof (ilbd_hc_srv_t),
 840             offsetof(ilbd_hc_srv_t, shc_srv_link));
 841 
 842         /* Add all the servers. */
 843         sg = rule->irl_sg;
 844         for (ilbd_srv = list_head(&sg->isg_srvlist); ilbd_srv != NULL;
 845             ilbd_srv = list_next(&sg->isg_srvlist, ilbd_srv)) {
 846                 if ((ret = ilbd_hc_srv_add(hc, hc_rule, &ilbd_srv->isv_srv,
 847                     ev_port)) != ILB_STATUS_OK) {
 848                         /* Remove all previously added servers */
 849                         ilbd_hc_srv_rem_all(hc_rule);
 850                         list_destroy(&hc_rule->hcr_servers);
 851                         free(hc_rule);
 852                         return (ret);
 853                 }
 854         }
 855         list_insert_tail(&hc->ihc_rules, hc_rule);
 856         hc->ihc_rule_cnt++;
 857 
 858         return (ILB_STATUS_OK);
 859 }
 860 
 861 /* Dissociate a rule from its hc object. */
 862 ilb_status_t
 863 ilbd_hc_dissociate_rule(const ilbd_rule_t *rule)
 864 {
 865         ilbd_hc_t       *hc;
 866         ilbd_hc_rule_t  *hc_rule;
 867 
 868         /* The rule is assumed to be initialized appropriately. */
 869         if ((hc = ilbd_get_hc(rule->irl_hcname)) == NULL) {
 870                 logdebug("ilbd_hc_dissociate_rule: healthcheck %s does not "
 871                     "exist", rule->irl_hcname);
 872                 return (ILB_STATUS_ENOENT);
 873         }
 874         for (hc_rule = list_head(&hc->ihc_rules); hc_rule != NULL;
 875             hc_rule = list_next(&hc->ihc_rules, hc_rule)) {
 876                 if (hc_rule->hcr_rule == rule)
 877                         break;
 878         }
 879         if (hc_rule == NULL) {
 880                 logdebug("ilbd_hc_dissociate_rule: rule %s is not associated "
 881                     "with healtcheck %s", rule->irl_hcname, hc->ihc_name);
 882                 return (ILB_STATUS_ENOENT);
 883         }
 884         ilbd_hc_srv_rem_all(hc_rule);
 885         list_remove(&hc->ihc_rules, hc_rule);
 886         hc->ihc_rule_cnt--;
 887         list_destroy(&hc_rule->hcr_servers);
 888         free(hc_rule);
 889         return (ILB_STATUS_OK);
 890 }
 891 
 892 /*
 893  * Given a hc object name and a rule, check to see if the rule is associated
 894  * with the hc object.  If it is, the hc object is returned in **hc and the
 895  * ilbd_hc_rule_t is returned in **hc_rule.
 896  */
 897 static boolean_t
 898 ilbd_hc_check_rule(const char *hc_name, const ilbd_rule_t *rule,
 899     ilbd_hc_t **hc, ilbd_hc_rule_t **hc_rule)
 900 {
 901         ilbd_hc_t       *tmp_hc;
 902         ilbd_hc_rule_t  *tmp_hc_rule;
 903 
 904         if ((tmp_hc = ilbd_get_hc(hc_name)) == NULL)
 905                 return (B_FALSE);
 906         for (tmp_hc_rule = list_head(&tmp_hc->ihc_rules); tmp_hc_rule != NULL;
 907             tmp_hc_rule = list_next(&tmp_hc->ihc_rules, tmp_hc_rule)) {
 908                 if (tmp_hc_rule->hcr_rule == rule) {
 909                         *hc = tmp_hc;
 910                         *hc_rule = tmp_hc_rule;
 911                         return (B_TRUE);
 912                 }
 913         }
 914         return (B_FALSE);
 915 }
 916 
 917 /* Associate a server with its hc object. */
 918 ilb_status_t
 919 ilbd_hc_add_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
 920     int ev_port)
 921 {
 922         ilbd_hc_t       *hc;
 923         ilbd_hc_rule_t  *hc_rule;
 924 
 925         if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
 926                 return (ILB_STATUS_ENOENT);
 927         return (ilbd_hc_srv_add(hc, hc_rule, srv, ev_port));
 928 }
 929 
 930 /* Dissociate a server from its hc object. */
 931 ilb_status_t
 932 ilbd_hc_del_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
 933 {
 934         ilbd_hc_t       *hc;
 935         ilbd_hc_rule_t  *hc_rule;
 936 
 937         if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
 938                 return (ILB_STATUS_ENOENT);
 939         return (ilbd_hc_srv_rem(hc_rule, srv));
 940 }
 941 
 942 /* Helper routine to enable/disable a server's hc probe. */
 943 static ilb_status_t
 944 ilbd_hc_toggle_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv,
 945     boolean_t enable)
 946 {
 947         ilbd_hc_t       *hc;
 948         ilbd_hc_rule_t  *hc_rule;
 949         ilbd_hc_srv_t   *tmp_srv;
 950         ilb_status_t    ret;
 951 
 952         if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
 953                 return (ILB_STATUS_ENOENT);
 954         for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
 955             tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
 956                 if (tmp_srv->shc_sg_srv != srv) {
 957                         continue;
 958                 }
 959                 if (enable) {
 960                         if (tmp_srv->shc_status == ILB_HCS_DISABLED) {
 961                                 ret = ilbd_hc_restart_timer(hc, tmp_srv);
 962                                 if (ret != ILB_STATUS_OK) {
 963                                         logerr("%s: cannot start timers for "
 964                                             "rule %s server %s", __func__,
 965                                             rule->irl_name,
 966                                             tmp_srv->shc_sg_srv->sgs_srvID);
 967                                         return (ret);
 968                                 }
 969                                 /* Start from fresh... */
 970                                 tmp_srv->shc_status = ILB_HCS_UNINIT;
 971                                 tmp_srv->shc_rtt = 0;
 972                                 tmp_srv->shc_fail_cnt = 0;
 973                         }
 974                 } else {
 975                         if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
 976                                 tmp_srv->shc_status = ILB_HCS_DISABLED;
 977                                 HC_CANCEL_TIMER(tmp_srv);
 978                                 if (tmp_srv->shc_child_pid != 0)
 979                                         ilbd_hc_kill_probe(tmp_srv);
 980                         }
 981                 }
 982                 return (ILB_STATUS_OK);
 983         }
 984         return (ILB_STATUS_ENOENT);
 985 }
 986 
 987 ilb_status_t
 988 ilbd_hc_enable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
 989 {
 990         return (ilbd_hc_toggle_server(rule, srv, B_TRUE));
 991 }
 992 
 993 ilb_status_t
 994 ilbd_hc_disable_server(const ilbd_rule_t *rule, const ilb_sg_srv_t *srv)
 995 {
 996         return (ilbd_hc_toggle_server(rule, srv, B_FALSE));
 997 }
 998 
 999 /*
1000  * Helper routine to enable/disable a rule's hc probe (including all its
1001  * servers).
1002  */
1003 static ilb_status_t
1004 ilbd_hc_toggle_rule(const ilbd_rule_t *rule, boolean_t enable)
1005 {
1006         ilbd_hc_t       *hc;
1007         ilbd_hc_rule_t  *hc_rule;
1008         ilbd_hc_srv_t   *tmp_srv;
1009         int             ret;
1010 
1011         if (!ilbd_hc_check_rule(rule->irl_hcname, rule, &hc, &hc_rule))
1012                 return (ILB_STATUS_ENOENT);
1013 
1014         for (tmp_srv = list_head(&hc_rule->hcr_servers); tmp_srv != NULL;
1015             tmp_srv = list_next(&hc_rule->hcr_servers, tmp_srv)) {
1016                 if (enable) {
1017                         /*
1018                          * If the server is disabled in the rule, do not
1019                          * restart its timer.
1020                          */
1021                         if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1022                             ILB_IS_SRV_ENABLED(
1023                             tmp_srv->shc_sg_srv->sgs_flags)) {
1024                                 ret = ilbd_hc_restart_timer(hc, tmp_srv);
1025                                 if (ret != ILB_STATUS_OK) {
1026                                         logerr("%s: cannot start timers for "
1027                                             "rule %s server %s", __func__,
1028                                             rule->irl_name,
1029                                             tmp_srv->shc_sg_srv->sgs_srvID);
1030                                         goto rollback;
1031                                 } else {
1032                                         /* Start from fresh... */
1033                                         tmp_srv->shc_status = ILB_HCS_UNINIT;
1034                                         tmp_srv->shc_rtt = 0;
1035                                         tmp_srv->shc_fail_cnt = 0;
1036                                 }
1037                         }
1038                 } else {
1039                         if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1040                                 HC_CANCEL_TIMER(tmp_srv);
1041                                 tmp_srv->shc_status = ILB_HCS_DISABLED;
1042                                 if (tmp_srv->shc_child_pid != 0)
1043                                         ilbd_hc_kill_probe(tmp_srv);
1044                         }
1045                 }
1046         }
1047         return (ILB_STATUS_OK);
1048 rollback:
1049         enable = !enable;
1050         for (tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv);
1051             tmp_srv != NULL;
1052             tmp_srv = list_prev(&hc_rule->hcr_servers, tmp_srv)) {
1053                 if (enable) {
1054                         if (tmp_srv->shc_status == ILB_HCS_DISABLED &&
1055                             ILB_IS_SRV_ENABLED(
1056                             tmp_srv->shc_sg_srv->sgs_flags)) {
1057                                 (void) ilbd_hc_restart_timer(hc, tmp_srv);
1058                                 tmp_srv->shc_status = ILB_HCS_UNINIT;
1059                                 tmp_srv->shc_rtt = 0;
1060                                 tmp_srv->shc_fail_cnt = 0;
1061                         }
1062                 } else {
1063                         if (tmp_srv->shc_status != ILB_HCS_DISABLED) {
1064                                 HC_CANCEL_TIMER(tmp_srv);
1065                                 tmp_srv->shc_status = ILB_HCS_DISABLED;
1066                                 if (tmp_srv->shc_child_pid != 0)
1067                                         ilbd_hc_kill_probe(tmp_srv);
1068                         }
1069                 }
1070         }
1071         return (ret);
1072 }
1073 
1074 ilb_status_t
1075 ilbd_hc_enable_rule(const ilbd_rule_t *rule)
1076 {
1077         return (ilbd_hc_toggle_rule(rule, B_TRUE));
1078 }
1079 
1080 ilb_status_t
1081 ilbd_hc_disable_rule(const ilbd_rule_t *rule)
1082 {
1083         return (ilbd_hc_toggle_rule(rule, B_FALSE));
1084 }
1085 
1086 static const char *
1087 topo_2_str(ilb_topo_t topo)
1088 {
1089         switch (topo) {
1090         case ILB_TOPO_DSR:
1091                 return ("DSR");
1092         case ILB_TOPO_NAT:
1093                 return ("NAT");
1094         case ILB_TOPO_HALF_NAT:
1095                 return ("HALF_NAT");
1096         default:
1097                 /* Should not happen. */
1098                 logerr("%s: unknown topology", __func__);
1099                 break;
1100         }
1101         return ("");
1102 }
1103 
1104 /*
1105  * Create the argument list to be passed to a hc probe command.
1106  * The passed in argv is assumed to have HC_PROBE_ARGC elements.
1107  */
1108 static boolean_t
1109 create_argv(ilbd_hc_srv_t *srv, char *argv[])
1110 {
1111         char buf[INET6_ADDRSTRLEN];
1112         ilbd_rule_t const *rule;
1113         ilb_sg_srv_t const *sg_srv;
1114         struct in_addr v4_addr;
1115         in_port_t port;
1116         int i;
1117 
1118         rule = srv->shc_hc_rule->hcr_rule;
1119         sg_srv = srv->shc_sg_srv;
1120 
1121         if (srv->shc_state == ilbd_hc_def_pinging) {
1122                 if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL)
1123                         return (B_FALSE);
1124         } else {
1125                 switch (srv->shc_hc->ihc_test_type) {
1126                 case ILBD_HC_USER:
1127                         if ((argv[0] = strdup(srv->shc_hc->ihc_test)) == NULL)
1128                                 return (B_FALSE);
1129                         break;
1130                 case ILBD_HC_TCP:
1131                 case ILBD_HC_UDP:
1132                         if ((argv[0] = strdup(ILB_PROBE_PROTO)) ==
1133                             NULL) {
1134                                 return (B_FALSE);
1135                         }
1136                         break;
1137                 case ILBD_HC_PING:
1138                         if ((argv[0] = strdup(ILB_PROBE_PING)) == NULL) {
1139                                 return (B_FALSE);
1140                         }
1141                         break;
1142                 }
1143         }
1144 
1145         /*
1146          * argv[1] is the VIP.
1147          *
1148          * Right now, the VIP and the backend server addresses should be
1149          * in the same IP address family.  Here we don't do that in case
1150          * this assumption is changed in future.
1151          */
1152         if (IN6_IS_ADDR_V4MAPPED(&rule->irl_vip)) {
1153                 IN6_V4MAPPED_TO_INADDR(&rule->irl_vip, &v4_addr);
1154                 if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1155                         goto cleanup;
1156         } else {
1157                 if (inet_ntop(AF_INET6, &rule->irl_vip, buf,
1158                     sizeof (buf)) == NULL) {
1159                         goto cleanup;
1160                 }
1161         }
1162         if ((argv[1] = strdup(buf)) == NULL)
1163                 goto cleanup;
1164 
1165         /*
1166          * argv[2] is the backend server address.
1167          */
1168         if (IN6_IS_ADDR_V4MAPPED(&sg_srv->sgs_addr)) {
1169                 IN6_V4MAPPED_TO_INADDR(&sg_srv->sgs_addr, &v4_addr);
1170                 if (inet_ntop(AF_INET, &v4_addr, buf, sizeof (buf)) == NULL)
1171                         goto cleanup;
1172         } else {
1173                 if (inet_ntop(AF_INET6, &sg_srv->sgs_addr, buf,
1174                     sizeof (buf)) == NULL) {
1175                         goto cleanup;
1176                 }
1177         }
1178         if ((argv[2] = strdup(buf)) == NULL)
1179                 goto cleanup;
1180 
1181         /*
1182          * argv[3] is the transport protocol used in the rule.
1183          */
1184         switch (rule->irl_proto) {
1185         case IPPROTO_TCP:
1186                 argv[3] = strdup("TCP");
1187                 break;
1188         case IPPROTO_UDP:
1189                 argv[3] = strdup("UDP");
1190                 break;
1191         default:
1192                 logerr("%s: unknown protocol", __func__);
1193                 goto cleanup;
1194         }
1195         if (argv[3] == NULL)
1196                 goto cleanup;
1197 
1198         /*
1199          * argv[4] is the load balance mode, DSR, NAT, HALF-NAT.
1200          */
1201         if ((argv[4] = strdup(topo_2_str(rule->irl_topo))) == NULL)
1202                 goto cleanup;
1203 
1204         /*
1205          * argv[5] is the port range.  Right now, there should only be 1 port.
1206          */
1207         switch (rule->irl_hcpflag) {
1208         case ILB_HCI_PROBE_FIX:
1209                 port = ntohs(rule->irl_hcport);
1210                 break;
1211         case ILB_HCI_PROBE_ANY: {
1212                 in_port_t min, max;
1213 
1214                 if (ntohs(sg_srv->sgs_minport) == 0) {
1215                         min = ntohs(rule->irl_minport);
1216                         max = ntohs(rule->irl_maxport);
1217                 } else {
1218                         min = ntohs(sg_srv->sgs_minport);
1219                         max = ntohs(sg_srv->sgs_maxport);
1220                 }
1221                 if (max > min)
1222                         port = min + gethrtime() % (max - min + 1);
1223                 else
1224                         port = min;
1225                 break;
1226         }
1227         default:
1228                 logerr("%s: unknown HC flag", __func__);
1229                 goto cleanup;
1230         }
1231         (void) sprintf(buf, "%d", port);
1232         if ((argv[5] = strdup(buf)) == NULL)
1233                 goto cleanup;
1234 
1235         /*
1236          * argv[6] is the probe timeout.
1237          */
1238         (void) sprintf(buf, "%d", srv->shc_hc->ihc_timeout);
1239         if ((argv[6] = strdup(buf)) == NULL)
1240                 goto cleanup;
1241 
1242         argv[7] = NULL;
1243         return (B_TRUE);
1244 
1245 cleanup:
1246         for (i = 0; i < HC_PROBE_ARGC; i++) {
1247                 if (argv[i] != NULL)
1248                         free(argv[i]);
1249         }
1250         return (B_FALSE);
1251 }
1252 
/*
 * Free the strings of a NULL terminated argument list.  Each slot is reset
 * to NULL after it is freed, so calling this again on the same array (as
 * error paths may do) is harmless instead of being a double free.
 */
static void
destroy_argv(char *argv[])
{
        int i;

        for (i = 0; argv[i] != NULL; i++) {
                free(argv[i]);
                argv[i] = NULL;
        }
}
1261 
1262 /* Spawn a process to run the hc probe on the given server. */
1263 static boolean_t
1264 ilbd_run_probe(ilbd_hc_srv_t *srv)
1265 {
1266         posix_spawn_file_actions_t      fd_actions;
1267         posix_spawnattr_t               attr;
1268         sigset_t                        child_sigset;
1269         int                             fds[2];
1270         int                             fdflags;
1271         pid_t                           pid;
1272         char                            *child_argv[HC_PROBE_ARGC];
1273         ilbd_hc_probe_event_t           *probe_ev;
1274         char                            *probe_name;
1275 
1276         bzero(child_argv, HC_PROBE_ARGC * sizeof (char *));
1277         if ((probe_ev = calloc(1, sizeof (*probe_ev))) == NULL) {
1278                 logdebug("ilbd_run_probe: calloc");
1279                 return (B_FALSE);
1280         }
1281 
1282         /* Set up a pipe to get output from probe command. */
1283         if (pipe(fds) < 0) {
1284                 logdebug("ilbd_run_probe: cannot create pipe");
1285                 free(probe_ev);
1286                 return (B_FALSE);
1287         }
1288         /* Set our side of the pipe to be non-blocking */
1289         if ((fdflags = fcntl(fds[0], F_GETFL, 0)) == -1) {
1290                 logdebug("ilbd_run_probe: fcntl(F_GETFL)");
1291                 goto cleanup;
1292         }
1293         if (fcntl(fds[0], F_SETFL, fdflags | O_NONBLOCK) == -1) {
1294                 logdebug("ilbd_run_probe: fcntl(F_SETFL)");
1295                 goto cleanup;
1296         }
1297 
1298         if (posix_spawn_file_actions_init(&fd_actions) != 0) {
1299                 logdebug("ilbd_run_probe: posix_spawn_file_actions_init");
1300                 goto cleanup;
1301         }
1302         if (posix_spawnattr_init(&attr) != 0) {
1303                 logdebug("ilbd_run_probe: posix_spawnattr_init");
1304                 goto cleanup;
1305         }
1306         if (posix_spawn_file_actions_addclose(&fd_actions, fds[0]) != 0) {
1307                 logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1308                 goto cleanup;
1309         }
1310         if (posix_spawn_file_actions_adddup2(&fd_actions, fds[1],
1311             STDOUT_FILENO) != 0) {
1312                 logdebug("ilbd_run_probe: posix_spawn_file_actions_dup2");
1313                 goto cleanup;
1314         }
1315         if (posix_spawn_file_actions_addclose(&fd_actions, fds[1]) != 0) {
1316                 logdebug("ilbd_run_probe: posix_spawn_file_actions_addclose");
1317                 goto cleanup;
1318         }
1319 
1320         /* Reset all signal handling of the child to default. */
1321         (void) sigfillset(&child_sigset);
1322         if (posix_spawnattr_setsigdefault(&attr, &child_sigset) != 0) {
1323                 logdebug("ilbd_run_probe: posix_spawnattr_setsigdefault");
1324                 goto cleanup;
1325         }
1326         /* Don't want SIGCHLD. */
1327         if (posix_spawnattr_setflags(&attr, POSIX_SPAWN_NOSIGCHLD_NP|
1328             POSIX_SPAWN_SETSIGDEF) != 0) {
1329                 logdebug("ilbd_run_probe: posix_spawnattr_setflags");
1330                 goto cleanup;
1331         }
1332 
1333         if (!create_argv(srv, child_argv)) {
1334                 logdebug("ilbd_run_probe: create_argv");
1335                 goto cleanup;
1336         }
1337 
1338         /*
1339          * If we are doing default pinging or not using a user supplied
1340          * probe, we should execute our standard supplied probe.  The
1341          * supplied probe command handles all types of probes.  And the
1342          * type used depends on argv[0], as filled in by create_argv().
1343          */
1344         if (srv->shc_state == ilbd_hc_def_pinging ||
1345             srv->shc_hc->ihc_test_type != ILBD_HC_USER) {
1346                 probe_name = ILB_PROBE_PROTO;
1347         } else {
1348                 probe_name = srv->shc_hc->ihc_test;
1349         }
1350         if (posix_spawn(&pid, probe_name, &fd_actions, &attr, child_argv,
1351             NULL) != 0) {
1352                 logerr("%s: posix_spawn: %s for server %s: %s", __func__,
1353                     srv->shc_hc->ihc_test, srv->shc_sg_srv->sgs_srvID,
1354                     strerror(errno));
1355                 goto cleanup;
1356         }
1357 
1358         (void) close(fds[1]);
1359         destroy_argv(child_argv);
1360         srv->shc_child_pid = pid;
1361         srv->shc_child_fd = fds[0];
1362         srv->shc_ev = probe_ev;
1363 
1364         probe_ev->ihp_ev = ILBD_EVENT_PROBE;
1365         probe_ev->ihp_srv = srv;
1366         probe_ev->ihp_pid = pid;
1367         if (port_associate(srv->shc_ev_port, PORT_SOURCE_FD, fds[0],
1368             POLLRDNORM, probe_ev) != 0) {
1369                 /*
1370                  * Need to kill the child.  It will free the srv->shc_ev,
1371                  * which is probe_ev.  So set probe_ev to NULL.
1372                  */
1373                 ilbd_hc_kill_probe(srv);
1374                 probe_ev = NULL;
1375                 goto cleanup;
1376         }
1377 
1378         return (B_TRUE);
1379 
1380 cleanup:
1381         (void) close(fds[0]);
1382         (void) close(fds[1]);
1383         destroy_argv(child_argv);
1384         if (probe_ev != NULL)
1385                 free(probe_ev);
1386         return (B_FALSE);
1387 }
1388 
1389 /*
1390  * Called by ild_hc_probe_return() to re-associate the fd to a child to
1391  * the event port.
1392  */
1393 static void
1394 reassociate_port(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1395 {
1396         if (port_associate(ev_port, PORT_SOURCE_FD, fd,
1397             POLLRDNORM, ev) != 0) {
1398                 /*
1399                  * If we cannot reassociate with the port, the only
1400                  * thing we can do now is to kill the child and
1401                  * do a blocking wait here...
1402                  */
1403                 logdebug("%s: port_associate: %s", __func__, strerror(errno));
1404                 if (kill(ev->ihp_pid, SIGKILL) != 0)
1405                         logerr("%s: kill: %s", __func__, strerror(errno));
1406                 if (waitpid(ev->ihp_pid, NULL, 0) != ev->ihp_pid)
1407                         logdebug("%s: waitpid: %s", __func__, strerror(errno));
1408                 free(ev);
1409         }
1410 }
1411 
1412 /*
1413  * To handle a child probe process hanging up.
1414  */
1415 static void
1416 ilbd_hc_child_hup(int ev_port, int fd, ilbd_hc_probe_event_t *ev)
1417 {
1418         ilbd_hc_srv_t *srv;
1419         pid_t ret_pid;
1420         int ret;
1421 
1422         srv = ev->ihp_srv;
1423 
1424         if (!ev->ihp_done) {
1425                 /* ilbd does not care about this process anymore ... */
1426                 ev->ihp_done = B_TRUE;
1427                 srv->shc_ev = NULL;
1428                 srv->shc_child_pid = 0;
1429                 HC_CANCEL_TIMER(srv);
1430                 ilbd_set_fail_state(srv);
1431         }
1432         ret_pid = waitpid(ev->ihp_pid, &ret, WNOHANG);
1433         switch (ret_pid) {
1434         case -1:
1435                 logperror("ilbd_hc_child_hup: waitpid");
1436                 /* FALLTHROUGH */
1437         case 0:
1438                 /* The child has not completed the exit. Wait again. */
1439                 reassociate_port(ev_port, fd, ev);
1440                 break;
1441         default:
1442                 /* Right now, we just ignore the exit status. */
1443                 if (WIFEXITED(ret))
1444                         ret = WEXITSTATUS(ret);
1445                 (void) close(fd);
1446                 free(ev);
1447         }
1448 }
1449 
1450 /*
1451  * To read the output of a child probe process.
1452  */
1453 static void
1454 ilbd_hc_child_data(int fd, ilbd_hc_probe_event_t *ev)
1455 {
1456         ilbd_hc_srv_t *srv;
1457         char buf[HC_MAX_PROBE_OUTPUT];
1458         int ret;
1459         int64_t rtt;
1460 
1461         srv = ev->ihp_srv;
1462 
1463         bzero(buf, HC_MAX_PROBE_OUTPUT);
1464         ret = read(fd, buf, HC_MAX_PROBE_OUTPUT - 1);
1465         /* Should not happen since event port should have caught this. */
1466         assert(ret > 0);
1467 
1468         /*
1469          * We expect the probe command to print out the RTT only.  But
1470          * the command may misbehave and print out more than what we intend to
1471          * read in.  So need to do this check below to "flush" out all the
1472          * output from the command.
1473          */
1474         if (!ev->ihp_done) {
1475                 ev->ihp_done = B_TRUE;
1476                 /* We don't need to know about this event anymore. */
1477                 srv->shc_ev = NULL;
1478                 srv->shc_child_pid = 0;
1479                 HC_CANCEL_TIMER(srv);
1480         } else {
1481                 return;
1482         }
1483 
1484         rtt = strtoll(buf, NULL, 10);
1485 
1486         /*
1487          * -1 means the server is dead or the probe somehow fails.  Treat
1488          * them both as server is dead.
1489          */
1490         if (rtt == -1) {
1491                 ilbd_set_fail_state(srv);
1492                 return;
1493         } else if (rtt > 0) {
1494                 /* If the returned RTT value is not valid, just ignore it. */
1495                 if (rtt > 0 && rtt <= UINT_MAX) {
1496                         /* Set rtt to be the simple smoothed average. */
1497                         if (srv->shc_rtt == 0) {
1498                                 srv->shc_rtt = rtt;
1499                         } else {
1500                                 srv->shc_rtt = 3 * ((srv)->shc_rtt >> 2) +
1501                                     (rtt >> 2);
1502                         }
1503                 }
1504 
1505         }
1506 
1507         switch (srv->shc_state) {
1508         case ilbd_hc_def_pinging:
1509                 srv->shc_state = ilbd_hc_probing;
1510 
1511                 /* Ping is OK, now start the probe. */
1512                 ilbd_hc_probe_timer(ilbd_hc_timer_q, srv);
1513                 break;
1514         case ilbd_hc_probing:
1515                 srv->shc_fail_cnt = 0;
1516 
1517                 /* Server is dead before, re-enable it. */
1518                 if (srv->shc_status == ILB_HCS_UNREACH ||
1519                     srv->shc_status == ILB_HCS_DEAD) {
1520                         /*
1521                          * If enabling the server in kernel fails now,
1522                          * hopefully when the timer fires again later, the
1523                          * enabling can be done.
1524                          */
1525                         if (ilbd_k_Xable_server(&srv->shc_sg_srv->sgs_addr,
1526                             srv->shc_hc_rule->hcr_rule->irl_name,
1527                             stat_declare_srv_alive) != ILB_STATUS_OK) {
1528                                 logerr("%s: cannot enable server in kernel: "
1529                                     " rule %s server %s", __func__,
1530                                     srv->shc_hc_rule->hcr_rule->irl_name,
1531                                     srv->shc_sg_srv->sgs_srvID);
1532                         } else {
1533                                 srv->shc_status = ILB_HCS_ALIVE;
1534                         }
1535                 } else {
1536                         srv->shc_status = ILB_HCS_ALIVE;
1537                 }
1538                 if (ilbd_hc_restart_timer(srv->shc_hc, srv) != ILB_STATUS_OK) {
1539                         logerr("%s: cannot restart timer: rule %s server %s",
1540                             __func__, srv->shc_hc_rule->hcr_rule->irl_name,
1541                             srv->shc_sg_srv->sgs_srvID);
1542                         ilbd_mark_server_disabled(srv);
1543                 }
1544                 break;
1545         default:
1546                 logdebug("%s: unknown state", __func__);
1547                 break;
1548         }
1549 }
1550 
1551 /*
1552  * Handle the return event of a child probe fd.
1553  */
1554 void
1555 ilbd_hc_probe_return(int ev_port, int fd, int port_events,
1556     ilbd_hc_probe_event_t *ev)
1557 {
1558         /*
1559          * Note that there can be more than one events delivered to us at
1560          * the same time.  So we need to check them individually.
1561          */
1562         if (port_events & POLLRDNORM)
1563                 ilbd_hc_child_data(fd, ev);
1564 
1565         if (port_events & (POLLHUP|POLLERR)) {
1566                 ilbd_hc_child_hup(ev_port, fd, ev);
1567                 return;
1568         }
1569 
1570         /*
1571          * Re-associate the fd with the port so that when the child
1572          * exits, we can reap the status.
1573          */
1574         reassociate_port(ev_port, fd, ev);
1575 }