1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2015 Garrett D'Amore <garrett@damore.org>
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/stropts.h>
  31 #include <sys/errno.h>
  32 #include <sys/strlog.h>
  33 #include <sys/tihdr.h>
  34 #include <sys/socket.h>
  35 #include <sys/ddi.h>
  36 #include <sys/sunddi.h>
  37 #include <sys/kmem.h>
  38 #include <sys/zone.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/cmn_err.h>
  41 #include <sys/vtrace.h>
  42 #include <sys/debug.h>
  43 #include <sys/atomic.h>
  44 #include <sys/strsun.h>
  45 #include <sys/random.h>
  46 #include <netinet/in.h>
  47 #include <net/if.h>
  48 #include <netinet/ip6.h>
  49 #include <net/pfkeyv2.h>
  50 #include <net/pfpolicy.h>
  51 
  52 #include <inet/common.h>
  53 #include <inet/mi.h>
  54 #include <inet/nd.h>
  55 #include <inet/ip.h>
  56 #include <inet/ip_impl.h>
  57 #include <inet/ip6.h>
  58 #include <inet/ip_if.h>
  59 #include <inet/ip_ndp.h>
  60 #include <inet/sadb.h>
  61 #include <inet/ipsec_info.h>
  62 #include <inet/ipsec_impl.h>
  63 #include <inet/ipsecesp.h>
  64 #include <inet/ipdrop.h>
  65 #include <inet/tcp.h>
  66 #include <sys/kstat.h>
  67 #include <sys/policy.h>
  68 #include <sys/strsun.h>
  69 #include <sys/strsubr.h>
  70 #include <inet/udp_impl.h>
  71 #include <sys/taskq.h>
  72 #include <sys/note.h>
  73 
  74 #include <sys/tsol/tnet.h>
  75 
  76 /*
  77  * Table of ND variables supported by ipsecesp. These are loaded into
   78  * each stack's ipsecesp_g_nd in ipsecesp_stack_init().
  79  * All of these are alterable, within the min/max values given, at run time.
  80  */
  81 static  ipsecespparam_t lcl_param_arr[] = {
  82         /* min  max                     value   name */
  83         { 0,    3,                      0,      "ipsecesp_debug"},
  84         { 125,  32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
  85         { 1,    10,                     1,      "ipsecesp_reap_delay"},
  86         { 1,    SADB_MAX_REPLAY,        64,     "ipsecesp_replay_size"},
  87         { 1,    300,                    15,     "ipsecesp_acquire_timeout"},
  88         { 1,    1800,                   90,     "ipsecesp_larval_timeout"},
  89         /* Default lifetime values for ACQUIRE messages. */
  90         { 0,    0xffffffffU,    0,      "ipsecesp_default_soft_bytes"},
  91         { 0,    0xffffffffU,    0,      "ipsecesp_default_hard_bytes"},
  92         { 0,    0xffffffffU,    24000,  "ipsecesp_default_soft_addtime"},
  93         { 0,    0xffffffffU,    28800,  "ipsecesp_default_hard_addtime"},
  94         { 0,    0xffffffffU,    0,      "ipsecesp_default_soft_usetime"},
  95         { 0,    0xffffffffU,    0,      "ipsecesp_default_hard_usetime"},
  96         { 0,    1,              0,      "ipsecesp_log_unknown_spi"},
  97         { 0,    2,              1,      "ipsecesp_padding_check"},
  98         { 0,    600,            20,     "ipsecesp_nat_keepalive_interval"},
  99 };
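/*
 * At run time these parameters are typically examined and tuned through
 * the named-dispatch interface registered below, e.g. something like
 * "ndd -set /dev/ipsecesp ipsecesp_debug 1" (illustrative usage only);
 * ipsecesp_param_set() rejects values outside the min/max bounds above
 * with EINVAL.
 */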
 100 #define ipsecesp_debug  ipsecesp_params[0].ipsecesp_param_value
 101 #define ipsecesp_age_interval ipsecesp_params[1].ipsecesp_param_value
 102 #define ipsecesp_age_int_max    ipsecesp_params[1].ipsecesp_param_max
 103 #define ipsecesp_reap_delay     ipsecesp_params[2].ipsecesp_param_value
 104 #define ipsecesp_replay_size    ipsecesp_params[3].ipsecesp_param_value
 105 #define ipsecesp_acquire_timeout        \
 106         ipsecesp_params[4].ipsecesp_param_value
 107 #define ipsecesp_larval_timeout \
 108         ipsecesp_params[5].ipsecesp_param_value
 109 #define ipsecesp_default_soft_bytes     \
 110         ipsecesp_params[6].ipsecesp_param_value
 111 #define ipsecesp_default_hard_bytes     \
 112         ipsecesp_params[7].ipsecesp_param_value
 113 #define ipsecesp_default_soft_addtime   \
 114         ipsecesp_params[8].ipsecesp_param_value
 115 #define ipsecesp_default_hard_addtime   \
 116         ipsecesp_params[9].ipsecesp_param_value
 117 #define ipsecesp_default_soft_usetime   \
 118         ipsecesp_params[10].ipsecesp_param_value
 119 #define ipsecesp_default_hard_usetime   \
 120         ipsecesp_params[11].ipsecesp_param_value
 121 #define ipsecesp_log_unknown_spi        \
 122         ipsecesp_params[12].ipsecesp_param_value
 123 #define ipsecesp_padding_check  \
 124         ipsecesp_params[13].ipsecesp_param_value
 125 /* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
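/*
 * The array indices used by the accessor macros above must stay in step
 * with the ordering of lcl_param_arr; each stack's ipsecesp_params[] is a
 * bcopy of that template (see ipsecesp_stack_init()).
 */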
 126 
 127 #define esp0dbg(a)      printf a
 128 /* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
 129 #define esp1dbg(espstack, a)    if (espstack->ipsecesp_debug != 0) printf a
 130 #define esp2dbg(espstack, a)    if (espstack->ipsecesp_debug > 1) printf a
 131 #define esp3dbg(espstack, a)    if (espstack->ipsecesp_debug > 2) printf a
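/*
 * The esp[123]dbg() macros take a parenthesized printf() argument list,
 * e.g. esp1dbg(espstack, ("bad SPI 0x%x\n", spi));  ("spi" here is just an
 * illustrative local.)  The printf() only fires when ipsecesp_debug is at
 * or above the corresponding level.
 */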
 132 
 133 static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
 134 static int ipsecesp_close(queue_t *);
 135 static void ipsecesp_wput(queue_t *, mblk_t *);
 136 static void     *ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
 137 static void     ipsecesp_stack_fini(netstackid_t stackid, void *arg);
 138 static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);
 139 
 140 static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
 141 static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *);
 142 static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *);
 143 
 144 static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
 145     ipsecesp_stack_t *, cred_t *);
 146 static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
 147     kstat_named_t **, ipsecesp_stack_t *);
 148 static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *,
 149     ipsa_t *, uint_t);
 150 static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *,
 151     ipsa_t *, uchar_t *, uint_t);
 152 
  153 /* Settable in /etc/system */
 154 uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
 155 
 156 static struct module_info info = {
 157         5137, "ipsecesp", 0, INFPSZ, 65536, 1024
 158 };
 159 
 160 static struct qinit rinit = {
 161         (pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
 162         NULL
 163 };
 164 
 165 static struct qinit winit = {
 166         (pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
 167         NULL
 168 };
 169 
 170 struct streamtab ipsecespinfo = {
 171         &rinit, &winit, NULL, NULL
 172 };
 173 
 174 static taskq_t *esp_taskq;
 175 
 176 /*
 177  * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
 178  *
  179  * Question:    Do I need this, given that all instances' esps->esps_wq point
 180  *              to IP?
 181  *
 182  * Answer:      Yes, because I need to know which queue is BOUND to
 183  *              IPPROTO_ESP
 184  */
 185 
 186 /*
 187  * Stats.  This may eventually become a full-blown SNMP MIB once that spec
 188  * stabilizes.
 189  */
 190 
 191 typedef struct esp_kstats_s {
 192         kstat_named_t esp_stat_num_aalgs;
 193         kstat_named_t esp_stat_good_auth;
 194         kstat_named_t esp_stat_bad_auth;
 195         kstat_named_t esp_stat_bad_padding;
 196         kstat_named_t esp_stat_replay_failures;
 197         kstat_named_t esp_stat_replay_early_failures;
 198         kstat_named_t esp_stat_keysock_in;
 199         kstat_named_t esp_stat_out_requests;
 200         kstat_named_t esp_stat_acquire_requests;
 201         kstat_named_t esp_stat_bytes_expired;
 202         kstat_named_t esp_stat_out_discards;
 203         kstat_named_t esp_stat_crypto_sync;
 204         kstat_named_t esp_stat_crypto_async;
 205         kstat_named_t esp_stat_crypto_failures;
 206         kstat_named_t esp_stat_num_ealgs;
 207         kstat_named_t esp_stat_bad_decrypt;
 208         kstat_named_t esp_stat_sa_port_renumbers;
 209 } esp_kstats_t;
 210 
 211 /*
 212  * espstack->esp_kstats is equal to espstack->esp_ksp->ks_data if
 213  * kstat_create_netstack for espstack->esp_ksp succeeds, but when it
 214  * fails, it will be NULL. Note this is done for all stack instances,
  215  * so it *could* fail; hence the non-NULL check in
  216  * ESP_BUMP_STAT and ESP_DEBUMP_STAT.
 217  */
 218 #define ESP_BUMP_STAT(espstack, x)                                      \
 219 do {                                                                    \
 220         if (espstack->esp_kstats != NULL)                            \
 221                 (espstack->esp_kstats->esp_stat_ ## x).value.ui64++;      \
 222 _NOTE(CONSTCOND)                                                        \
 223 } while (0)
 224 
 225 #define ESP_DEBUMP_STAT(espstack, x)                                    \
 226 do {                                                                    \
 227         if (espstack->esp_kstats != NULL)                            \
 228                 (espstack->esp_kstats->esp_stat_ ## x).value.ui64--;      \
 229 _NOTE(CONSTCOND)                                                        \
 230 } while (0)
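/*
 * For example, ESP_BUMP_STAT(espstack, replay_early_failures) increments
 * esp_stat_replay_early_failures.value.ui64, and is a no-op when the
 * kstat allocation failed (esp_kstats == NULL).
 */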
 231 
 232 static int      esp_kstat_update(kstat_t *, int);
 233 
 234 static boolean_t
 235 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
 236 {
 237         espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
 238             "net", KSTAT_TYPE_NAMED,
 239             sizeof (esp_kstats_t) / sizeof (kstat_named_t),
 240             KSTAT_FLAG_PERSISTENT, stackid);
 241 
 242         if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
 243                 return (B_FALSE);
 244 
 245         espstack->esp_kstats = espstack->esp_ksp->ks_data;
 246 
 247         espstack->esp_ksp->ks_update = esp_kstat_update;
 248         espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
 249 
 250 #define K64 KSTAT_DATA_UINT64
 251 #define KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
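/*
 * For example, KI(good_auth) expands to
 * kstat_named_init(&(espstack->esp_kstats->esp_stat_good_auth),
 *     "good_auth", KSTAT_DATA_UINT64);
 */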
 252 
 253         KI(num_aalgs);
 254         KI(num_ealgs);
 255         KI(good_auth);
 256         KI(bad_auth);
 257         KI(bad_padding);
 258         KI(replay_failures);
 259         KI(replay_early_failures);
 260         KI(keysock_in);
 261         KI(out_requests);
 262         KI(acquire_requests);
 263         KI(bytes_expired);
 264         KI(out_discards);
 265         KI(crypto_sync);
 266         KI(crypto_async);
 267         KI(crypto_failures);
 268         KI(bad_decrypt);
 269         KI(sa_port_renumbers);
 270 
 271 #undef KI
 272 #undef K64
 273 
 274         kstat_install(espstack->esp_ksp);
 275 
 276         return (B_TRUE);
 277 }
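/*
 * The counters set up above should be visible from userland as the
 * "ipsecesp:0:esp_stat" kstat (e.g. via "kstat -m ipsecesp -n esp_stat"),
 * with the algorithm counts refreshed on each read by esp_kstat_update()
 * below.
 */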
 278 
 279 static int
 280 esp_kstat_update(kstat_t *kp, int rw)
 281 {
 282         esp_kstats_t *ekp;
 283         netstackid_t    stackid = (zoneid_t)(uintptr_t)kp->ks_private;
 284         netstack_t      *ns;
 285         ipsec_stack_t   *ipss;
 286 
 287         if ((kp == NULL) || (kp->ks_data == NULL))
 288                 return (EIO);
 289 
 290         if (rw == KSTAT_WRITE)
 291                 return (EACCES);
 292 
 293         ns = netstack_find_by_stackid(stackid);
 294         if (ns == NULL)
 295                 return (-1);
 296         ipss = ns->netstack_ipsec;
 297         if (ipss == NULL) {
 298                 netstack_rele(ns);
 299                 return (-1);
 300         }
 301         ekp = (esp_kstats_t *)kp->ks_data;
 302 
 303         mutex_enter(&ipss->ipsec_alg_lock);
 304         ekp->esp_stat_num_aalgs.value.ui64 =
 305             ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
 306         ekp->esp_stat_num_ealgs.value.ui64 =
 307             ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
 308         mutex_exit(&ipss->ipsec_alg_lock);
 309 
 310         netstack_rele(ns);
 311         return (0);
 312 }
 313 
 314 #ifdef DEBUG
 315 /*
 316  * Debug routine, useful to see pre-encryption data.
 317  */
 318 static char *
 319 dump_msg(mblk_t *mp)
 320 {
 321         char tmp_str[3], tmp_line[256];
 322 
 323         while (mp != NULL) {
 324                 unsigned char *ptr;
 325 
 326                 printf("mblk address 0x%p, length %ld, db_ref %d "
 327                     "type %d, base 0x%p, lim 0x%p\n",
 328                     (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
 329                     mp->b_datap->db_ref, mp->b_datap->db_type,
 330                     (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
 331                 ptr = mp->b_rptr;
 332 
 333                 tmp_line[0] = '\0';
 334                 while (ptr < mp->b_wptr) {
 335                         uint_t diff;
 336 
 337                         diff = (ptr - mp->b_rptr);
 338                         if (!(diff & 0x1f)) {
 339                                 if (strlen(tmp_line) > 0) {
 340                                         printf("bytes: %s\n", tmp_line);
 341                                         tmp_line[0] = '\0';
 342                                 }
 343                         }
 344                         if (!(diff & 0x3))
 345                                 (void) strcat(tmp_line, " ");
 346                         (void) sprintf(tmp_str, "%02x", *ptr);
 347                         (void) strcat(tmp_line, tmp_str);
 348                         ptr++;
 349                 }
 350                 if (strlen(tmp_line) > 0)
 351                         printf("bytes: %s\n", tmp_line);
 352 
 353                 mp = mp->b_cont;
 354         }
 355 
 356         return ("\n");
 357 }
 358 
 359 #else /* DEBUG */
 360 static char *
 361 dump_msg(mblk_t *mp)
 362 {
 363         printf("Find value of mp %p.\n", mp);
 364         return ("\n");
 365 }
 366 #endif /* DEBUG */
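/*
 * dump_msg() is meant to be invoked under esp2dbg() (see, for example,
 * the end of esp_strip_header()), so its output only appears when
 * ipsecesp_debug is 2 or higher.
 */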
 367 
 368 /*
  369  * No need to lock age_interval: only one thread will access it at a
  370  * time, because the one function that does so is driven by timeout().
 371  */
 372 static void
 373 esp_ager(void *arg)
 374 {
 375         ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
 376         netstack_t      *ns = espstack->ipsecesp_netstack;
 377         hrtime_t begin = gethrtime();
 378 
 379         sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
 380             espstack->ipsecesp_reap_delay, ns);
 381         sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
 382             espstack->ipsecesp_reap_delay, ns);
 383 
 384         espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
 385             esp_ager, espstack,
 386             &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
 387             info.mi_idnum);
 388 }
 389 
 390 /*
 391  * Get an ESP NDD parameter.
 392  */
 393 /* ARGSUSED */
  394 static int
  395 ipsecesp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
  396 {
 401         ipsecespparam_t *ipsecesppa = (ipsecespparam_t *)cp;
 402         uint_t value;
 403         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
 404 
 405         mutex_enter(&espstack->ipsecesp_param_lock);
 406         value = ipsecesppa->ipsecesp_param_value;
 407         mutex_exit(&espstack->ipsecesp_param_lock);
 408 
 409         (void) mi_mpprintf(mp, "%u", value);
 410         return (0);
 411 }
 412 
 413 /*
  414  * This routine sets an NDD variable in an ipsecespparam_t structure.
 415  */
 416 /* ARGSUSED */
 417 static int
  418 ipsecesp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
  419     cred_t *cr)
  420 {
 425         ulong_t new_value;
 426         ipsecespparam_t *ipsecesppa = (ipsecespparam_t *)cp;
 427         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
 428 
 429         /*
 430          * Fail the request if the new value does not lie within the
 431          * required bounds.
 432          */
 433         if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
 434             new_value < ipsecesppa->ipsecesp_param_min ||
 435             new_value > ipsecesppa->ipsecesp_param_max) {
 436                 return (EINVAL);
 437         }
 438 
 439         /* Set the new value */
 440         mutex_enter(&espstack->ipsecesp_param_lock);
 441         ipsecesppa->ipsecesp_param_value = new_value;
 442         mutex_exit(&espstack->ipsecesp_param_lock);
 443         return (0);
 444 }
 445 
 446 /*
 447  * Using lifetime NDD variables, fill in an extended combination's
 448  * lifetime information.
 449  */
 450 void
 451 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
 452 {
 453         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
 454 
 455         ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
 456         ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
 457         ecomb->sadb_x_ecomb_soft_addtime =
 458             espstack->ipsecesp_default_soft_addtime;
 459         ecomb->sadb_x_ecomb_hard_addtime =
 460             espstack->ipsecesp_default_hard_addtime;
 461         ecomb->sadb_x_ecomb_soft_usetime =
 462             espstack->ipsecesp_default_soft_usetime;
 463         ecomb->sadb_x_ecomb_hard_usetime =
 464             espstack->ipsecesp_default_hard_usetime;
 465 }
 466 
 467 /*
 468  * Initialize things for ESP at module load time.
 469  */
 470 boolean_t
 471 ipsecesp_ddi_init(void)
 472 {
 473         esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
 474             IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
 475 
 476         /*
 477          * We want to be informed each time a stack is created or
 478          * destroyed in the kernel, so we can maintain the
 479          * set of ipsecesp_stack_t's.
 480          */
 481         netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
 482             ipsecesp_stack_fini);
 483 
 484         return (B_TRUE);
 485 }
 486 
 487 /*
 488  * Walk through the param array specified registering each element with the
 489  * named dispatch handler.
 490  */
 491 static boolean_t
 492 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
 493 {
 494         for (; cnt-- > 0; espp++) {
 495                 if (espp->ipsecesp_param_name != NULL &&
 496                     espp->ipsecesp_param_name[0]) {
 497                         if (!nd_load(ndp,
 498                             espp->ipsecesp_param_name,
 499                             ipsecesp_param_get, ipsecesp_param_set,
 500                             (caddr_t)espp)) {
 501                                 nd_free(ndp);
 502                                 return (B_FALSE);
 503                         }
 504                 }
 505         }
 506         return (B_TRUE);
 507 }
 508 /*
 509  * Initialize things for ESP for each stack instance
 510  */
 511 static void *
 512 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
 513 {
 514         ipsecesp_stack_t        *espstack;
 515         ipsecespparam_t         *espp;
 516 
 517         espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
 518             KM_SLEEP);
 519         espstack->ipsecesp_netstack = ns;
 520 
 521         espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
 522         espstack->ipsecesp_params = espp;
 523         bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
 524 
 525         (void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
 526             A_CNT(lcl_param_arr));
 527 
 528         (void) esp_kstat_init(espstack, stackid);
 529 
 530         espstack->esp_sadb.s_acquire_timeout =
 531             &espstack->ipsecesp_acquire_timeout;
 532         espstack->esp_sadb.s_acqfn = esp_send_acquire;
 533         sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
 534             espstack->ipsecesp_netstack);
 535 
 536         mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
 537 
 538         ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
 539         return (espstack);
 540 }
 541 
 542 /*
 543  * Destroy things for ESP at module unload time.
 544  */
 545 void
 546 ipsecesp_ddi_destroy(void)
 547 {
 548         netstack_unregister(NS_IPSECESP);
 549         taskq_destroy(esp_taskq);
 550 }
 551 
 552 /*
 553  * Destroy things for ESP for one stack instance
 554  */
 555 static void
 556 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
 557 {
 558         ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
 559 
 560         if (espstack->esp_pfkey_q != NULL) {
 561                 (void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
 562         }
 563         espstack->esp_sadb.s_acqfn = NULL;
 564         espstack->esp_sadb.s_acquire_timeout = NULL;
 565         sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
 566         ip_drop_unregister(&espstack->esp_dropper);
 567         mutex_destroy(&espstack->ipsecesp_param_lock);
 568         nd_free(&espstack->ipsecesp_g_nd);
 569 
 570         kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
 571         espstack->ipsecesp_params = NULL;
 572         kstat_delete_netstack(espstack->esp_ksp, stackid);
 573         espstack->esp_ksp = NULL;
 574         espstack->esp_kstats = NULL;
 575         kmem_free(espstack, sizeof (*espstack));
 576 }
 577 
 578 /*
 579  * ESP module open routine, which is here for keysock plumbing.
 580  * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
 581  * Days of export control, and fears that ESP would not be allowed
 582  * to be shipped at all by default.  Eventually, keysock should
 583  * either access AH and ESP via modstubs or krtld dependencies, or
 584  * perhaps be folded in with AH and ESP into a single IPsec/netsec
 585  * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
 586  */
 587 /* ARGSUSED */
 588 static int
 589 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
 590 {
 591         netstack_t              *ns;
 592         ipsecesp_stack_t        *espstack;
 593 
 594         if (secpolicy_ip_config(credp, B_FALSE) != 0)
 595                 return (EPERM);
 596 
 597         if (q->q_ptr != NULL)
 598                 return (0);  /* Re-open of an already open instance. */
 599 
 600         if (sflag != MODOPEN)
 601                 return (EINVAL);
 602 
 603         ns = netstack_find_by_cred(credp);
 604         ASSERT(ns != NULL);
 605         espstack = ns->netstack_ipsecesp;
 606         ASSERT(espstack != NULL);
 607 
 608         q->q_ptr = espstack;
 609         WR(q)->q_ptr = q->q_ptr;
 610 
 611         qprocson(q);
 612         return (0);
 613 }
 614 
 615 /*
 616  * ESP module close routine.
 617  */
 618 static int
 619 ipsecesp_close(queue_t *q)
 620 {
 621         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
 622 
 623         /*
 624          * Clean up q_ptr, if needed.
 625          */
 626         qprocsoff(q);
 627 
 628         /* Keysock queue check is safe, because of OCEXCL perimeter. */
 629 
 630         if (q == espstack->esp_pfkey_q) {
 631                 esp1dbg(espstack,
 632                     ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
 633                 espstack->esp_pfkey_q = NULL;
 634                 /* Detach qtimeouts. */
 635                 (void) quntimeout(q, espstack->esp_event);
 636         }
 637 
 638         netstack_rele(espstack->ipsecesp_netstack);
 639         return (0);
 640 }
 641 
 642 /*
 643  * Add a number of bytes to what the SA has protected so far.  Return
 644  * B_TRUE if the SA can still protect that many bytes.
 645  *
 646  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
 647  * any obtained peer SA.
 648  */
 649 static boolean_t
 650 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
 651 {
 652         ipsa_t *inassoc, *outassoc;
 653         isaf_t *bucket;
 654         boolean_t inrc, outrc, isv6;
 655         sadb_t *sp;
 656         int outhash;
 657         netstack_t              *ns = assoc->ipsa_netstack;
 658         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
 659 
 660         /* No peer?  No problem! */
 661         if (!assoc->ipsa_haspeer) {
 662                 return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
 663                     B_TRUE));
 664         }
 665 
 666         /*
 667          * Otherwise, we want to grab both the original assoc and its peer.
 668          * There might be a race for this, but if it's a real race, two
 669          * expire messages may occur.  We limit this by only sending the
 670          * expire message on one of the peers, we'll pick the inbound
 671          * arbitrarily.
 672          *
 673          * If we need tight synchronization on the peer SA, then we need to
 674          * reconsider.
 675          */
 676 
 677         /* Use address length to select IPv6/IPv4 */
 678         isv6 = (assoc->ipsa_addrfam == AF_INET6);
 679         sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
 680 
 681         if (inbound) {
 682                 inassoc = assoc;
 683                 if (isv6) {
 684                         outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
 685                             &inassoc->ipsa_dstaddr));
 686                 } else {
 687                         outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
 688                             &inassoc->ipsa_dstaddr));
 689                 }
 690                 bucket = &sp->sdb_of[outhash];
 691                 mutex_enter(&bucket->isaf_lock);
 692                 outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
 693                     inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
 694                     inassoc->ipsa_addrfam);
 695                 mutex_exit(&bucket->isaf_lock);
 696                 if (outassoc == NULL) {
 697                         /* Q: Do we wish to set haspeer == B_FALSE? */
 698                         esp0dbg(("esp_age_bytes: "
 699                             "can't find peer for inbound.\n"));
 700                         return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
 701                             bytes, B_TRUE));
 702                 }
 703         } else {
 704                 outassoc = assoc;
 705                 bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
 706                 mutex_enter(&bucket->isaf_lock);
 707                 inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
 708                     outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
 709                     outassoc->ipsa_addrfam);
 710                 mutex_exit(&bucket->isaf_lock);
 711                 if (inassoc == NULL) {
 712                         /* Q: Do we wish to set haspeer == B_FALSE? */
 713                         esp0dbg(("esp_age_bytes: "
 714                             "can't find peer for outbound.\n"));
 715                         return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
 716                             bytes, B_TRUE));
 717                 }
 718         }
 719 
 720         inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
 721         outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
 722 
 723         /*
 724          * REFRELE any peer SA.
 725          *
 726          * Because of the multi-line macro nature of IPSA_REFRELE, keep
 727          * them in { }.
 728          */
 729         if (inbound) {
 730                 IPSA_REFRELE(outassoc);
 731         } else {
 732                 IPSA_REFRELE(inassoc);
 733         }
 734 
 735         return (inrc && outrc);
 736 }
 737 
 738 /*
 739  * Do incoming NAT-T manipulations for packet.
 740  * Returns NULL if the mblk chain is consumed.
 741  */
 742 static mblk_t *
 743 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
 744 {
 745         ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
 746         tcpha_t *tcpha;
 747         udpha_t *udpha;
 748         /* Initialize to our inbound cksum adjustment... */
 749         uint32_t sum = assoc->ipsa_inbound_cksum;
 750 
 751         switch (ipha->ipha_protocol) {
 752         case IPPROTO_TCP:
 753                 tcpha = (tcpha_t *)(data_mp->b_rptr +
 754                     IPH_HDR_LENGTH(ipha));
 755 
 756 #define DOWN_SUM(x) (x) = ((x) & 0xFFFF) +   ((x) >> 16)
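                /*
                 * DOWN_SUM() folds the 32-bit sum back into 16 bits
                 * (ones-complement carry folding); it is applied twice to
                 * absorb any carry produced by the first fold.  The running
                 * sum starts from the SA's precomputed inbound adjustment
                 * (ipsa_inbound_cksum) above.
                 */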
 757                 sum += ~ntohs(tcpha->tha_sum) & 0xFFFF;
 758                 DOWN_SUM(sum);
 759                 DOWN_SUM(sum);
 760                 tcpha->tha_sum = ~htons(sum);
 761                 break;
 762         case IPPROTO_UDP:
 763                 udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
 764 
 765                 if (udpha->uha_checksum != 0) {
  766                         /* Adjust if the inbound one was not zero. */
 767                         sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
 768                         DOWN_SUM(sum);
 769                         DOWN_SUM(sum);
 770                         udpha->uha_checksum = ~htons(sum);
 771                         if (udpha->uha_checksum == 0)
 772                                 udpha->uha_checksum = 0xFFFF;
 773                 }
 774 #undef DOWN_SUM
 775                 break;
 776         case IPPROTO_IP:
 777                 /*
 778                  * This case is only an issue for self-encapsulated
 779                  * packets.  So for now, fall through.
 780                  */
 781                 break;
 782         }
 783         return (data_mp);
 784 }
 785 
 786 
 787 /*
 788  * Strip ESP header, check padding, and fix IP header.
  789  * Returns B_TRUE on success, B_FALSE if an error occurred.
 790  */
 791 static boolean_t
 792 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
 793     kstat_named_t **counter, ipsecesp_stack_t *espstack)
 794 {
 795         ipha_t *ipha;
 796         ip6_t *ip6h;
 797         uint_t divpoint;
 798         mblk_t *scratch;
 799         uint8_t nexthdr, padlen;
 800         uint8_t lastpad;
 801         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
 802         uint8_t *lastbyte;
 803 
 804         /*
 805          * Strip ESP data and fix IP header.
 806          *
 807          * XXX In case the beginning of esp_inbound() changes to not do a
 808          * pullup, this part of the code can remain unchanged.
 809          */
 810         if (isv4) {
 811                 ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
 812                 ipha = (ipha_t *)data_mp->b_rptr;
 813                 ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
 814                     IPH_HDR_LENGTH(ipha));
 815                 divpoint = IPH_HDR_LENGTH(ipha);
 816         } else {
 817                 ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
 818                 ip6h = (ip6_t *)data_mp->b_rptr;
 819                 divpoint = ip_hdr_length_v6(data_mp, ip6h);
 820         }
 821 
 822         scratch = data_mp;
 823         while (scratch->b_cont != NULL)
 824                 scratch = scratch->b_cont;
 825 
 826         ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
 827 
 828         /*
 829          * "Next header" and padding length are the last two bytes in the
 830          * ESP-protected datagram, thus the explicit - 1 and - 2.
 831          * lastpad is the last byte of the padding, which can be used for
 832          * a quick check to see if the padding is correct.
 833          */
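        /*
         * ESP trailer layout at this point, working back from b_wptr:
         *
         *   ... payload | padding (padlen bytes) | pad length | next header
         *                                          (1 byte)     (1 byte)
         */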
 834         lastbyte = scratch->b_wptr - 1;
 835         nexthdr = *lastbyte--;
 836         padlen = *lastbyte--;
 837 
 838         if (isv4) {
 839                 /* Fix part of the IP header. */
 840                 ipha->ipha_protocol = nexthdr;
 841                 /*
 842                  * Reality check the padlen.  The explicit - 2 is for the
 843                  * padding length and the next-header bytes.
 844                  */
 845                 if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
 846                     sizeof (esph_t) - ivlen) {
 847                         ESP_BUMP_STAT(espstack, bad_decrypt);
 848                         ipsec_rl_strlog(espstack->ipsecesp_netstack,
 849                             info.mi_idnum, 0, 0,
 850                             SL_ERROR | SL_WARN,
 851                             "Corrupt ESP packet (padlen too big).\n");
 852                         esp1dbg(espstack, ("padlen (%d) is greater than:\n",
 853                             padlen));
 854                         esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
 855                             "hdr - ivlen(%d) = %d.\n",
 856                             ntohs(ipha->ipha_length), ivlen,
 857                             (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
 858                             2 - sizeof (esph_t) - ivlen)));
 859                         *counter = DROPPER(ipss, ipds_esp_bad_padlen);
 860                         return (B_FALSE);
 861                 }
 862 
 863                 /*
 864                  * Fix the rest of the header.  The explicit - 2 is for the
 865                  * padding length and the next-header bytes.
 866                  */
 867                 ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
 868                     2 - sizeof (esph_t) - ivlen);
 869                 ipha->ipha_hdr_checksum = 0;
 870                 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
 871         } else {
 872                 if (ip6h->ip6_nxt == IPPROTO_ESP) {
 873                         ip6h->ip6_nxt = nexthdr;
 874                 } else {
 875                         ip_pkt_t ipp;
 876 
 877                         bzero(&ipp, sizeof (ipp));
 878                         (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
 879                             NULL);
 880                         if (ipp.ipp_dstopts != NULL) {
 881                                 ipp.ipp_dstopts->ip6d_nxt = nexthdr;
 882                         } else if (ipp.ipp_rthdr != NULL) {
 883                                 ipp.ipp_rthdr->ip6r_nxt = nexthdr;
 884                         } else if (ipp.ipp_hopopts != NULL) {
 885                                 ipp.ipp_hopopts->ip6h_nxt = nexthdr;
 886                         } else {
 887                                 /* Panic a DEBUG kernel. */
 888                                 ASSERT(ipp.ipp_hopopts != NULL);
 889                                 /* Otherwise, pretend it's IP + ESP. */
 890                                 cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
 891                                 ip6h->ip6_nxt = nexthdr;
 892                         }
 893                 }
 894 
 895                 if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
 896                     ivlen) {
 897                         ESP_BUMP_STAT(espstack, bad_decrypt);
 898                         ipsec_rl_strlog(espstack->ipsecesp_netstack,
 899                             info.mi_idnum, 0, 0,
 900                             SL_ERROR | SL_WARN,
 901                             "Corrupt ESP packet (v6 padlen too big).\n");
 902                         esp1dbg(espstack, ("padlen (%d) is greater than:\n",
 903                             padlen));
 904                         esp1dbg(espstack,
 905                             ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
 906                             "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
 907                             + sizeof (ip6_t)), ivlen,
 908                             (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
 909                             sizeof (esph_t) - ivlen)));
 910                         *counter = DROPPER(ipss, ipds_esp_bad_padlen);
 911                         return (B_FALSE);
 912                 }
  913 
 915                 /*
 916                  * Fix the rest of the header.  The explicit - 2 is for the
 917                  * padding length and the next-header bytes.  IPv6 is nice,
 918                  * because there's no hdr checksum!
 919                  */
 920                 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
 921                     2 - sizeof (esph_t) - ivlen);
 922         }
 923 
 924         if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
 925                 /*
 926                  * Weak padding check: compare last-byte to length, they
 927                  * should be equal.
 928                  */
 929                 lastpad = *lastbyte--;
 930 
 931                 if (padlen != lastpad) {
 932                         ipsec_rl_strlog(espstack->ipsecesp_netstack,
 933                             info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
 934                             "Corrupt ESP packet (lastpad != padlen).\n");
 935                         esp1dbg(espstack,
 936                             ("lastpad (%d) not equal to padlen (%d):\n",
 937                             lastpad, padlen));
 938                         ESP_BUMP_STAT(espstack, bad_padding);
 939                         *counter = DROPPER(ipss, ipds_esp_bad_padding);
 940                         return (B_FALSE);
 941                 }
 942 
 943                 /*
 944                  * Strong padding check: Check all pad bytes to see that
 945                  * they're ascending.  Go backwards using a descending counter
 946                  * to verify.  padlen == 1 is checked by previous block, so
 947                  * only bother if we've more than 1 byte of padding.
 948                  * Consequently, start the check one byte before the location
 949                  * of "lastpad".
 950                  */
 951                 if (espstack->ipsecesp_padding_check > 1) {
 952                         /*
 953                          * This assert may have to become an if and a pullup
 954                          * if we start accepting multi-dblk mblks. For now,
 955                          * though, any packet here will have been pulled up in
 956                          * esp_inbound.
 957                          */
 958                         ASSERT(MBLKL(scratch) >= lastpad + 3);
 959 
 960                         /*
 961                          * Use "--lastpad" because we already checked the very
 962                          * last pad byte previously.
 963                          */
 964                         while (--lastpad != 0) {
 965                                 if (lastpad != *lastbyte) {
 966                                         ipsec_rl_strlog(
 967                                             espstack->ipsecesp_netstack,
 968                                             info.mi_idnum, 0, 0,
 969                                             SL_ERROR | SL_WARN, "Corrupt ESP "
 970                                             "packet (bad padding).\n");
 971                                         esp1dbg(espstack,
 972                                             ("padding not in correct"
 973                                             " format:\n"));
 974                                         ESP_BUMP_STAT(espstack, bad_padding);
 975                                         *counter = DROPPER(ipss,
 976                                             ipds_esp_bad_padding);
 977                                         return (B_FALSE);
 978                                 }
 979                                 lastbyte--;
 980                         }
 981                 }
 982         }
 983 
 984         /* Trim off the padding. */
 985         ASSERT(data_mp->b_cont == NULL);
 986         data_mp->b_wptr -= (padlen + 2);
 987 
 988         /*
 989          * Remove the ESP header.
 990          *
 991          * The above assertions about data_mp's size will make this work.
 992          *
 993          * XXX  Question:  If I send up and get back a contiguous mblk,
 994          * would it be quicker to bcopy over, or keep doing the dupb stuff?
 995          * I go with copying for now.
 996          */
 997 
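        /*
         * The copy loops below slide the outer IP/IPv6 header (the first
         * "divpoint" bytes) forward by sizeof (esph_t) + ivlen bytes so it
         * abuts the decrypted payload, then advance b_rptr past the ESP
         * header and IV.  The first variant copies 32 bits at a time when
         * alignment permits.
         */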
 998         if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
 999             IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
1000                 uint8_t *start = data_mp->b_rptr;
1001                 uint32_t *src, *dst;
1002 
1003                 src = (uint32_t *)(start + divpoint);
1004                 dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
1005 
1006                 ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
1007                     IS_P2ALIGNED(src, sizeof (uint32_t)));
1008 
1009                 do {
1010                         src--;
1011                         dst--;
1012                         *dst = *src;
1013                 } while (src != (uint32_t *)start);
1014 
1015                 data_mp->b_rptr = (uchar_t *)dst;
1016         } else {
1017                 uint8_t *start = data_mp->b_rptr;
1018                 uint8_t *src, *dst;
1019 
1020                 src = start + divpoint;
1021                 dst = src + sizeof (esph_t) + ivlen;
1022 
1023                 do {
1024                         src--;
1025                         dst--;
1026                         *dst = *src;
1027                 } while (src != start);
1028 
1029                 data_mp->b_rptr = dst;
1030         }
1031 
1032         esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
1033         esp2dbg(espstack, (dump_msg(data_mp)));
1034 
1035         return (B_TRUE);
1036 }
1037 
1038 /*
1039  * Updating use times can be tricky business if the ipsa_haspeer flag is
1040  * set.  This function is called once in an SA's lifetime.
1041  *
1042  * Caller has to REFRELE "assoc" which is passed in.  This function has
1043  * to REFRELE any peer SA that is obtained.
1044  */
1045 static void
1046 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
1047 {
1048         ipsa_t *inassoc, *outassoc;
1049         isaf_t *bucket;
1050         sadb_t *sp;
1051         int outhash;
1052         boolean_t isv6;
1053         netstack_t              *ns = assoc->ipsa_netstack;
1054         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
1055 
1056         /* No peer?  No problem! */
1057         if (!assoc->ipsa_haspeer) {
1058                 sadb_set_usetime(assoc);
1059                 return;
1060         }
1061 
1062         /*
1063          * Otherwise, we want to grab both the original assoc and its peer.
1064          * There might be a race for this, but if it's a real race, the times
1065          * will be out-of-synch by at most a second, and since our time
1066          * granularity is a second, this won't be a problem.
1067          *
1068          * If we need tight synchronization on the peer SA, then we need to
1069          * reconsider.
1070          */
1071 
1072         /* Use address length to select IPv6/IPv4 */
1073         isv6 = (assoc->ipsa_addrfam == AF_INET6);
1074         sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1075 
1076         if (inbound) {
1077                 inassoc = assoc;
1078                 if (isv6) {
1079                         outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1080                             &inassoc->ipsa_dstaddr));
1081                 } else {
1082                         outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1083                             &inassoc->ipsa_dstaddr));
1084                 }
1085                 bucket = &sp->sdb_of[outhash];
1086                 mutex_enter(&bucket->isaf_lock);
1087                 outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1088                     inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1089                     inassoc->ipsa_addrfam);
1090                 mutex_exit(&bucket->isaf_lock);
1091                 if (outassoc == NULL) {
1092                         /* Q: Do we wish to set haspeer == B_FALSE? */
1093                         esp0dbg(("esp_set_usetime: "
1094                             "can't find peer for inbound.\n"));
1095                         sadb_set_usetime(inassoc);
1096                         return;
1097                 }
1098         } else {
1099                 outassoc = assoc;
1100                 bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1101                 mutex_enter(&bucket->isaf_lock);
1102                 inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1103                     outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1104                     outassoc->ipsa_addrfam);
1105                 mutex_exit(&bucket->isaf_lock);
1106                 if (inassoc == NULL) {
1107                         /* Q: Do we wish to set haspeer == B_FALSE? */
1108                         esp0dbg(("esp_set_usetime: "
1109                             "can't find peer for outbound.\n"));
1110                         sadb_set_usetime(outassoc);
1111                         return;
1112                 }
1113         }
1114 
1115         /* Update usetime on both. */
1116         sadb_set_usetime(inassoc);
1117         sadb_set_usetime(outassoc);
1118 
1119         /*
1120          * REFRELE any peer SA.
1121          *
1122          * Because of the multi-line macro nature of IPSA_REFRELE, keep
1123          * them in { }.
1124          */
1125         if (inbound) {
1126                 IPSA_REFRELE(outassoc);
1127         } else {
1128                 IPSA_REFRELE(inassoc);
1129         }
1130 }
1131 
1132 /*
1133  * Handle ESP inbound data for IPv4 and IPv6.
 1134  * Returns NULL if data_mp was consumed (e.g. dropped on an early replay
 1135  * failure); otherwise returns the result of esp_submit_req_inbound().
1136  */
1137 mblk_t *
1138 esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
1139 {
1140         esph_t *esph = (esph_t *)arg;
1141         ipsa_t *ipsa = ira->ira_ipsec_esp_sa;
1142         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
1143         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1144         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1145 
1146         /*
1147          * We may wish to check replay in-range-only here as an optimization.
1148          * Include the reality check of ipsa->ipsa_replay >
1149          * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1150          * where N == ipsa->ipsa_replay_wsize.
1151          *
1152          * Another check that may come here later is the "collision" check.
1153          * If legitimate packets flow quickly enough, this won't be a problem,
1154          * but collisions may cause authentication algorithm crunching to
1155          * take place when it doesn't need to.
1156          */
1157         if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1158                 ESP_BUMP_STAT(espstack, replay_early_failures);
1159                 IP_ESP_BUMP_STAT(ipss, in_discards);
1160                 ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1161                     DROPPER(ipss, ipds_esp_early_replay),
1162                     &espstack->esp_dropper);
1163                 BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1164                 return (NULL);
1165         }
1166 
1167         /*
1168          * Adjust the IP header's payload length to reflect the removal
1169          * of the ICV.
1170          */
1171         if (!(ira->ira_flags & IRAF_IS_IPV4)) {
1172                 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1173                 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1174                     ipsa->ipsa_mac_len);
1175         } else {
1176                 ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1177                 ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1178                     ipsa->ipsa_mac_len);
1179         }
1180 
1181         /* submit the request to the crypto framework */
1182         return (esp_submit_req_inbound(data_mp, ira, ipsa,
1183             (uint8_t *)esph - data_mp->b_rptr));
1184 }
1185 
1186 /*
1187  * Perform the really difficult work of inserting the proposed situation.
1188  * Called while holding the algorithm lock.
1189  */
1190 static void
1191 esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs,
1192     netstack_t *ns)
1193 {
1194         sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
1195         ipsec_action_t *ap;
1196         ipsec_prot_t *prot;
1197         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1198         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1199 
1200         ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1201 
1202         prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
1203         prop->sadb_prop_len = SADB_8TO64(sizeof (sadb_prop_t));
1204         *(uint32_t *)(&prop->sadb_prop_replay) = 0;      /* Quick zero-out! */
1205 
1206         prop->sadb_prop_replay = espstack->ipsecesp_replay_size;
1207 
1208         /*
1209          * Based upon algorithm properties, and what-not, prioritize a
1210          * proposal, based on the ordering of the ESP algorithms in the
1211          * alternatives in the policy rule or socket that was placed
1212          * in the acquire record.
1213          *
1214          * For each action in policy list
1215          *   Add combination.  If I've hit limit, return.
1216          */
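        /*
         * "comb" walks the array of sadb_comb_t entries immediately
         * following the sadb_prop_t (see the cast above); "combs" is the
         * caller-supplied limit on how many combinations to emit.
         */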
1217 
1218         for (ap = acqrec->ipsacq_act; ap != NULL;
1219             ap = ap->ipa_next) {
1220                 ipsec_alginfo_t *ealg = NULL;
1221                 ipsec_alginfo_t *aalg = NULL;
1222 
1223                 if (ap->ipa_act.ipa_type != IPSEC_POLICY_APPLY)
1224                         continue;
1225 
1226                 prot = &ap->ipa_act.ipa_apply;
1227 
1228                 if (!(prot->ipp_use_esp))
1229                         continue;
1230 
1231                 if (prot->ipp_esp_auth_alg != 0) {
1232                         aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
1233                             [prot->ipp_esp_auth_alg];
1234                         if (aalg == NULL || !ALG_VALID(aalg))
1235                                 continue;
1236                 }
1237 
1238                 ASSERT(prot->ipp_encr_alg > 0);
1239                 ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
1240                     [prot->ipp_encr_alg];
1241                 if (ealg == NULL || !ALG_VALID(ealg))
1242                         continue;
1243 
1244                 comb->sadb_comb_flags = 0;
1245                 comb->sadb_comb_reserved = 0;
1246                 comb->sadb_comb_encrypt = ealg->alg_id;
1247                 comb->sadb_comb_encrypt_minbits =
1248                     MAX(prot->ipp_espe_minbits, ealg->alg_ef_minbits);
1249                 comb->sadb_comb_encrypt_maxbits =
1250                     MIN(prot->ipp_espe_maxbits, ealg->alg_ef_maxbits);
1251 
1252                 if (aalg == NULL) {
1253                         comb->sadb_comb_auth = 0;
1254                         comb->sadb_comb_auth_minbits = 0;
1255                         comb->sadb_comb_auth_maxbits = 0;
1256                 } else {
1257                         comb->sadb_comb_auth = aalg->alg_id;
1258                         comb->sadb_comb_auth_minbits =
1259                             MAX(prot->ipp_espa_minbits, aalg->alg_ef_minbits);
1260                         comb->sadb_comb_auth_maxbits =
1261                             MIN(prot->ipp_espa_maxbits, aalg->alg_ef_maxbits);
1262                 }
1263 
1264                 /*
1265                  * The following may be based on algorithm
1266                  * properties, but in the meantime, we just pick
1267                  * some good, sensible numbers.  Key mgmt. can
1268                  * (and perhaps should) be the place to finalize
1269                  * such decisions.
1270                  */
1271 
1272                 /*
1273                  * No limits on allocations, since we really don't
1274                  * support that concept currently.
1275                  */
1276                 comb->sadb_comb_soft_allocations = 0;
1277                 comb->sadb_comb_hard_allocations = 0;
1278 
1279                 /*
 1280                  * These may want to come from the policy rule.
1281                  */
1282                 comb->sadb_comb_soft_bytes =
1283                     espstack->ipsecesp_default_soft_bytes;
1284                 comb->sadb_comb_hard_bytes =
1285                     espstack->ipsecesp_default_hard_bytes;
1286                 comb->sadb_comb_soft_addtime =
1287                     espstack->ipsecesp_default_soft_addtime;
1288                 comb->sadb_comb_hard_addtime =
1289                     espstack->ipsecesp_default_hard_addtime;
1290                 comb->sadb_comb_soft_usetime =
1291                     espstack->ipsecesp_default_soft_usetime;
1292                 comb->sadb_comb_hard_usetime =
1293                     espstack->ipsecesp_default_hard_usetime;
1294 
1295                 prop->sadb_prop_len += SADB_8TO64(sizeof (*comb));
1296                 if (--combs == 0)
1297                         break;  /* out of space.. */
1298                 comb++;
1299         }
1300 }
1301 
1302 /*
1303  * Prepare and actually send the SADB_ACQUIRE message to PF_KEY.
1304  */
1305 static void
1306 esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
1307 {
1308         uint_t combs;
1309         sadb_msg_t *samsg;
1310         sadb_prop_t *prop;
1311         mblk_t *pfkeymp, *msgmp;
1312         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1313         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1314 
1315         ESP_BUMP_STAT(espstack, acquire_requests);
1316 
1317         if (espstack->esp_pfkey_q == NULL) {
1318                 mutex_exit(&acqrec->ipsacq_lock);
1319                 return;
1320         }
1321 
1322         /* Set up ACQUIRE. */
1323         pfkeymp = sadb_setup_acquire(acqrec, SADB_SATYPE_ESP,
1324             ns->netstack_ipsec);
1325         if (pfkeymp == NULL) {
1326                 esp0dbg(("sadb_setup_acquire failed.\n"));
1327                 mutex_exit(&acqrec->ipsacq_lock);
1328                 return;
1329         }
1330         ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1331         combs = ipss->ipsec_nalgs[IPSEC_ALG_AUTH] *
1332             ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
1333         msgmp = pfkeymp->b_cont;
1334         samsg = (sadb_msg_t *)(msgmp->b_rptr);
1335 
1336         /* Insert proposal here. */
1337 
1338         prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
1339         esp_insert_prop(prop, acqrec, combs, ns);
1340         samsg->sadb_msg_len += prop->sadb_prop_len;
1341         msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
1342 
1343         mutex_exit(&ipss->ipsec_alg_lock);
1344 
1345         /*
1346          * Must mutex_exit() before sending PF_KEY message up, in
1347          * order to avoid recursive mutex_enter() if there are no registered
1348          * listeners.
1349          *
1350          * Once I've sent the message, I'm cool anyway.
1351          */
1352         mutex_exit(&acqrec->ipsacq_lock);
1353         if (extended != NULL) {
1354                 putnext(espstack->esp_pfkey_q, extended);
1355         }
1356         putnext(espstack->esp_pfkey_q, pfkeymp);
1357 }
1358 
1359 /* XXX refactor me */
1360 /*
1361  * Handle the SADB_GETSPI message.  Create a larval SA.
1362  */
1363 static void
1364 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1365 {
1366         ipsa_t *newbie, *target;
1367         isaf_t *outbound, *inbound;
1368         int rc, diagnostic;
1369         sadb_sa_t *assoc;
1370         keysock_out_t *kso;
1371         uint32_t newspi;
1372 
1373         /*
1374          * Randomly generate a proposed SPI value
1375          */
1376         (void) random_get_pseudo_bytes((uint8_t *)&newspi, sizeof (uint32_t));
1377         newbie = sadb_getspi(ksi, newspi, &diagnostic,
1378             espstack->ipsecesp_netstack);
1379 
1380         if (newbie == NULL) {
1381                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1382                     ksi->ks_in_serial);
1383                 return;
1384         } else if (newbie == (ipsa_t *)-1) {
1385                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1386                     ksi->ks_in_serial);
1387                 return;
1388         }
1389 
1390         /*
1391          * XXX - We may randomly collide.  We really should recover from this.
1392          *       Unfortunately, that could require spending way-too-much-time
1393          *       in here.  For now, let the user retry.
1394          */
1395 
1396         if (newbie->ipsa_addrfam == AF_INET6) {
1397                 outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1398                     *(uint32_t *)(newbie->ipsa_dstaddr));
1399                 inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1400                     newbie->ipsa_spi);
1401         } else {
1402                 ASSERT(newbie->ipsa_addrfam == AF_INET);
1403                 outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1404                     *(uint32_t *)(newbie->ipsa_dstaddr));
1405                 inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1406                     newbie->ipsa_spi);
1407         }
1408 
1409         mutex_enter(&outbound->isaf_lock);
1410         mutex_enter(&inbound->isaf_lock);
1411 
1412         /*
1413          * Check for collisions (i.e. did sadb_getspi() return with something
1414          * that already exists?).
1415          *
1416          * Try outbound first.  Even though SADB_GETSPI is traditionally
1417          * for inbound SAs, you never know what a user might do.
1418          */
1419         target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1420             newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1421         if (target == NULL) {
1422                 target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1423                     newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1424                     newbie->ipsa_addrfam);
1425         }
1426 
1427         /*
1428          * No collisions can exist elsewhere, and none can appear while
1429          * the inbound and outbound bucket locks are still held.
1430          */
1431 
1432         if (target != NULL) {
1433                 rc = EEXIST;
1434                 IPSA_REFRELE(target);
1435         } else {
1436                 /*
1437                  * sadb_insertassoc() also checks for collisions, so
1438                  * if there's a colliding entry, rc will be set
1439                  * to EEXIST.
1440                  */
1441                 rc = sadb_insertassoc(newbie, inbound);
1442                 newbie->ipsa_hardexpiretime = gethrestime_sec();
1443                 newbie->ipsa_hardexpiretime +=
1444                     espstack->ipsecesp_larval_timeout;
1445         }
1446 
1447         /*
1448          * Can exit outbound mutex.  Hold inbound until we're done
1449          * with newbie.
1450          */
1451         mutex_exit(&outbound->isaf_lock);
1452 
1453         if (rc != 0) {
1454                 mutex_exit(&inbound->isaf_lock);
1455                 IPSA_REFRELE(newbie);
1456                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1457                     SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1458                 return;
1459         }
1460 
1461 
1462         /* Can write here because I'm still holding the bucket lock. */
1463         newbie->ipsa_type = SADB_SATYPE_ESP;
1464 
1465         /*
1466          * Construct successful return message. We have one thing going
1467          * for us in PF_KEY v2.  That's the fact that
1468          *      sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1469          */
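             /*
              * A rough sketch of the in-place rewrite done below (extension
              * contents abbreviated):
              *
              *        request:  [sadb_msg][spirange ext: len, type, min, max]
              *        reply:    [sadb_msg][sa ext: len, type, spi, replay, ...]
              *
              * Only the fields written below change; the extension length
              * stays valid because the two structures are the same size.
              */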
1470         assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1471         assoc->sadb_sa_exttype = SADB_EXT_SA;
1472         assoc->sadb_sa_spi = newbie->ipsa_spi;
1473         *((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1474         mutex_exit(&inbound->isaf_lock);
1475 
1476         /* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1477         kso = (keysock_out_t *)ksi;
1478         kso->ks_out_len = sizeof (*kso);
1479         kso->ks_out_serial = ksi->ks_in_serial;
1480         kso->ks_out_type = KEYSOCK_OUT;
1481 
1482         /*
1483          * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1484          * from the esp_pfkey_q.
1485          */
1486         putnext(espstack->esp_pfkey_q, mp);
1487 }
1488 
1489 /*
1490  * Insert the ESP header mblk (esp_mp) into a packet:  split the chain at
1491  * "divpoint" (duplicating an mblk if needed) and splice esp_mp in between.
1492  */
1493 static boolean_t
1494 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1495     ipsecesp_stack_t *espstack)
1496 {
1497         mblk_t *split_mp = mp;
1498         uint_t wheretodiv = divpoint;
1499 
1500         while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1501                 wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1502                 split_mp = split_mp->b_cont;
1503                 ASSERT(split_mp != NULL);
1504         }
1505 
1506         if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1507                 mblk_t *scratch;
1508 
1509                 /* "scratch" is the 2nd half, split_mp is the first. */
1510                 scratch = dupb(split_mp);
1511                 if (scratch == NULL) {
1512                         esp1dbg(espstack,
1513                             ("esp_insert_esp: can't allocate scratch.\n"));
1514                         return (B_FALSE);
1515                 }
1516                 /* NOTE:  dupb() doesn't set b_cont appropriately. */
1517                 scratch->b_cont = split_mp->b_cont;
1518                 scratch->b_rptr += wheretodiv;
1519                 split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1520                 split_mp->b_cont = scratch;
1521         }
1522         /*
1523          * At this point, split_mp is exactly "wheretodiv" bytes long, and
1524          * holds the end of the pre-ESP part of the datagram.
1525          */
1526         esp_mp->b_cont = split_mp->b_cont;
1527         split_mp->b_cont = esp_mp;
1528 
1529         return (B_TRUE);
1530 }
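
     /*
      * A rough sketch of what esp_insert_esp() does to the chain when the
      * split point falls inside split_mp (so a dupb() is needed):
      *
      *        before:  [ ... ]-->[ split_mp: pre | post ]-->[ ... ]
      *        after:   [ ... ]-->[ pre ]-->[ esp_mp ]-->[ scratch: post ]-->[ ... ]
      *
      * "pre" is the first wheretodiv bytes of split_mp (the tail of the
      * cleartext headers); "post" is everything after the divide point,
      * which ends up following the inserted ESP header.
      */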
1531 
1532 /*
1533  * Section 7 of RFC 3947 says:
1534  *
1535  * 7.  Recovering from the Expiring NAT Mappings
1536  *
1537  *    There are cases where NAT box decides to remove mappings that are still
1538  *    alive (for example, when the keepalive interval is too long, or when the
1539  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1540  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1541  *    the other end to determine which IP and port addresses should be used.
1542  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1543  *    DoS attack possibility because the IP address or port of the other host
1544  *    will not change (it is not behind NAT).
1545  *
1546  *    Keepalives cannot be used for these purposes, as they are not
1547  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1548  *    used to detect whether the IP address or the port has changed.
1549  *
1550  * The following function will check an SA and its explicitly-set pair to see
1551  * if the NAT-T remote port matches the received packet (which must have
1552  * passed ESP authentication, see esp_in_done() for the caller context).  If
1553  * there is a mismatch, the SAs are updated.  Racing with a transmitting
1554  * thread is harmless:  at worst that thread emits one packet to the stale
1555  * port, which will most likely be dropped.
1556  *
1557  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1558  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1559  */
1560 #ifdef _LITTLE_ENDIAN
1561 #define FIRST_16(x) ((x) & 0xFFFF)
1562 #define NEXT_16(x) (((x) >> 16) & 0xFFFF)
1563 #else
1564 #define FIRST_16(x) (((x) >> 16) & 0xFFFF)
1565 #define NEXT_16(x) ((x) & 0xFFFF)
1566 #endif
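
     /*
      * For example (a sketch):  with both UDP ports equal to 4500 (0x1194),
      * the wire bytes of "ports" are 11 94 11 94.  On a little-endian
      * machine that uint32_t reads as 0x94119411 and FIRST_16() yields
      * 0x9411; on a big-endian machine it reads as 0x11941194 and
      * FIRST_16() yields 0x1194.  In both cases the result is htons(4500),
      * i.e. the port stays in network byte order, which is why it is
      * compared against htons(IPPORT_IKE_NATT) below.
      */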
1567 static void
1568 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1569 {
1570         uint16_t remote = FIRST_16(ports);
1571         ipsa_t *outbound_peer;
1572         isaf_t *bucket;
1573         ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1574 
1575         /* We found a conn_t, therefore local != 0. */
1576         ASSERT(NEXT_16(ports) != 0);
1577         /* Assume an IPv4 SA. */
1578         ASSERT(assoc->ipsa_addrfam == AF_INET);
1579 
1580         /*
1581          * On-the-wire rport == 0 means something's very wrong.
1582          * An unpaired SA is also useless to us.
1583          * If we are behind a NAT ourselves, don't bother.
1584          * A zero remote NAT port defaults to 4500, so check that too.
1585          * And, of course, if the ports already match, we don't need to
1586          * bother.
1587          */
1588         if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1589             (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1590             (assoc->ipsa_remote_nat_port == 0 &&
1591             remote == htons(IPPORT_IKE_NATT)) ||
1592             remote == assoc->ipsa_remote_nat_port)
1593                 return;
1594 
1595         /* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1596         bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1597             assoc->ipsa_srcaddr[0]);
1598         mutex_enter(&bucket->isaf_lock);
1599         outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1600             assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1601         mutex_exit(&bucket->isaf_lock);
1602 
1603         /* We probably lost a race to a deleting or expiring thread. */
1604         if (outbound_peer == NULL)
1605                 return;
1606 
1607         /*
1608          * Hold the mutexes for both SAs so we don't race another inbound
1609          * thread.  Lock-entry order shouldn't matter, since all other
1610          * per-ipsa locks are individually held-then-released.
1611          *
1612          * Luckily, this has nothing to do with the remote-NAT address,
1613          * so we don't have to re-scribble the cached-checksum differential.
1614          */
1615         mutex_enter(&outbound_peer->ipsa_lock);
1616         mutex_enter(&assoc->ipsa_lock);
1617         outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1618             remote;
1619         mutex_exit(&assoc->ipsa_lock);
1620         mutex_exit(&outbound_peer->ipsa_lock);
1621         IPSA_REFRELE(outbound_peer);
1622         ESP_BUMP_STAT(espstack, sa_port_renumbers);
1623 }

1624 /*
1625  * Finish processing of an inbound ESP packet after processing by the
1626  * crypto framework.
1627  * - Remove the ESP header.
1628  * - Send packet back to IP.
1629  * If authentication was performed on the packet, this function is called
1630  * only if the authentication succeeded.
1631  * On success returns the processed mblk chain; on failure returns NULL
1632  * after dropping/freeing the mblk chain data_mp.
1633  */
1634 static mblk_t *
1635 esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
1636 {
1637         ipsa_t *assoc;
1638         uint_t espstart;
1639         uint32_t ivlen = 0;
1640         uint_t processed_len;
1641         esph_t *esph;
1642         kstat_named_t *counter;
1643         boolean_t is_natt;
1644         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
1645         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1646         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1647 
1648         assoc = ira->ira_ipsec_esp_sa;
1649         ASSERT(assoc != NULL);
1650 
1651         is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1652 
1653         /* get the pointer to the ESP header */
1654         if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1655                 /* authentication-only ESP */
1656                 espstart = ic->ic_crypto_data.cd_offset;
1657                 processed_len = ic->ic_crypto_data.cd_length;
1658         } else {
1659                 /* encryption present */
1660                 ivlen = assoc->ipsa_iv_len;
1661                 if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1662                         /* encryption-only ESP */
1663                         espstart = ic->ic_crypto_data.cd_offset -
1664                             sizeof (esph_t) - assoc->ipsa_iv_len;
1665                         processed_len = ic->ic_crypto_data.cd_length +
1666                             ivlen;
1667                 } else {
1668                         /* encryption with authentication */
1669                         espstart = ic->ic_crypto_dual_data.dd_offset1;
1670                         processed_len = ic->ic_crypto_dual_data.dd_len2 +
1671                             ivlen;
1672                 }
1673         }
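
             /*
              * A rough sketch of the cases above (offsets relative to the
              * start of data_mp):
              *
              *        auth only:     cd_offset points at the ESP header
              *        encr only:     cd_offset points at the ciphertext, so
              *                       back up over the IV and the ESP header
              *        encr + auth:   dd_offset1 points at the ESP header
              *
              * In every case espstart ends up at the ESP header itself.
              */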
1674 
1675         esph = (esph_t *)(data_mp->b_rptr + espstart);
1676 
1677         if (assoc->ipsa_auth_alg != IPSA_AALG_NONE ||
1678             (assoc->ipsa_flags & IPSA_F_COMBINED)) {
1679                 /*
1680                  * Authentication passed if we reach this point.
1681                  * Packets with authentication will have the ICV
1682                  * after the crypto data. Adjust b_wptr before
1683                  * making padlen checks.
1684                  */
1685                 ESP_BUMP_STAT(espstack, good_auth);
1686                 data_mp->b_wptr -= assoc->ipsa_mac_len;
1687 
1688                 /*
1689                  * Check replay window here!
1690                  * For right now, assume keysock will set the replay window
1691                  * size to zero for SAs that have an unspecified sender.
1692                  * This may change...
1693                  */
1694 
1695                 if (!sadb_replay_check(assoc, esph->esph_replay)) {
1696                         /*
1697                          * Log the event.  Do not print the replay failure
1698                          * number, or else syslog cannot collate the error
1699                          * messages.  Printing the replay number that
1700                          * failed would also open a denial-of-service
1701                          * hole.
1702                          */
1703                         ipsec_assocfailure(info.mi_idnum, 0, 0,
1704                             SL_ERROR | SL_WARN,
1705                             "Replay failed for ESP spi 0x%x, dst %s.\n",
1706                             assoc->ipsa_spi, assoc->ipsa_dstaddr,
1707                             assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1708                         ESP_BUMP_STAT(espstack, replay_failures);
1709                         counter = DROPPER(ipss, ipds_esp_replay);
1710                         goto drop_and_bail;
1711                 }
1712 
1713                 if (is_natt) {
1714                         ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS);
1715                         ASSERT(ira->ira_esp_udp_ports != 0);
1716                         esp_port_freshness(ira->ira_esp_udp_ports, assoc);
1717                 }
1718         }
1719 
1720         esp_set_usetime(assoc, B_TRUE);
1721 
1722         if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1723                 /* The ipsa has hit hard expiration, LOG and AUDIT. */
1724                 ipsec_assocfailure(info.mi_idnum, 0, 0,
1725                     SL_ERROR | SL_WARN,
1726                     "ESP association 0x%x, dst %s had bytes expire.\n",
1727                     assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1728                     espstack->ipsecesp_netstack);
1729                 ESP_BUMP_STAT(espstack, bytes_expired);
1730                 counter = DROPPER(ipss, ipds_esp_bytes_expire);
1731                 goto drop_and_bail;
1732         }
1733 
1734         /*
1735          * Remove the ESP header and padding from the packet.  This is
1736          * the expected (predict-taken) path.
1737          */
1738 
1739         if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4),
1740             ivlen, &counter, espstack)) {
1741 
1742                 if (is_system_labeled() && assoc->ipsa_tsl != NULL) {
1743                         if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
1744                                 ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1745                                     DROPPER(ipss, ipds_ah_nomem),
1746                                     &espstack->esp_dropper);
1747                                 BUMP_MIB(ira->ira_ill->ill_ip_mib,
1748                                     ipIfStatsInDiscards);
1749                                 return (NULL);
1750                         }
1751                 }
1752                 if (is_natt)
1753                         return (esp_fix_natt_checksums(data_mp, assoc));
1754 
1755                 return (data_mp);
1756         }
1757 
1758         esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1759 drop_and_bail:
1760         IP_ESP_BUMP_STAT(ipss, in_discards);
1761         ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter,
1762             &espstack->esp_dropper);
1763         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1764         return (NULL);
1765 }
1766 
1767 /*
1768  * Called upon failing the inbound ICV check. The message passed as
1769  * argument is freed.
1770  */
1771 static void
1772 esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira)
1773 {
1774         ipsa_t          *assoc = ira->ira_ipsec_esp_sa;
1775         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
1776         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1777         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1778 
1779         /*
1780          * Log the event.  Don't print to the console, to block a
1781          * potential denial-of-service attack.
1782          */
1783         ESP_BUMP_STAT(espstack, bad_auth);
1784 
1785         ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1786             "ESP Authentication failed for spi 0x%x, dst %s.\n",
1787             assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1788             espstack->ipsecesp_netstack);
1789 
1790         IP_ESP_BUMP_STAT(ipss, in_discards);
1791         ip_drop_packet(mp, B_TRUE, ira->ira_ill,
1792             DROPPER(ipss, ipds_esp_bad_auth),
1793             &espstack->esp_dropper);
1794 }
1795 
1796 
1797 /*
1798  * Invoked for outbound packets after ESP processing.  If the packet
1799  * also requires AH, performs the AH SA selection and AH processing.
1800  *
1801  * Returns data_mp if AH processing was not needed or was completed
1802  * synchronously.  Returns NULL if data_mp was consumed or queued, for
1803  * example because an AH SA had to be acquired or because AH processing
1804  * went asynchronous.
1805  */
1806 static mblk_t *
1807 esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa)
1808 {
1809         ipsec_action_t *ap;
1810 
1811         ap = ixa->ixa_ipsec_action;
1812         if (ap == NULL) {
1813                 ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
1814                 ap = pp->ipsp_act;
1815         }
1816 
1817         if (!ap->ipa_want_ah)
1818                 return (data_mp);
1819 
1820         /*
1821          * Normally the AH SA would have already been put in place
1822          * but it could have been flushed so we need to look for it.
1823          */
1824         if (ixa->ixa_ipsec_ah_sa == NULL) {
1825                 if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
1826                         sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE);
1827                         return (NULL);
1828                 }
1829         }
1830         ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
1831 
1832         data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa);
1833         return (data_mp);
1834 }
1835 
1836 
1837 /*
1838  * Kernel crypto framework callback invoked after completion of async
1839  * crypto requests for outbound packets.
1840  */
1841 static void
1842 esp_kcf_callback_outbound(void *arg, int status)
1843 {
1844         mblk_t          *mp = (mblk_t *)arg;
1845         mblk_t          *async_mp;
1846         netstack_t      *ns;
1847         ipsec_stack_t   *ipss;
1848         ipsecesp_stack_t *espstack;
1849         mblk_t          *data_mp;
1850         ip_xmit_attr_t  ixas;
1851         ipsec_crypto_t  *ic;
1852         ill_t           *ill;
1853 
1854         /*
1855          * First remove the ipsec_crypto_t mblk
1856          * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1857          */
1858         async_mp = ipsec_remove_crypto_data(mp, &ic);
1859         ASSERT(async_mp != NULL);
1860 
1861         /*
1862          * Extract the ip_xmit_attr_t from the first mblk.
1863          * Verify that the netstack and ill are still around; they could
1864          * have vanished while the KCF was doing its work.
1865          * On successful return we have a nce_t and the ill/ipst can't
1866          * disappear until we do the nce_refrele in ixa_cleanup.
1867          */
1868         data_mp = async_mp->b_cont;
1869         async_mp->b_cont = NULL;
1870         if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
1871                 /* The ill/ipst disappeared on us; ixa_ipst may be NULL. */
1872                 /* Only bump stats if a nce (and hence an ill) survived. */
1873                 if (ixas.ixa_nce != NULL) {
1874                         ill = ixas.ixa_nce->nce_ill;
1875                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1876                         ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
1877                 }
1878                 freemsg(data_mp);
1879                 goto done;
1880         }
1881         ns = ixas.ixa_ipst->ips_netstack;
1882         espstack = ns->netstack_ipsecesp;
1883         ipss = ns->netstack_ipsec;
1884         ill = ixas.ixa_nce->nce_ill;
1885 
1886         if (status == CRYPTO_SUCCESS) {
1887                 /*
1888                  * If an ICV was computed, it was stored by the
1889                  * crypto framework at the end of the packet.
1890                  */
1891                 ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1892 
1893                 esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE);
1894                 /* NAT-T packet. */
1895                 if (IPH_HDR_VERSION(ipha) == IP_VERSION &&
1896                     ipha->ipha_protocol == IPPROTO_UDP)
1897                         esp_prepare_udp(ns, data_mp, ipha);
1898 
1899                 /* do AH processing if needed */
1900                 data_mp = esp_do_outbound_ah(data_mp, &ixas);
1901                 if (data_mp == NULL)
1902                         goto done;
1903 
1904                 (void) ip_output_post_ipsec(data_mp, &ixas);
1905         } else {
1906                 /* Outbound shouldn't see invalid MAC */
1907                 ASSERT(status != CRYPTO_INVALID_MAC);
1908 
1909                 esp1dbg(espstack,
1910                     ("esp_kcf_callback_outbound: crypto failed with 0x%x\n",
1911                     status));
1912                 ESP_BUMP_STAT(espstack, crypto_failures);
1913                 ESP_BUMP_STAT(espstack, out_discards);
1914                 ip_drop_packet(data_mp, B_FALSE, ill,
1915                     DROPPER(ipss, ipds_esp_crypto_failed),
1916                     &espstack->esp_dropper);
1917                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1918         }
1919 done:
1920         ixa_cleanup(&ixas);
1921         (void) ipsec_free_crypto_data(mp);
1922 }
1923 
1924 /*
1925  * Kernel crypto framework callback invoked after completion of async
1926  * crypto requests for inbound packets.
1927  */
1928 static void
1929 esp_kcf_callback_inbound(void *arg, int status)
1930 {
1931         mblk_t          *mp = (mblk_t *)arg;
1932         mblk_t          *async_mp;
1933         netstack_t      *ns;
1934         ipsecesp_stack_t *espstack;
1935         ipsec_stack_t   *ipss;
1936         mblk_t          *data_mp;
1937         ip_recv_attr_t  iras;
1938         ipsec_crypto_t  *ic;
1939 
1940         /*
1941          * First remove the ipsec_crypto_t mblk
1942          * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1943          */
1944         async_mp = ipsec_remove_crypto_data(mp, &ic);
1945         ASSERT(async_mp != NULL);
1946 
1947         /*
1948          * Extract the ip_recv_attr_t from the first mblk.
1949          * Verify that the netstack and ill are still around; they could
1950          * have vanished while the KCF was doing its work.
1951          */
1952         data_mp = async_mp->b_cont;
1953         async_mp->b_cont = NULL;
1954         if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
1955                 /* The ill or ip_stack_t disappeared on us */
1956                 ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
1957                 freemsg(data_mp);
1958                 goto done;
1959         }
1960 
1961         ns = iras.ira_ill->ill_ipst->ips_netstack;
1962         espstack = ns->netstack_ipsecesp;
1963         ipss = ns->netstack_ipsec;
1964 
1965         if (status == CRYPTO_SUCCESS) {
1966                 data_mp = esp_in_done(data_mp, &iras, ic);
1967                 if (data_mp == NULL)
1968                         goto done;
1969 
1970                 /* finish IPsec processing */
1971                 ip_input_post_ipsec(data_mp, &iras);
1972         } else if (status == CRYPTO_INVALID_MAC) {
1973                 esp_log_bad_auth(data_mp, &iras);
1974         } else {
1975                 esp1dbg(espstack,
1976                     ("esp_kcf_callback_inbound: crypto failed with 0x%x\n",
1977                     status));
1978                 ESP_BUMP_STAT(espstack, crypto_failures);
1979                 IP_ESP_BUMP_STAT(ipss, in_discards);
1980                 ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
1981                     DROPPER(ipss, ipds_esp_crypto_failed),
1982                     &espstack->esp_dropper);
1983                 BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1984         }
1985 done:
1986         ira_cleanup(&iras, B_TRUE);
1987         (void) ipsec_free_crypto_data(mp);
1988 }
1989 
1990 /*
1991  * Invoked on crypto framework failure during inbound and outbound processing.
1992  */
1993 static void
1994 esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
1995     ill_t *ill, ipsecesp_stack_t *espstack)
1996 {
1997         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
1998 
1999         esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
2000             is_inbound ? "inbound" : "outbound", kef_rc));
2001         ip_drop_packet(data_mp, is_inbound, ill,
2002             DROPPER(ipss, ipds_esp_crypto_failed),
2003             &espstack->esp_dropper);
2004         ESP_BUMP_STAT(espstack, crypto_failures);
2005         if (is_inbound)
2006                 IP_ESP_BUMP_STAT(ipss, in_discards);
2007         else
2008                 ESP_BUMP_STAT(espstack, out_discards);
2009 }
2010 
2011 /*
2012  * A statement-equivalent macro; _cr MUST point to a modifiable
2013  * crypto_call_req_t.
2014  */
2015 #define ESP_INIT_CALLREQ(_cr, _mp, _callback) do {                     \
2016         (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE;        \
2017         (_cr)->cr_callback_arg = (_mp);                                 \
2018         (_cr)->cr_callback_func = (_callback);                          \
2019 } while (0)

2020 #define ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) {                      \
2021         (mac)->cd_format = CRYPTO_DATA_RAW;                          \
2022         (mac)->cd_offset = 0;                                                \
2023         (mac)->cd_length = icvlen;                                   \
2024         (mac)->cd_raw.iov_base = (char *)icvbuf;                     \
2025         (mac)->cd_raw.iov_len = icvlen;                                      \
2026 }
2027 
2028 #define ESP_INIT_CRYPTO_DATA(data, mp, off, len) {                      \
2029         if (MBLKL(mp) >= (len) + (off)) {                            \
2030                 (data)->cd_format = CRYPTO_DATA_RAW;                 \
2031                 (data)->cd_raw.iov_base = (char *)(mp)->b_rptr;           \
2032                 (data)->cd_raw.iov_len = MBLKL(mp);                  \
2033                 (data)->cd_offset = off;                             \
2034         } else {                                                        \
2035                 (data)->cd_format = CRYPTO_DATA_MBLK;                        \
2036                 (data)->cd_mp = mp;                                  \
2037                 (data)->cd_offset = off;                             \
2038         }                                                               \
2039         (data)->cd_length = len;                                     \
2040 }
2041 
2042 #define ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) {   \
2043         (data)->dd_format = CRYPTO_DATA_MBLK;                                \
2044         (data)->dd_mp = mp;                                          \
2045         (data)->dd_len1 = len1;                                              \
2046         (data)->dd_offset1 = off1;                                   \
2047         (data)->dd_len2 = len2;                                              \
2048         (data)->dd_offset2 = off2;                                   \
2049 }
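
     /*
      * A sketch of how the macros above feed the crypto framework (not
      * normative):  ESP_INIT_CRYPTO_DATA() describes its input as
      * CRYPTO_DATA_RAW when the region of interest fits inside one mblk
      * and falls back to CRYPTO_DATA_MBLK (letting the provider walk
      * b_cont) when it does not; ESP_INIT_CRYPTO_MAC() always describes
      * the ICV as a small raw buffer; and ESP_INIT_CRYPTO_DUAL_DATA()
      * carries two offset/length pairs so one call can encrypt (or
      * decrypt) and authenticate different ranges of the same mblk chain.
      */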
2050 
2051 /*
2052  * Returns esp_mp if the request completed successfully.  Returns
2053  * NULL if it failed (and increments InDiscards) or if it is pending.
2054  */
2055 static mblk_t *
2056 esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira,
2057     ipsa_t *assoc, uint_t esph_offset)
2058 {
2059         uint_t auth_offset, msg_len, auth_len;
2060         crypto_call_req_t call_req, *callrp;
2061         mblk_t *mp;
2062         esph_t *esph_ptr;
2063         int kef_rc;
2064         uint_t icv_len = assoc->ipsa_mac_len;
2065         crypto_ctx_template_t auth_ctx_tmpl;
2066         boolean_t do_auth, do_encr, force;
2067         uint_t encr_offset, encr_len;
2068         uint_t iv_len = assoc->ipsa_iv_len;
2069         crypto_ctx_template_t encr_ctx_tmpl;
2070         ipsec_crypto_t  *ic, icstack;
2071         uchar_t *iv_ptr;
2072         netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
2073         ipsec_stack_t *ipss = ns->netstack_ipsec;
2074         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2075 
2076         do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2077         do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2078         force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2079 
2080 #ifdef IPSEC_LATENCY_TEST
2081         kef_rc = CRYPTO_SUCCESS;
2082 #else
2083         kef_rc = CRYPTO_FAILED;
2084 #endif
2085 
2086         /*
2087          * An inbound packet is of the form:
2088          * [IP,options,ESP,IV,data,pad,ICV]
2089          */
2090         esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2091         iv_ptr = (uchar_t *)(esph_ptr + 1);
2092         /* Packet length starting at IP header ending after ESP ICV. */
2093         msg_len = MBLKL(esp_mp);
2094 
2095         encr_offset = esph_offset + sizeof (esph_t) + iv_len;
2096         encr_len = msg_len - encr_offset;
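
             /*
              * So, roughly (byte offsets from esp_mp->b_rptr):
              *
              *        esph_offset     start of the ESP header, as passed in
              *        encr_offset     esph_offset + sizeof (esph_t) + iv_len,
              *                        i.e. the first encrypted byte
              *        encr_len        from there to the end of the mblk,
              *                        which still includes any trailing ICV
              */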
2097 
2098         /*
2099          * Counter mode algs need a nonce.  This is set up in
2100          * sadb_common_add().  If for some reason we are using an SA
2101          * which does not have a nonce, then we must fail here.
2102          */
2103         if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2104             (assoc->ipsa_nonce == NULL)) {
2105                 ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill,
2106                     DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2107                 return (NULL);
2108         }
2109 
2110         if (force) {
2111                 /* We are doing asynch; allocate mblks to hold state */
2112                 if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
2113                     (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2114                         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
2115                         ip_drop_input("ipIfStatsInDiscards", esp_mp,
2116                             ira->ira_ill);
2117                         return (NULL);
2118                 }
2119                 linkb(mp, esp_mp);
2120                 callrp = &call_req;
2121                 ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound);
2122         } else {
2123                 /*
2124                  * If we know we are going to do sync then ipsec_crypto_t
2125                  * should be on the stack.
2126                  */
2127                 ic = &icstack;
2128                 bzero(ic, sizeof (*ic));
2129                 callrp = NULL;
2130         }
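
             /*
              * From here on the same code drives both paths (a sketch):
              * with callrp == NULL the crypto calls below run synchronously
              * and their return value is handled by the switch at the end;
              * with callrp != NULL they are expected to return
              * CRYPTO_QUEUED (CRYPTO_ALWAYS_QUEUE is set) and
              * esp_kcf_callback_inbound() finishes the job, while immediate
              * errors still fall through to the switch.
              */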
2131 
2132         if (do_auth) {
2133                 /* authentication context template */
2134                 IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2135                     auth_ctx_tmpl);
2136 
2137                 /* ICV to be verified */
2138                 ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2139                     icv_len, esp_mp->b_wptr - icv_len);
2140 
2141                 /* authentication starts at the ESP header */
2142                 auth_offset = esph_offset;
2143                 auth_len = msg_len - auth_offset - icv_len;
2144                 if (!do_encr) {
2145                         /* authentication only */
2146                         /* initialize input data argument */
2147                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2148                             esp_mp, auth_offset, auth_len);
2149 
2150                         /* call the crypto framework */
2151                         kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
2152                             &ic->ic_crypto_data,
2153                             &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2154                             &ic->ic_crypto_mac, callrp);
2155                 }
2156         }
2157 
2158         if (do_encr) {
2159                 /* encryption template */
2160                 IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2161                     encr_ctx_tmpl);
2162 
2163                 /* Call the nonce update function. Also passes in IV */
2164                 (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len,
2165                     iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2166 
2167                 if (!do_auth) {
2168                         /* decryption only */
2169                         /* initialize input data argument */
2170                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2171                             esp_mp, encr_offset, encr_len);
2172 
2173                         /* call the crypto framework */
2174                         kef_rc = crypto_decrypt((crypto_mechanism_t *)
2175                             &ic->ic_cmm, &ic->ic_crypto_data,
2176                             &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2177                             NULL, callrp);
2178                 }
2179         }
2180 
2181         if (do_auth && do_encr) {
2182                 /* dual operation */
2183                 /* initialize input data argument */
2184                 ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2185                     esp_mp, auth_offset, auth_len,
2186                     encr_offset, encr_len - icv_len);
2187 
2188                 /* specify IV */
2189                 ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2190 
2191                 /* call the framework */
2192                 kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
2193                     &assoc->ipsa_emech, &ic->ic_crypto_dual_data,
2194                     &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
2195                     auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac,
2196                     NULL, callrp);
2197         }
2198 
2199         switch (kef_rc) {
2200         case CRYPTO_SUCCESS:
2201                 ESP_BUMP_STAT(espstack, crypto_sync);
2202                 esp_mp = esp_in_done(esp_mp, ira, ic);
2203                 if (force) {
2204                         /* Free mp after we are done with ic */
2205                         mp = ipsec_free_crypto_data(mp);
2206                         (void) ip_recv_attr_free_mblk(mp);
2207                 }
2208                 return (esp_mp);
2209         case CRYPTO_QUEUED:
2210                 /* esp_kcf_callback_inbound() will be invoked on completion */
2211                 ESP_BUMP_STAT(espstack, crypto_async);
2212                 return (NULL);
2213         case CRYPTO_INVALID_MAC:
2214                 if (force) {
2215                         mp = ipsec_free_crypto_data(mp);
2216                         esp_mp = ip_recv_attr_free_mblk(mp);
2217                 }
2218                 ESP_BUMP_STAT(espstack, crypto_sync);
2219                 BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
2220                 esp_log_bad_auth(esp_mp, ira);
2221                 /* esp_mp was passed to ip_drop_packet */
2222                 return (NULL);
2223         }
2224 
2225         if (force) {
2226                 mp = ipsec_free_crypto_data(mp);
2227                 esp_mp = ip_recv_attr_free_mblk(mp);
2228         }
2229         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
2230         esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack);
2231         /* esp_mp was passed to ip_drop_packet */
2232         return (NULL);
2233 }
2234 
2235 /*
2236  * Compute the IP and UDP checksums -- common code for both keepalives and
2237  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2238  * uses mblk-insertion to insert the UDP header.
2239  * TODO - If there is an easy way to prep a packet for HW checksums, make
2240  * it happen here.
2241  * Note that this is used both before calling ip_output_simple and in
2242  * the esp datapath.  The former could use IXAF_SET_ULP_CKSUM but not
2243  * the latter.
2244  */
2245 static void
2246 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2247 {
2248         int offset;
2249         uint32_t cksum;
2250         uint16_t *arr;
2251         mblk_t *udpmp = mp;
2252         uint_t hlen = IPH_HDR_LENGTH(ipha);
2253 
2254         ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2255 
2256         ipha->ipha_hdr_checksum = 0;
2257         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2258 
2259         if (ns->netstack_udp->us_do_checksum) {
2260                 ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2261                 /* arr points to the IP header. */
2262                 arr = (uint16_t *)ipha;
2263                 IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2264                 IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes,
2265                     ntohs(htons(ipha->ipha_length) - hlen));
2266                 /* arr[6-9] are the IP addresses. */
2267                 cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2268                     ntohs(htons(ipha->ipha_length) - hlen);
2269                 cksum = IP_CSUM(mp, hlen, cksum);
2270                 offset = hlen + UDP_CHECKSUM_OFFSET;
2271                 while (offset >= MBLKL(udpmp)) {
2272                         offset -= MBLKL(udpmp);
2273                         udpmp = udpmp->b_cont;
2274                 }
2275                 /* arr points to the UDP header's checksum field. */
2276                 arr = (uint16_t *)(udpmp->b_rptr + offset);
2277                 *arr = cksum;
2278         }
2279 }
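
     /*
      * A rough sketch of the arithmetic above:  the IPv4 header is viewed
      * as an array of 16-bit words, so arr[6..9] are the source and
      * destination addresses.  The pseudo-header seed is
      *
      *        IP_UDP_CSUM_COMP + src + dst + (UDP length)
      *
      * and IP_CSUM() then folds in the UDP header and payload starting at
      * offset hlen.  The result is stored straight into the UDP checksum
      * field, which may live in a later mblk -- hence the offset walk.
      */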
2280 
2281 /*
2282  * Taskq handler so we can send the NAT-T keepalive on a separate thread.
2283  */
2284 static void
2285 actually_send_keepalive(void *arg)
2286 {
2287         mblk_t *mp = (mblk_t *)arg;
2288         ip_xmit_attr_t ixas;
2289         netstack_t      *ns;
2290         netstackid_t    stackid;
2291 
2292         stackid = (netstackid_t)(uintptr_t)mp->b_prev;
2293         mp->b_prev = NULL;
2294         ns = netstack_find_by_stackid(stackid);
2295         if (ns == NULL) {
2296                 /* Disappeared */
2297                 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2298                 freemsg(mp);
2299                 return;
2300         }
2301 
2302         bzero(&ixas, sizeof (ixas));
2303         ixas.ixa_zoneid = ALL_ZONES;
2304         ixas.ixa_cred = kcred;
2305         ixas.ixa_cpid = NOPID;
2306         ixas.ixa_tsl = NULL;
2307         ixas.ixa_ipst = ns->netstack_ip;
2308         /* No ULP checksum; done by esp_prepare_udp */
2309         ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
2310 
2311         (void) ip_output_simple(mp, &ixas);
2312         ixa_cleanup(&ixas);
2313         netstack_rele(ns);
2314 }
2315 
2316 /*
2317  * Send a one-byte UDP NAT-T keepalive.
2318  */
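     /*
      * For reference, the packet built below is (a sketch):
      *
      *        [ 20-byte IPv4 header ][ 8-byte UDP header ][ 1 byte of 0xFF ]
      *
      * so ipha_length covers 29 bytes, uha_length covers 9, and both ports
      * default to IPPORT_IKE_NATT (4500) unless the SA recorded explicit
      * NAT-T ports.
      */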
2319 void
2320 ipsecesp_send_keepalive(ipsa_t *assoc)
2321 {
2322         mblk_t          *mp;
2323         ipha_t          *ipha;
2324         udpha_t         *udpha;
2325         netstack_t      *ns = assoc->ipsa_netstack;
2326 
2327         ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2328 
2329         mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2330         if (mp == NULL)
2331                 return;
2332         ipha = (ipha_t *)mp->b_rptr;
2333         ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2334         ipha->ipha_type_of_service = 0;
2335         ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2336         /* Use the low-16 of the SPI so we have some clue where it came from. */
2337         ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2338         ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2339         ipha->ipha_ttl = 0xFF;
2340         ipha->ipha_protocol = IPPROTO_UDP;
2341         ipha->ipha_hdr_checksum = 0;
2342         ipha->ipha_src = assoc->ipsa_srcaddr[0];
2343         ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2344         udpha = (udpha_t *)(ipha + 1);
2345         udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2346             assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2347         udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2348             assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2349         udpha->uha_length = htons(sizeof (udpha_t) + 1);
2350         udpha->uha_checksum = 0;
2351         mp->b_wptr = (uint8_t *)(udpha + 1);
2352         *(mp->b_wptr++) = 0xFF;
2353 
2354         esp_prepare_udp(ns, mp, ipha);
2355 
2356         /*
2357          * We're holding an isaf_t bucket lock, so pawn off the actual
2358          * packet transmission to another thread, just in case syncq
2359          * processing causes a same-bucket packet to be processed.
2360          */
2361         mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid;
2362 
2363         if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp,
2364             TQ_NOSLEEP) == 0) {
2365                 /* Assume no memory if taskq_dispatch() fails. */
2366                 mp->b_prev = NULL;
2367                 ip_drop_packet(mp, B_FALSE, NULL,
2368                     DROPPER(ns->netstack_ipsec, ipds_esp_nomem),
2369                     &ns->netstack_ipsecesp->esp_dropper);
2370         }
2371 }
2372 
2373 /*
2374  * Returns data_mp if the request completed successfully.  Returns
2375  * NULL if it failed (and increments OutDiscards) or if it is pending.
2376  */
2377 static mblk_t *
2378 esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc,
2379     uchar_t *icv_buf, uint_t payload_len)
2380 {
2381         uint_t auth_len;
2382         crypto_call_req_t call_req, *callrp;
2383         mblk_t *esp_mp;
2384         esph_t *esph_ptr;
2385         mblk_t *mp;
2386         int kef_rc = CRYPTO_FAILED;
2387         uint_t icv_len = assoc->ipsa_mac_len;
2388         crypto_ctx_template_t auth_ctx_tmpl;
2389         boolean_t do_auth, do_encr, force;
2390         uint_t iv_len = assoc->ipsa_iv_len;
2391         crypto_ctx_template_t encr_ctx_tmpl;
2392         boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2393         size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2394         netstack_t      *ns = ixa->ixa_ipst->ips_netstack;
2395         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2396         ipsec_crypto_t  *ic, icstack;
2397         uchar_t         *iv_ptr;
2398         crypto_data_t   *cd_ptr = NULL;
2399         ill_t           *ill = ixa->ixa_nce->nce_ill;
2400         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2401 
2402         esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2403             is_natt ? "natt" : "not natt"));
2404 
2405         do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2406         do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2407         force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2408 
2409 #ifdef IPSEC_LATENCY_TEST
2410         kef_rc = CRYPTO_SUCCESS;
2411 #else
2412         kef_rc = CRYPTO_FAILED;
2413 #endif
2414 
2415         /*
2416          * Outbound IPsec packets are of the form:
2417          * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2418          * unless it's NATT, then it's
2419          * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2420          * Get a pointer to the mblk containing the ESP header.
2421          */
2422         ASSERT(data_mp->b_cont != NULL);
2423         esp_mp = data_mp->b_cont;
2424         esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2425         iv_ptr = (uchar_t *)(esph_ptr + 1);
2426 
2427         /*
2428          * Combined mode algs need a nonce.  This is set up in
2429          * sadb_common_add().  If for some reason we are using an SA
2430          * which does not have a nonce, then we must fail here.
2431          */
2432         if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2433             (assoc->ipsa_nonce == NULL)) {
2434                 ip_drop_packet(data_mp, B_FALSE, NULL,
2435                     DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2436                 return (NULL);
2437         }
2438 
2439         if (force) {
2440                 /* We are doing asynch; allocate mblks to hold state */
2441                 if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
2442                     (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2443                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2444                         ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
2445                         freemsg(data_mp);
2446                         return (NULL);
2447                 }
2448 
2449                 linkb(mp, data_mp);
2450                 callrp = &call_req;
2451                 ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound);
2452         } else {
2453                 /*
2454                  * If we know we are going to do sync then ipsec_crypto_t
2455                  * should be on the stack.
2456                  */
2457                 ic = &icstack;
2458                 bzero(ic, sizeof (*ic));
2459                 callrp = NULL;
2460         }
2461 
2462 
2463         if (do_auth) {
2464                 /* authentication context template */
2465                 IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2466                     auth_ctx_tmpl);
2467 
2468                 /* where to store the computed mac */
2469                 ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2470                     icv_len, icv_buf);
2471 
2472                 /* authentication starts at the ESP header */
2473                 auth_len = payload_len + iv_len + sizeof (esph_t);
2474                 if (!do_encr) {
2475                         /* authentication only */
2476                         /* initialize input data argument */
2477                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2478                             esp_mp, esph_offset, auth_len);
2479 
2480                         /* call the crypto framework */
2481                         kef_rc = crypto_mac(&assoc->ipsa_amech,
2482                             &ic->ic_crypto_data,
2483                             &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2484                             &ic->ic_crypto_mac, callrp);
2485                 }
2486         }
2487 
2488         if (do_encr) {
2489                 /* encryption context template */
2490                 IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2491                     encr_ctx_tmpl);
2492                 /* Call the nonce update function. */
2493                 (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len,
2494                     iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2495 
2496                 if (!do_auth) {
2497                         /* encryption only, skip mblk that contains ESP hdr */
2498                         /* initialize input data argument */
2499                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2500                             esp_mp->b_cont, 0, payload_len);
2501 
2502                         /*
2503                          * For combined mode ciphers the ciphertext is the
2504                          * same size as the cleartext, and the ICV follows
2505                          * the ciphertext.  To convince the KCF to do
2506                          * in-line encryption with an ICV, point
2507                          * ic_crypto_mac at the same buffer as the data.
2508                          * The calling function needs to ensure the buffer
2509                          * is large enough to include the ICV.
2510                          *
2511                          * The IV has already been written to the packet
2512                          * buffer; the nonce setup function copied it into
2513                          * the params struct for the cipher to use.
2514                          */
2515                         if (assoc->ipsa_flags & IPSA_F_COMBINED) {
2516                                 bcopy(&ic->ic_crypto_data,
2517                                     &ic->ic_crypto_mac,
2518                                     sizeof (crypto_data_t));
2519                                 ic->ic_crypto_mac.cd_length =
2520                                     payload_len + icv_len;
2521                                 cd_ptr = &ic->ic_crypto_mac;
2522                         }
2523 
2524                         /* call the crypto framework */
2525                         kef_rc = crypto_encrypt((crypto_mechanism_t *)
2526                             &ic->ic_cmm, &ic->ic_crypto_data,
2527                             &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2528                             cd_ptr, callrp);
2529 
2530                 }
2531         }
2532 
2533         if (do_auth && do_encr) {
2534                 /*
2535                  * Encryption and authentication:
2536                  * Pass the pointer to the mblk chain starting at the ESP
2537                  * header to the framework. Skip the ESP header mblk
2538                  * for encryption, which is reflected by an encryption
2539                  * offset equal to the length of that mblk. Start
2540                  * the authentication at the ESP header, i.e. use an
2541                  * authentication offset of zero.
2542                  */
2543                 ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2544                     esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2545 
2546                 /* specify IV */
2547                 ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2548 
2549                 /* call the framework */
2550                 kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2551                     &assoc->ipsa_amech, NULL,
2552                     &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2553                     encr_ctx_tmpl, auth_ctx_tmpl,
2554                     &ic->ic_crypto_dual_data,
2555                     &ic->ic_crypto_mac, callrp);
2556         }
2557 
2558         switch (kef_rc) {
2559         case CRYPTO_SUCCESS:
2560                 ESP_BUMP_STAT(espstack, crypto_sync);
2561                 esp_set_usetime(assoc, B_FALSE);
2562                 if (force) {
2563                         mp = ipsec_free_crypto_data(mp);
2564                         data_mp = ip_xmit_attr_free_mblk(mp);
2565                 }
2566                 if (is_natt)
2567                         esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr);
2568                 return (data_mp);
2569         case CRYPTO_QUEUED:
2570                 /* esp_kcf_callback_outbound() will be invoked on completion */
2571                 ESP_BUMP_STAT(espstack, crypto_async);
2572                 return (NULL);
2573         }
2574 
2575         if (force) {
2576                 mp = ipsec_free_crypto_data(mp);
2577                 data_mp = ip_xmit_attr_free_mblk(mp);
2578         }
2579         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2580         esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack);
2581         /* data_mp was passed to ip_drop_packet */
2582         return (NULL);
2583 }
2584 
2585 /*
2586  * Handle outbound IPsec processing for IPv4 and IPv6
2587  *
2588  * Returns data_mp if successfully completed the request. Returns
2589  * NULL if it failed (and increments OutDiscards) or if it is pending.
2590  */
2591 static mblk_t *
2592 esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
2593 {
2594         mblk_t *espmp, *tailmp;
2595         ipha_t *ipha;
2596         ip6_t *ip6h;
2597         esph_t *esph_ptr, *iv_ptr;
2598         uint_t af;
2599         uint8_t *nhp;
2600         uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2601         uintptr_t esplen = sizeof (esph_t);
2602         uint8_t protocol;
2603         ipsa_t *assoc;
2604         uint_t iv_len, block_size, mac_len = 0;
2605         uchar_t *icv_buf;
2606         udpha_t *udpha;
2607         boolean_t is_natt = B_FALSE;
2608         netstack_t      *ns = ixa->ixa_ipst->ips_netstack;
2609         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2610         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2611         ill_t           *ill = ixa->ixa_nce->nce_ill;
2612         boolean_t       need_refrele = B_FALSE;
2613 
2614         ESP_BUMP_STAT(espstack, out_requests);
2615 
2616         /*
2617          * <sigh> We have to copy the message here, because TCP (for example)
2618          * keeps a dupb() of the message lying around for retransmission.
2619          * Since ESP changes the whole of the datagram, we have to create our
2620          * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2621          * we might as well make use of msgpullup() and get the mblk into one
2622          * contiguous piece!
2623          */
2624         tailmp = msgpullup(data_mp, -1);
2625         if (tailmp == NULL) {
2626                 esp0dbg(("esp_outbound: msgpullup() failed, "
2627                     "dropping packet.\n"));
2628                 ip_drop_packet(data_mp, B_FALSE, ill,
2629                     DROPPER(ipss, ipds_esp_nomem),
2630                     &espstack->esp_dropper);
2631                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2632                 return (NULL);
2633         }
2634         freemsg(data_mp);
2635         data_mp = tailmp;
2636 
2637         assoc = ixa->ixa_ipsec_esp_sa;
2638         ASSERT(assoc != NULL);
2639 
2640         /*
2641          * Get the outer IP header in shape to escape this system.
2642          */
2643         if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
2644                 /*
2645                  * Need to update packet with any CIPSO option and update
2646                  * ixa_tsl to capture the new label.
2647                  * We allocate a separate ixa for that purpose.
2648                  */
2649                 ixa = ip_xmit_attr_duplicate(ixa);
2650                 if (ixa == NULL) {
2651                         ip_drop_packet(data_mp, B_FALSE, ill,
2652                             DROPPER(ipss, ipds_esp_nomem),
2653                             &espstack->esp_dropper);
2654                         return (NULL);
2655                 }
2656                 need_refrele = B_TRUE;
2657 
2658                 label_hold(assoc->ipsa_otsl);
2659                 ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
2660 
2661                 data_mp = sadb_whack_label(data_mp, assoc, ixa,
2662                     DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2663                 if (data_mp == NULL) {
2664                         /* Packet dropped by sadb_whack_label */
2665                         ixa_refrele(ixa);
2666                         return (NULL);
2667                 }
2668         }
2669 
2670         /*
2671          * Reality check....
2672          */
2673         ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2674 
2675         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2676                 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
2677 
2678                 af = AF_INET;
2679                 divpoint = IPH_HDR_LENGTH(ipha);
2680                 datalen = ntohs(ipha->ipha_length) - divpoint;
2681                 nhp = (uint8_t *)&ipha->ipha_protocol;
2682         } else {
2683                 ip_pkt_t ipp;
2684 
2685                 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
2686 
2687                 af = AF_INET6;
2688                 ip6h = (ip6_t *)ipha;
2689                 bzero(&ipp, sizeof (ipp));
2690                 divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL);
2691                 if (ipp.ipp_dstopts != NULL &&
2692                     ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2693                         /*
2694                          * Destination options are tricky.  If we get in here,
2695                          * then we have a terminal header following the
2696                          * destination options.  We need to adjust backwards
2697                          * so we insert ESP BEFORE the destination options
2698                          * bag.  (So that the dstopts get encrypted!)
2699                          *
2700                          * Since this is for outbound packets only, we know
2701                          * that non-terminal destination options only precede
2702                          * routing headers.
2703                          */
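                             /*
                              * For example, with a header chain of
                              * ip6 | dstopts | tcp (no routing header), ESP is
                              * inserted in front of the dstopts so that they
                              * end up inside the encrypted payload.
                              */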
2704                         divpoint -= ipp.ipp_dstoptslen;
2705                 }
2706                 datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2707 
2708                 if (ipp.ipp_rthdr != NULL) {
2709                         nhp = &ipp.ipp_rthdr->ip6r_nxt;
2710                 } else if (ipp.ipp_hopopts != NULL) {
2711                         nhp = &ipp.ipp_hopopts->ip6h_nxt;
2712                 } else {
2713                         ASSERT(divpoint == sizeof (ip6_t));
2714                         /* It's probably IP + ESP. */
2715                         nhp = &ip6h->ip6_nxt;
2716                 }
2717         }
2718 
2719         mac_len = assoc->ipsa_mac_len;
2720 
2721         if (assoc->ipsa_flags & IPSA_F_NATT) {
2722                 /* wedge in UDP header */
2723                 is_natt = B_TRUE;
2724                 esplen += UDPH_SIZE;
2725         }
2726 
2727         /*
2728          * Set up ESP header and encryption padding for ENCR PI request.
2729          */
2730 
2731         /* Determine the padding length.  Pad to 4 bytes for null encryption. */
2732         if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2733                 iv_len = assoc->ipsa_iv_len;
2734                 block_size = assoc->ipsa_datalen;
2735 
2736                 /*
2737                  * Pad the data to the length of the cipher block size.
2738                  * Include the two additional bytes (hence the - 2) for the
2739                  * padding length and the next header.  Take this into account
2740                  * when calculating the actual length of the padding.
2741                  */
2742                 ASSERT(ISP2(iv_len));
2743                 padlen = ((unsigned)(block_size - datalen - 2)) &
2744                     (block_size - 1);
2745         } else {
2746                 iv_len = 0;
2747                 padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2748                     (sizeof (uint32_t) - 1);
2749         }
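             /*
              * Worked example (illustrative numbers): with a 16-byte cipher
              * block and datalen == 100, padlen == (16 - 100 - 2) & 15 == 10,
              * so data (100) + padding (10) + the two trailer bytes come to
              * 112, a multiple of the block size.
              */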
2750 
2751         /* Allocate ESP header and IV. */
2752         esplen += iv_len;
2753 
2754         /*
2755          * Update association byte-count lifetimes.  Don't forget to take
2756          * into account the padding length and next-header (hence the + 2).
2757          *
2758          * Use the amount of data fed into the "encryption algorithm".  This
2759          * is the IV, the data length, the padding length, and the final two
2760          * bytes (padlen, and next-header).
2761          *
2762          */
2763 
2764         if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2765                 ip_drop_packet(data_mp, B_FALSE, ill,
2766                     DROPPER(ipss, ipds_esp_bytes_expire),
2767                     &espstack->esp_dropper);
2768                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2769                 if (need_refrele)
2770                         ixa_refrele(ixa);
2771                 return (NULL);
2772         }
2773 
2774         espmp = allocb(esplen, BPRI_HI);
2775         if (espmp == NULL) {
2776                 ESP_BUMP_STAT(espstack, out_discards);
2777                 esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2778                 ip_drop_packet(data_mp, B_FALSE, ill,
2779                     DROPPER(ipss, ipds_esp_nomem),
2780                     &espstack->esp_dropper);
2781                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2782                 if (need_refrele)
2783                         ixa_refrele(ixa);
2784                 return (NULL);
2785         }
2786         espmp->b_wptr += esplen;
2787         esph_ptr = (esph_t *)espmp->b_rptr;
2788 
2789         if (is_natt) {
2790                 esp3dbg(espstack, ("esp_outbound: NATT"));
2791 
2792                 udpha = (udpha_t *)espmp->b_rptr;
2793                 udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2794                     assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2795                 udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2796                     assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2797                 /*
2798                  * Set the checksum to 0, so that the esp_prepare_udp() call
2799                  * can do the right thing.
2800                  */
2801                 udpha->uha_checksum = 0;
2802                 esph_ptr = (esph_t *)(udpha + 1);
2803         }
2804 
2805         esph_ptr->esph_spi = assoc->ipsa_spi;
2806 
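             /*
              * Assuming ipsa_replay starts at zero on a fresh SA,
              * atomic_inc_32_nv() returns 1 for the first packet; a returned
              * value of zero therefore means the 32-bit counter has wrapped,
              * which the check below treats as fatal when replay protection
              * (ipsa_replay_wsize != 0) is in use.
              */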
2807         esph_ptr->esph_replay = htonl(atomic_inc_32_nv(&assoc->ipsa_replay));
2808         if (esph_ptr->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2809                 /*
2810                  * XXX We have replay counter wrapping.
2811                  * We probably want to nuke this SA (and its peer).
2812                  */
2813                 ipsec_assocfailure(info.mi_idnum, 0, 0,
2814                     SL_ERROR | SL_CONSOLE | SL_WARN,
2815                     "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2816                     esph_ptr->esph_spi, assoc->ipsa_dstaddr, af,
2817                     espstack->ipsecesp_netstack);
2818 
2819                 ESP_BUMP_STAT(espstack, out_discards);
2820                 sadb_replay_delete(assoc);
2821                 ip_drop_packet(data_mp, B_FALSE, ill,
2822                     DROPPER(ipss, ipds_esp_replay),
2823                     &espstack->esp_dropper);
2824                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2825                 if (need_refrele)
2826                         ixa_refrele(ixa);
2827                 return (NULL);
2828         }
2829 
2830         iv_ptr = (esph_ptr + 1);
2831         /*
2832          * iv_ptr points to the location in espmp where the IV will be
2833          * written. That mblk will be part of the mblk chain that makes
2834          * up the packet.
2835          *
2836          * For counter mode algorithms, the IV is a 64-bit quantity; it
2837          * must NEVER repeat in the lifetime of the SA, otherwise an
2838          * attacker who had recorded enough packets might be able to
2839          * determine some clear text.
2840          *
2841          * To ensure this does not happen, the IV is stored in the SA and
2842          * incremented for each packet; the IV is then copied into the
2843          * "packet" for transmission to the receiving system. The IV will
2844          * also be copied into the nonce when the packet is encrypted.
2845          *
2846          * CBC mode algorithms use a random IV for each packet. We do not
2847          * require the highest quality random bits, but for best security
2848          * with CBC mode ciphers, the value must be unlikely to repeat and
2849          * must not be known in advance to an adversary capable of influencing
2850          * the clear text.
2851          */
2852         if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc,
2853             espstack)) {
2854                 ip_drop_packet(data_mp, B_FALSE, ill,
2855                     DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper);
2856                 if (need_refrele)
2857                         ixa_refrele(ixa);
2858                 return (NULL);
2859         }
2860 
2861         /* Fix the IP header. */
2862         alloclen = padlen + 2 + mac_len;
2863         adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
2864 
2865         protocol = *nhp;
2866 
2867         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2868                 ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2869                 if (is_natt) {
2870                         *nhp = IPPROTO_UDP;
2871                         udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2872                             IPH_HDR_LENGTH(ipha));
2873                 } else {
2874                         *nhp = IPPROTO_ESP;
2875                 }
2876                 ipha->ipha_hdr_checksum = 0;
2877                 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2878         } else {
2879                 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2880                 *nhp = IPPROTO_ESP;
2881         }
2882 
2883         /* I've got the ESP mblk built, now insert it. */
2884 
2885         esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2886         esp2dbg(espstack, (dump_msg(data_mp)));
2887 
2888         if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2889                 ESP_BUMP_STAT(espstack, out_discards);
2890                 /* NOTE:  esp_insert_esp() only fails if there's no memory. */
2891                 ip_drop_packet(data_mp, B_FALSE, ill,
2892                     DROPPER(ipss, ipds_esp_nomem),
2893                     &espstack->esp_dropper);
2894                 freeb(espmp);
2895                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2896                 if (need_refrele)
2897                         ixa_refrele(ixa);
2898                 return (NULL);
2899         }
2900 
2901         /* Append padding (and leave room for ICV). */
2902         for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2903                 ;
2904         if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2905                 tailmp->b_cont = allocb(alloclen, BPRI_HI);
2906                 if (tailmp->b_cont == NULL) {
2907                         ESP_BUMP_STAT(espstack, out_discards);
2908                         esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2909                         ip_drop_packet(data_mp, B_FALSE, ill,
2910                             DROPPER(ipss, ipds_esp_nomem),
2911                             &espstack->esp_dropper);
2912                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2913                         if (need_refrele)
2914                                 ixa_refrele(ixa);
2915                         return (NULL);
2916                 }
2917                 tailmp = tailmp->b_cont;
2918         }
2919 
2920         /*
2921          * If there's padding, N bytes of padding must be of the form 0x1,
2922          * 0x2, 0x3... 0xN.
2923          */
2924         for (i = 0; i < padlen; ) {
2925                 i++;
2926                 *tailmp->b_wptr++ = i;
2927         }
2928         *tailmp->b_wptr++ = i;
2929         *tailmp->b_wptr++ = protocol;
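             /*
              * e.g. with padlen == 3 the bytes appended above are
              * 0x01 0x02 0x03, followed by the pad-length byte (0x03) and the
              * saved next-header (protocol) value.
              */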
2930 
2931         esp2dbg(espstack, ("data_Mp before encryption:\n"));
2932         esp2dbg(espstack, (dump_msg(data_mp)));
2933 
2934         /*
2935          * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2936          */
2937 
2938         if (mac_len > 0) {
2939                 ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2940                 icv_buf = tailmp->b_wptr;
2941                 tailmp->b_wptr += mac_len;
2942         } else {
2943                 icv_buf = NULL;
2944         }
2945 
2946         data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf,
2947             datalen + padlen + 2);
2948         if (need_refrele)
2949                 ixa_refrele(ixa);
2950         return (data_mp);
2951 }
2952 
2953 /*
2954  * IP calls this to validate the ICMP errors that
2955  * we got from the network.
2956  */
2957 mblk_t *
2958 ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
2959 {
2960         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
2961         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2962         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2963 
2964         /*
2965          * Unless we get an entire packet back, this function is useless.
2966          * Why?
2967          *
2968          * 1.)  Partial packets are useless, because the "next header"
2969          *      is at the end of the decrypted ESP packet.  Without the
2970          *      whole packet, we can't tell what it carried.
2971          *
2972          * 2.)  If we ever use a stateful cipher, such as a stream or a
2973          *      one-time pad, we can't do anything.
2974          *
2975          * Since the chances of us getting an entire packet back are very,
2976          * very small, we discard here.
2977          */
2978         IP_ESP_BUMP_STAT(ipss, in_discards);
2979         ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
2980             DROPPER(ipss, ipds_esp_icmp),
2981             &espstack->esp_dropper);
2982         return (NULL);
2983 }
2984 
2985 /*
2986  * Construct an SADB_REGISTER message with the current algorithms.
2987  * This function gets called when 'ipsecalgs -s' is run or when
2988  * in.iked (or other KMD) starts.
2989  */
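     /*
      * The reply is laid out roughly as:
      *
      *   KEYSOCK_OUT mblk -> [sadb_msg]
      *                       [SUPPORTED_AUTH ext][sadb_alg x num_aalgs]
      *                       [SUPPORTED_ENCRYPT ext][sadb_alg x num_ealgs]
      *                       [SENSITIVITY ext]          (labeled systems only)
      *
      * with each SUPPORTED extension present only when at least one valid
      * algorithm of that type is loaded.
      */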
2990 static boolean_t
2991 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
2992     ipsecesp_stack_t *espstack, cred_t *cr)
2993 {
2994         mblk_t *pfkey_msg_mp, *keysock_out_mp;
2995         sadb_msg_t *samsg;
2996         sadb_supported_t *sasupp_auth = NULL;
2997         sadb_supported_t *sasupp_encr = NULL;
2998         sadb_alg_t *saalg;
2999         uint_t allocsize = sizeof (*samsg);
3000         uint_t i, numalgs_snap;
3001         int current_aalgs;
3002         ipsec_alginfo_t **authalgs;
3003         uint_t num_aalgs;
3004         int current_ealgs;
3005         ipsec_alginfo_t **encralgs;
3006         uint_t num_ealgs;
3007         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3008         sadb_sens_t *sens;
3009         size_t sens_len = 0;
3010         sadb_ext_t *nextext;
3011         ts_label_t *sens_tsl = NULL;
3012 
3013         /* Allocate the KEYSOCK_OUT. */
3014         keysock_out_mp = sadb_keysock_out(serial);
3015         if (keysock_out_mp == NULL) {
3016                 esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
3017                 return (B_FALSE);
3018         }
3019 
3020         if (is_system_labeled() && (cr != NULL)) {
3021                 sens_tsl = crgetlabel(cr);
3022                 if (sens_tsl != NULL) {
3023                         sens_len = sadb_sens_len_from_label(sens_tsl);
3024                         allocsize += sens_len;
3025                 }
3026         }
3027 
3028         /*
3029          * Allocate the PF_KEY message that follows KEYSOCK_OUT.
3030          */
3031 
3032         mutex_enter(&ipss->ipsec_alg_lock);
3033         /*
3034          * Fill SADB_REGISTER message's algorithm descriptors.  Hold
3035          * down the lock while filling it.
3036          *
3037          * Return only valid algorithms, so the number of algorithms
3038          * to send up may be less than the number of algorithm entries
3039          * in the table.
3040          */
3041         authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
3042         for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
3043                 if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
3044                         num_aalgs++;
3045 
3046         if (num_aalgs != 0) {
3047                 allocsize += (num_aalgs * sizeof (*saalg));
3048                 allocsize += sizeof (*sasupp_auth);
3049         }
3050         encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
3051         for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
3052                 if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
3053                         num_ealgs++;
3054 
3055         if (num_ealgs != 0) {
3056                 allocsize += (num_ealgs * sizeof (*saalg));
3057                 allocsize += sizeof (*sasupp_encr);
3058         }
3059         keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
3060         if (keysock_out_mp->b_cont == NULL) {
3061                 mutex_exit(&ipss->ipsec_alg_lock);
3062                 freemsg(keysock_out_mp);
3063                 return (B_FALSE);
3064         }
3065         pfkey_msg_mp = keysock_out_mp->b_cont;
3066         pfkey_msg_mp->b_wptr += allocsize;
3067 
3068         nextext = (sadb_ext_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
3069 
3070         if (num_aalgs != 0) {
3071                 sasupp_auth = (sadb_supported_t *)nextext;
3072                 saalg = (sadb_alg_t *)(sasupp_auth + 1);
3073 
3074                 ASSERT(((ulong_t)saalg & 0x7) == 0);
3075 
3076                 numalgs_snap = 0;
3077                 for (i = 0;
3078                     ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
3079                     i++) {
3080                         if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
3081                                 continue;
3082 
3083                         saalg->sadb_alg_id = authalgs[i]->alg_id;
3084                         saalg->sadb_alg_ivlen = 0;
3085                         saalg->sadb_alg_minbits = authalgs[i]->alg_ef_minbits;
3086                         saalg->sadb_alg_maxbits = authalgs[i]->alg_ef_maxbits;
3087                         saalg->sadb_x_alg_increment =
3088                             authalgs[i]->alg_increment;
3089                         saalg->sadb_x_alg_saltbits = SADB_8TO1(
3090                             authalgs[i]->alg_saltlen);
3091                         numalgs_snap++;
3092                         saalg++;
3093                 }
3094                 ASSERT(numalgs_snap == num_aalgs);
3095 #ifdef DEBUG
3096                 /*
3097                  * Reality check to make sure I snagged all of the
3098                  * algorithms.
3099                  */
3100                 for (; i < IPSEC_MAX_ALGS; i++) {
3101                         if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
3102                                 cmn_err(CE_PANIC, "esp_register_out()! "
3103                                     "Missed aalg #%d.\n", i);
3104                         }
3105                 }
3106 #endif /* DEBUG */
3107                 nextext = (sadb_ext_t *)saalg;
3108         }
3109 
3110         if (num_ealgs != 0) {
3111                 sasupp_encr = (sadb_supported_t *)nextext;
3112                 saalg = (sadb_alg_t *)(sasupp_encr + 1);
3113 
3114                 numalgs_snap = 0;
3115                 for (i = 0;
3116                     ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
3117                         if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
3118                                 continue;
3119                         saalg->sadb_alg_id = encralgs[i]->alg_id;
3120                         saalg->sadb_alg_ivlen = encralgs[i]->alg_ivlen;
3121                         saalg->sadb_alg_minbits = encralgs[i]->alg_ef_minbits;
3122                         saalg->sadb_alg_maxbits = encralgs[i]->alg_ef_maxbits;
3123                         /*
3124                          * We could advertise the ICV length, except there
3125                          * is no field in sadb_alg_t to carry it.
3126                          * saalg->sadb_alg_maclen = encralgs[i]->alg_maclen;
3127                          */
3128                         saalg->sadb_x_alg_increment =
3129                             encralgs[i]->alg_increment;
3130                         saalg->sadb_x_alg_saltbits =
3131                             SADB_8TO1(encralgs[i]->alg_saltlen);
3132 
3133                         numalgs_snap++;
3134                         saalg++;
3135                 }
3136                 ASSERT(numalgs_snap == num_ealgs);
3137 #ifdef DEBUG
3138                 /*
3139                  * Reality check to make sure I snagged all of the
3140                  * algorithms.
3141                  */
3142                 for (; i < IPSEC_MAX_ALGS; i++) {
3143                         if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
3144                                 cmn_err(CE_PANIC, "esp_register_out()! "
3145                                     "Missed ealg #%d.\n", i);
3146                         }
3147                 }
3148 #endif /* DEBUG */
3149                 nextext = (sadb_ext_t *)saalg;
3150         }
3151 
3152         current_aalgs = num_aalgs;
3153         current_ealgs = num_ealgs;
3154 
3155         mutex_exit(&ipss->ipsec_alg_lock);
3156 
3157         if (sens_tsl != NULL) {
3158                 sens = (sadb_sens_t *)nextext;
3159                 sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
3160                     sens_tsl, sens_len);
3161 
3162                 nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
3163         }
3164 
3165         /* Now fill the rest of the SADB_REGISTER message. */
3166 
3167         samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
3168         samsg->sadb_msg_version = PF_KEY_V2;
3169         samsg->sadb_msg_type = SADB_REGISTER;
3170         samsg->sadb_msg_errno = 0;
3171         samsg->sadb_msg_satype = SADB_SATYPE_ESP;
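             /*
              * PF_KEY lengths are counted in 64-bit words; SADB_8TO64()
              * converts the byte count.
              */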
3172         samsg->sadb_msg_len = SADB_8TO64(allocsize);
3173         samsg->sadb_msg_reserved = 0;
3174         /*
3175          * Assume the caller has sufficient sequence/pid number info.  If it's
3176          * one from me about a new algorithm, I couldn't care less about sequence.
3177          */
3178         samsg->sadb_msg_seq = sequence;
3179         samsg->sadb_msg_pid = pid;
3180 
3181         if (sasupp_auth != NULL) {
3182                 sasupp_auth->sadb_supported_len = SADB_8TO64(
3183                     sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
3184                 sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
3185                 sasupp_auth->sadb_supported_reserved = 0;
3186         }
3187 
3188         if (sasupp_encr != NULL) {
3189                 sasupp_encr->sadb_supported_len = SADB_8TO64(
3190                     sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
3191                 sasupp_encr->sadb_supported_exttype =
3192                     SADB_EXT_SUPPORTED_ENCRYPT;
3193                 sasupp_encr->sadb_supported_reserved = 0;
3194         }
3195 
3196         if (espstack->esp_pfkey_q != NULL)
3197                 putnext(espstack->esp_pfkey_q, keysock_out_mp);
3198         else {
3199                 freemsg(keysock_out_mp);
3200                 return (B_FALSE);
3201         }
3202 
3203         return (B_TRUE);
3204 }
3205 
3206 /*
3207  * Invoked when the algorithm table changes. Causes SADB_REGISTER
3208  * messages containing the current list of algorithms to be
3209  * sent up to the ESP listeners.
3210  */
3211 void
3212 ipsecesp_algs_changed(netstack_t *ns)
3213 {
3214         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
3215 
3216         /*
3217          * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
3218          * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
3219          */
3220         (void) esp_register_out(0, 0, 0, espstack, NULL);
3221 }
3222 
3223 /*
3224  * Stub function that taskq_dispatch() invokes to take the mblk (in arg)
3225  * and feed it into ESP and IP again.
3226  */
3227 static void
3228 inbound_task(void *arg)
3229 {
3230         mblk_t          *mp = (mblk_t *)arg;
3231         mblk_t          *async_mp;
3232         ip_recv_attr_t  iras;
3233 
3234         async_mp = mp;
3235         mp = async_mp->b_cont;
3236         async_mp->b_cont = NULL;
3237         if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
3238                 /* The ill or ip_stack_t disappeared on us */
3239                 ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
3240                 freemsg(mp);
3241                 goto done;
3242         }
3243 
3244         esp_inbound_restart(mp, &iras);
3245 done:
3246         ira_cleanup(&iras, B_TRUE);
3247 }
3248 
3249 /*
3250  * Restart ESP after the SA has been added.
3251  */
3252 static void
3253 esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
3254 {
3255         esph_t          *esph;
3256         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
3257         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3258 
3259         esp2dbg(espstack, ("in ESP inbound_task"));
3260         ASSERT(espstack != NULL);
3261 
3262         mp = ipsec_inbound_esp_sa(mp, ira, &esph);
3263         if (mp == NULL)
3264                 return;
3265 
3266         ASSERT(esph != NULL);
3267         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3268         ASSERT(ira->ira_ipsec_esp_sa != NULL);
3269 
3270         mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira);
3271         if (mp == NULL) {
3272                 /*
3273                  * Either it failed or is pending. In the former case
3274                  * ipIfStatsInDiscards was increased.
3275                  */
3276                 return;
3277         }
3278 
3279         ip_input_post_ipsec(mp, ira);
3280 }
3281 
3282 /*
3283  * Now that the weak-key check passed, actually ADD the security association, and
3284  * send back a reply ADD message.
3285  */
3286 static int
3287 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3288     int *diagnostic, ipsecesp_stack_t *espstack)
3289 {
3290         isaf_t *primary = NULL, *secondary;
3291         boolean_t clone = B_FALSE, is_inbound = B_FALSE;
3292         ipsa_t *larval = NULL;
3293         ipsacq_t *acqrec;
3294         iacqf_t *acq_bucket;
3295         mblk_t *acq_msgs = NULL;
3296         int rc;
3297         mblk_t *lpkt;
3298         int error;
3299         ipsa_query_t sq;
3300         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3301 
3302         /*
3303          * Locate the appropriate table(s).
3304          */
3305         sq.spp = &espstack->esp_sadb;    /* XXX */
3306         error = sadb_form_query(ksi, IPSA_Q_SA|IPSA_Q_DST,
3307             IPSA_Q_SA|IPSA_Q_DST|IPSA_Q_INBOUND|IPSA_Q_OUTBOUND,
3308             &sq, diagnostic);
3309         if (error)
3310                 return (error);
3311 
3312         /*
3313          * Use the direction flags provided by the KMD to determine
3314          * if the inbound or outbound table should be the primary
3315          * for this SA. If these flags were absent then make this
3316          * decision based on the addresses.
3317          */
3318         if (sq.assoc->sadb_sa_flags & IPSA_F_INBOUND) {
3319                 primary = sq.inbound;
3320                 secondary = sq.outbound;
3321                 is_inbound = B_TRUE;
3322                 if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
3323                         clone = B_TRUE;
3324         } else if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
3325                 primary = sq.outbound;
3326                 secondary = sq.inbound;
3327         }
3328 
3329         if (primary == NULL) {
3330                 /*
3331                  * The KMD did not set a direction flag, determine which
3332                  * table to insert the SA into based on addresses.
3333                  */
3334                 switch (ksi->ks_in_dsttype) {
3335                 case KS_IN_ADDR_MBCAST:
3336                         clone = B_TRUE; /* All mcast SAs can be bidirectional */
3337                         sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3338                         /* FALLTHRU */
3339                 /*
3340                  * If the source address is either one of mine, or unspecified
3341                  * (which is best summed up by saying "not 'not mine'"),
3342                  * then the association is potentially bi-directional,
3343                  * in that it can be used for inbound traffic and outbound
3344                  * traffic.  The best example of such an SA is a multicast
3345                  * SA (which allows me to receive the outbound traffic).
3346                  */
3347                 case KS_IN_ADDR_ME:
3348                         sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3349                         primary = sq.inbound;
3350                         secondary = sq.outbound;
3351                         if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3352                                 clone = B_TRUE;
3353                         is_inbound = B_TRUE;
3354                         break;
3355                 /*
3356                  * If the source address is literally not mine (either
3357                  * unspecified or not mine), then this SA may have an
3358                  * address that WILL be mine after some configuration.
3359                  * We pay the price for this by making it a bi-directional
3360                  * SA.
3361                  */
3362                 case KS_IN_ADDR_NOTME:
3363                         sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3364                         primary = sq.outbound;
3365                         secondary = sq.inbound;
3366                         if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
3367                                 sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3368                                 clone = B_TRUE;
3369                         }
3370                         break;
3371                 default:
3372                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3373                         return (EINVAL);
3374                 }
3375         }
3376 
3377         /*
3378          * Find an ACQUIRE list entry if possible.  If we've added an SA that
3379          * suits the needs of an ACQUIRE list entry, we can eliminate the
3380          * ACQUIRE list entry and transmit the enqueued packets.  Use the
3381          * high-bit of the sequence number to queue it.  Key off destination
3382          * addr, and change acqrec's state.
3383          */
3384 
3385         if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3386                 acq_bucket = &(sq.sp->sdb_acq[sq.outhash]);
3387                 mutex_enter(&acq_bucket->iacqf_lock);
3388                 for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3389                     acqrec = acqrec->ipsacq_next) {
3390                         mutex_enter(&acqrec->ipsacq_lock);
3391                         /*
3392                          * Q:  I only check sequence.  Should I check dst?
3393                          * A: Yes, check dest because those are the packets
3394                          *    that are queued up.
3395                          */
3396                         if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3397                             IPSA_ARE_ADDR_EQUAL(sq.dstaddr,
3398                             acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3399                                 break;
3400                         mutex_exit(&acqrec->ipsacq_lock);
3401                 }
3402                 if (acqrec != NULL) {
3403                         /*
3404                          * AHA!  I found an ACQUIRE record for this SA.
3405                          * Grab the msg list, and free the acquire record.
3406                          * I already am holding the lock for this record,
3407                          * so all I have to do is free it.
3408                          */
3409                         acq_msgs = acqrec->ipsacq_mp;
3410                         acqrec->ipsacq_mp = NULL;
3411                         mutex_exit(&acqrec->ipsacq_lock);
3412                         sadb_destroy_acquire(acqrec,
3413                             espstack->ipsecesp_netstack);
3414                 }
3415                 mutex_exit(&acq_bucket->iacqf_lock);
3416         }
3417 
3418         /*
3419          * Find PF_KEY message, and see if I'm an update.  If so, find entry
3420          * in larval list (if there).
3421          */
3422         if (samsg->sadb_msg_type == SADB_UPDATE) {
3423                 mutex_enter(&sq.inbound->isaf_lock);
3424                 larval = ipsec_getassocbyspi(sq.inbound, sq.assoc->sadb_sa_spi,
3425                     ALL_ZEROES_PTR, sq.dstaddr, sq.dst->sin_family);
3426                 mutex_exit(&sq.inbound->isaf_lock);
3427 
3428                 if ((larval == NULL) ||
3429                     (larval->ipsa_state != IPSA_STATE_LARVAL)) {
3430                         *diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
3431                         if (larval != NULL) {
3432                                 IPSA_REFRELE(larval);
3433                         }
3434                         esp0dbg(("Larval update, but larval disappeared.\n"));
3435                         return (ESRCH);
3436                 } /* Else sadb_common_add unlinks it for me! */
3437         }
3438 
3439         if (larval != NULL) {
3440                 /*
3441                  * Hold again, because sadb_common_add() consumes a reference,
3442                  * and we don't want to clear_lpkt() without a reference.
3443                  */
3444                 IPSA_REFHOLD(larval);
3445         }
3446 
3447         rc = sadb_common_add(espstack->esp_pfkey_q,
3448             mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3449             diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
3450 
3451         if (larval != NULL) {
3452                 if (rc == 0) {
3453                         lpkt = sadb_clear_lpkt(larval);
3454                         if (lpkt != NULL) {
3455                                 rc = !taskq_dispatch(esp_taskq, inbound_task,
3456                                     lpkt, TQ_NOSLEEP);
3457                         }
3458                 }
3459                 IPSA_REFRELE(larval);
3460         }
3461 
3462         /*
3463          * How much more stack will I create with all of these
3464          * esp_outbound() calls?
3465          */
3466 
3467         /* Handle the packets queued waiting for the SA */
3468         while (acq_msgs != NULL) {
3469                 mblk_t          *asyncmp;
3470                 mblk_t          *data_mp;
3471                 ip_xmit_attr_t  ixas;
3472                 ill_t           *ill;
3473 
3474                 asyncmp = acq_msgs;
3475                 acq_msgs = acq_msgs->b_next;
3476                 asyncmp->b_next = NULL;
3477 
3478                 /*
3479                  * Extract the ip_xmit_attr_t from the first mblk.
3480                  * Verifies that the netstack and ill are still around; they
3481                  * could have vanished while iked was doing its work.
3482                  * On successful return we have a nce_t and the ill/ipst can't
3483                  * disappear until we do the nce_refrele in ixa_cleanup.
3484                  */
3485                 data_mp = asyncmp->b_cont;
3486                 asyncmp->b_cont = NULL;
3487                 if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
3488                         ESP_BUMP_STAT(espstack, out_discards);
3489                         ip_drop_packet(data_mp, B_FALSE, NULL,
3490                             DROPPER(ipss, ipds_sadb_acquire_timeout),
3491                             &espstack->esp_dropper);
3492                 } else if (rc != 0) {
3493                         ill = ixas.ixa_nce->nce_ill;
3494                         ESP_BUMP_STAT(espstack, out_discards);
3495                         ip_drop_packet(data_mp, B_FALSE, ill,
3496                             DROPPER(ipss, ipds_sadb_acquire_timeout),
3497                             &espstack->esp_dropper);
3498                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3499                 } else {
3500                         esp_outbound_finish(data_mp, &ixas);
3501                 }
3502                 ixa_cleanup(&ixas);
3503         }
3504 
3505         return (rc);
3506 }
3507 
3508 /*
3509  * Process one of the queued messages (from ipsacq_mp) once the SA
3510  * has been added.
3511  */
3512 static void
3513 esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
3514 {
3515         netstack_t      *ns = ixa->ixa_ipst->ips_netstack;
3516         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3517         ipsec_stack_t   *ipss = ns->netstack_ipsec;
3518         ill_t           *ill = ixa->ixa_nce->nce_ill;
3519 
3520         if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) {
3521                 ESP_BUMP_STAT(espstack, out_discards);
3522                 ip_drop_packet(data_mp, B_FALSE, ill,
3523                     DROPPER(ipss, ipds_sadb_acquire_timeout),
3524                     &espstack->esp_dropper);
3525                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3526                 return;
3527         }
3528 
3529         data_mp = esp_outbound(data_mp, ixa);
3530         if (data_mp == NULL)
3531                 return;
3532 
3533         /* do AH processing if needed */
3534         data_mp = esp_do_outbound_ah(data_mp, ixa);
3535         if (data_mp == NULL)
3536                 return;
3537 
3538         (void) ip_output_post_ipsec(data_mp, ixa);
3539 }
3540 
3541 /*
3542  * Add new ESP security association.  This may become a generic AH/ESP
3543  * routine eventually.
3544  */
3545 static int
3546 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3547 {
3548         sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3549         sadb_address_t *srcext =
3550             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3551         sadb_address_t *dstext =
3552             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3553         sadb_address_t *isrcext =
3554             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3555         sadb_address_t *idstext =
3556             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3557         sadb_address_t *nttext_loc =
3558             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3559         sadb_address_t *nttext_rem =
3560             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3561         sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3562         sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3563         struct sockaddr_in *src, *dst;
3564         struct sockaddr_in *natt_loc, *natt_rem;
3565         struct sockaddr_in6 *natt_loc6, *natt_rem6;
3566         sadb_lifetime_t *soft =
3567             (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3568         sadb_lifetime_t *hard =
3569             (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3570         sadb_lifetime_t *idle =
3571             (sadb_lifetime_t *)ksi->ks_in_extv[SADB_X_EXT_LIFETIME_IDLE];
3572         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3573         ipsec_stack_t   *ipss = ns->netstack_ipsec;
3574 
3575 
3576 
3577         /* I need certain extensions present for an ADD message. */
3578         if (srcext == NULL) {
3579                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3580                 return (EINVAL);
3581         }
3582         if (dstext == NULL) {
3583                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3584                 return (EINVAL);
3585         }
3586         if (isrcext == NULL && idstext != NULL) {
3587                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3588                 return (EINVAL);
3589         }
3590         if (isrcext != NULL && idstext == NULL) {
3591                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3592                 return (EINVAL);
3593         }
3594         if (assoc == NULL) {
3595                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3596                 return (EINVAL);
3597         }
3598         if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3599                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3600                 return (EINVAL);
3601         }
3602 
3603         src = (struct sockaddr_in *)(srcext + 1);
3604         dst = (struct sockaddr_in *)(dstext + 1);
3605         natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3606         natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3607         natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3608         natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3609 
3610         /* Sundry ADD-specific reality checks. */
3611         /* XXX STATS :  Logging/stats here? */
3612 
3613         if (assoc->sadb_sa_state != SADB_SASTATE_MATURE) {
3614                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3615                 return (EINVAL);
3616         }
3617         if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3618                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3619                 return (EINVAL);
3620         }
3621 
3622 #ifndef IPSEC_LATENCY_TEST
3623         if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3624             assoc->sadb_sa_auth == SADB_AALG_NONE) {
3625                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3626                 return (EINVAL);
3627         }
3628 #endif
3629 
3630         if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
3631                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3632                 return (EINVAL);
3633         }
3634 
3635         if ((*diagnostic = sadb_hardsoftchk(hard, soft, idle)) != 0) {
3636                 return (EINVAL);
3637         }
3638         ASSERT(src->sin_family == dst->sin_family);
3639 
3640         if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3641                 if (nttext_loc == NULL) {
3642                         *diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3643                         return (EINVAL);
3644                 }
3645 
3646                 if (natt_loc->sin_family == AF_INET6 &&
3647                     !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3648                         *diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3649                         return (EINVAL);
3650                 }
3651         }
3652 
3653         if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3654                 if (nttext_rem == NULL) {
3655                         *diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3656                         return (EINVAL);
3657                 }
3658                 if (natt_rem->sin_family == AF_INET6 &&
3659                     !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3660                         *diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3661                         return (EINVAL);
3662                 }
3663         }
3664 
3665 
3666         /* Stuff I don't support, for now.  XXX Diagnostic? */
3667         if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL)
3668                 return (EOPNOTSUPP);
3669 
3670         if ((*diagnostic = sadb_labelchk(ksi)) != 0)
3671                 return (EINVAL);
3672 
3673         /*
3674          * XXX Policy :  I'm not checking identities at this time,
3675          * but if I did, I'd do them here, before I sent
3676          * the weak key check up to the algorithm.
3677          */
3678 
3679         mutex_enter(&ipss->ipsec_alg_lock);
3680 
3681         /*
3682          * First locate the authentication algorithm.
3683          */
3684 #ifdef IPSEC_LATENCY_TEST
3685         if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) {
3686 #else
3687         if (akey != NULL) {
3688 #endif
3689                 ipsec_alginfo_t *aalg;
3690 
3691                 aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3692                     [assoc->sadb_sa_auth];
3693                 if (aalg == NULL || !ALG_VALID(aalg)) {
3694                         mutex_exit(&ipss->ipsec_alg_lock);
3695                         esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3696                             assoc->sadb_sa_auth));
3697                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3698                         return (EINVAL);
3699                 }
3700 
3701                 /*
3702                  * Sanity check key sizes.
3703                  * Note: It's not possible to use SADB_AALG_NONE because
3704                  * this auth_alg is not defined with ALG_FLAG_VALID. If this
3705                  * ever changes, the same check for SADB_AALG_NONE and
3706                  * an auth_key != NULL should be made here (see below).
3707                  */
3708                 if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3709                         mutex_exit(&ipss->ipsec_alg_lock);
3710                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3711                         return (EINVAL);
3712                 }
3713                 ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3714 
3715                 /* check key and fix parity if needed */
3716                 if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3717                     diagnostic) != 0) {
3718                         mutex_exit(&ipss->ipsec_alg_lock);
3719                         return (EINVAL);
3720                 }
3721         }
3722 
3723         /*
3724          * Then locate the encryption algorithm.
3725          */
3726         if (ekey != NULL) {
3727                 uint_t keybits;
3728                 ipsec_alginfo_t *ealg;
3729 
3730                 ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3731                     [assoc->sadb_sa_encrypt];
3732                 if (ealg == NULL || !ALG_VALID(ealg)) {
3733                         mutex_exit(&ipss->ipsec_alg_lock);
3734                         esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3735                             assoc->sadb_sa_encrypt));
3736                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3737                         return (EINVAL);
3738                 }
3739 
3740                 /*
3741                  * Sanity check key sizes. If the encryption algorithm is
3742                  * SADB_EALG_NULL but the encryption key is NOT
3743                  * NULL then complain.
3744                  *
3745                  * The keying material includes salt bits if required by the
3746                  * algorithm and optionally the Initial IV; check the
3747                  * length of what's left.
3748                  */
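                     /*
                      * Illustrative example (hypothetical numbers): keying
                      * material of 160 bits with sadb_key_reserved == 0 and a
                      * 4-byte salt leaves keybits == 160 - SADB_8TO1(4) == 128
                      * for the key-size check below.
                      */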
3749                 keybits = ekey->sadb_key_bits;
3750                 keybits -= ekey->sadb_key_reserved;
3751                 keybits -= SADB_8TO1(ealg->alg_saltlen);
3752                 if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3753                     (!ipsec_valid_key_size(keybits, ealg))) {
3754                         mutex_exit(&ipss->ipsec_alg_lock);
3755                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3756                         return (EINVAL);
3757                 }
3758                 ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3759 
3760                 /* check key */
3761                 if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3762                     diagnostic) != 0) {
3763                         mutex_exit(&ipss->ipsec_alg_lock);
3764                         return (EINVAL);
3765                 }
3766         }
3767         mutex_exit(&ipss->ipsec_alg_lock);
3768 
3769         return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3770             diagnostic, espstack));
3771 }
3772 
3773 /*
3774  * Update a security association.  Updates come in two varieties.  The first
3775  * is an update of lifetimes on a non-larval SA.  The second is an update of
3776  * a larval SA, which ends up looking a lot more like an add.
3777  */
3778 static int
3779 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3780     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3781 {
3782         sadb_address_t *dstext =
3783             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3784 
3785         if (dstext == NULL) {
3786                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3787                 return (EINVAL);
3788         }
3789 
3790         return (sadb_update_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3791             espstack->esp_pfkey_q, esp_add_sa, espstack->ipsecesp_netstack,
3792             sadb_msg_type));
3793 }
3794 
3795 /* XXX refactor me */
3796 /*
3797  * Delete a security association.  This is REALLY likely to be code common to
3798  * both AH and ESP.  Find the association, then unlink it.
3799  */
3800 static int
3801 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3802     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3803 {
3804         sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3805         sadb_address_t *dstext =
3806             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3807         sadb_address_t *srcext =
3808             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3809         struct sockaddr_in *sin;
3810 
3811         if (assoc == NULL) {
3812                 if (dstext != NULL) {
3813                         sin = (struct sockaddr_in *)(dstext + 1);
3814                 } else if (srcext != NULL) {
3815                         sin = (struct sockaddr_in *)(srcext + 1);
3816                 } else {
3817                         *diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3818                         return (EINVAL);
3819                 }
3820                 return (sadb_purge_sa(mp, ksi,
3821                     (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3822                     &espstack->esp_sadb.s_v4, diagnostic,
3823                     espstack->esp_pfkey_q));
3824         }
3825 
3826         return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3827             espstack->esp_pfkey_q, sadb_msg_type));
3828 }
3829 
3830 /* XXX refactor me */
3831 /*
3832  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3833  * messages.
3834  */
3835 static void
3836 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3837 {
3838         int error;
3839         sadb_msg_t *samsg;
3840 
3841         /*
3842          * Dump each fanout, bailing if error is non-zero.
3843          */
3844 
3845         error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3846             &espstack->esp_sadb.s_v4);
3847         if (error != 0)
3848                 goto bail;
3849 
3850         error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3851             &espstack->esp_sadb.s_v6);
3852 bail:
3853         ASSERT(mp->b_cont != NULL);
3854         samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3855         samsg->sadb_msg_errno = (uint8_t)error;
3856         sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3857             (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3858 }
3859 
3860 /*
3861  * First-cut reality check for an inbound PF_KEY message.
3862  */
3863 static boolean_t
3864 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3865     ipsecesp_stack_t *espstack)
3866 {
3867         int diagnostic;
3868 
3869         if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3870                 diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3871                 goto badmsg;
3872         }
3873         if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3874             ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3875                 diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3876                 goto badmsg;
3877         }
3878         return (B_FALSE);       /* False ==> no failures */
3879 
3880 badmsg:
3881         sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3882             ksi->ks_in_serial);
3883         return (B_TRUE);        /* True ==> failures */
3884 }
3885 
3886 /*
3887  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3888  * error cases.  What I receive is a fully-formed, syntactically legal
3889  * PF_KEY message.  I then need to check semantics...
3890  *
3891  * This code may become common to AH and ESP.  Stay tuned.
3892  *
3893  * I also make the assumption that db_ref's are cool.  If this assumption
3894  * is wrong, this means that someone other than keysock or me has been
3895  * mucking with PF_KEY messages.
3896  */
3897 static void
3898 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3899 {
3900         mblk_t *msg = mp->b_cont;
3901         sadb_msg_t *samsg;
3902         keysock_in_t *ksi;
3903         int error;
3904         int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3905 
3906         ASSERT(msg != NULL);
3907 
3908         samsg = (sadb_msg_t *)msg->b_rptr;
3909         ksi = (keysock_in_t *)mp->b_rptr;
3910 
3911         /*
3912          * If applicable, convert unspecified AF_INET6 to unspecified
3913          * AF_INET.  And do other address reality checks.
3914          */
3915         if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3916             espstack->ipsecesp_netstack) ||
3917             esp_pfkey_reality_failures(mp, ksi, espstack)) {
3918                 return;
3919         }
3920 
3921         switch (samsg->sadb_msg_type) {
3922         case SADB_ADD:
3923                 error = esp_add_sa(mp, ksi, &diagnostic,
3924                     espstack->ipsecesp_netstack);
3925                 if (error != 0) {
3926                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3927                             diagnostic, ksi->ks_in_serial);
3928                 }
3929                 /* else esp_add_sa() took care of things. */
3930                 break;
3931         case SADB_DELETE:
3932         case SADB_X_DELPAIR:
3933                 error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3934                     samsg->sadb_msg_type);
3935                 if (error != 0) {
3936                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3937                             diagnostic, ksi->ks_in_serial);
3938                 }
3939                 /* Else esp_del_sa() took care of things. */
3940                 break;
3941         case SADB_GET:
3942                 error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3943                     &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3944                 if (error != 0) {
3945                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3946                             diagnostic, ksi->ks_in_serial);
3947                 }
3948                 /* Else sadb_delget_sa() took care of things. */
3949                 break;
3950         case SADB_FLUSH:
3951                 sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3952                 sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3953                 break;
        case SADB_REGISTER:
                /*
                 * Check for extensions (there should be none), extract the
                 * fields, call esp_register_out(), then either free the
                 * message or report an error.
                 *
                 * Keysock takes care of the PF_KEY bookkeeping for this.
                 */
                if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
                    ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) {
                        freemsg(mp);
                } else {
                        /*
                         * The only way this path is reached is a memory
                         * allocation failure; esp_register_out() will not
                         * return B_FALSE for lack of esp_pfkey_q when
                         * called from wput().
                         */
                        sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
                            diagnostic, ksi->ks_in_serial);
                }
                break;
        case SADB_UPDATE:
        case SADB_X_UPDATEPAIR:
                /*
                 * Find a larval SA to finish; failing that, find a mature
                 * one and apply the stricter update rules.
                 */
                error = esp_update_sa(mp, ksi, &diagnostic, espstack,
                    samsg->sadb_msg_type);
                if (error != 0) {
                        sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
                            diagnostic, ksi->ks_in_serial);
                }
                /* else esp_update_sa() took care of things. */
                break;
        case SADB_GETSPI:
                /*
                 * Reserve a new larval entry.
                 */
                esp_getspi(mp, ksi, espstack);
                break;
        case SADB_ACQUIRE:
                /*
                 * Find the larval SA and/or ACQUIRE record and kill them;
                 * an inbound ACQUIRE most likely reports an error and
                 * should carry only the base header.
                 */
                sadb_in_acquire(samsg, &espstack->esp_sadb,
                    espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
                freemsg(mp);
                break;
        case SADB_DUMP:
                /*
                 * Dump all entries.
                 */
                esp_dump(mp, ksi, espstack);
                /* esp_dump will take care of the return message, etc. */
                break;
        case SADB_EXPIRE:
                /* Should never be received by this module. */
                sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
                    diagnostic, ksi->ks_in_serial);
                break;
        default:
                sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
                    SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
                break;
        }
}

/*
 * Handle case where PF_KEY says it can't find a keysock for one of my
 * ACQUIRE messages.
 */
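/*
 * The message handled here, as delivered by keysock, carries a
 * keysock_out_err_t at mp->b_rptr and (if present) the undeliverable
 * sadb_msg_t in mp->b_cont.
 */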
static void
esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
{
        sadb_msg_t *samsg;
        keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;

        if (mp->b_cont == NULL) {
                freemsg(mp);
                return;
        }
        samsg = (sadb_msg_t *)mp->b_cont->b_rptr;

        /*
         * If keysock can't find any registered PF_KEY sockets, delete the
         * ACQUIRE record immediately and handle errors.
         */
        if (samsg->sadb_msg_type == SADB_ACQUIRE) {
                samsg->sadb_msg_errno = kse->ks_err_errno;
                samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
                /*
                 * Use the write side of the esp_pfkey_q.
                 */
                sadb_in_acquire(samsg, &espstack->esp_sadb,
                    WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
        }

        freemsg(mp);
}

/*
 * ESP module write put routine.
 */
static void
ipsecesp_wput(queue_t *q, mblk_t *mp)
{
        ipsec_info_t *ii;
        struct iocblk *iocp;
        ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;

        esp3dbg(espstack, ("In esp_wput().\n"));

        /* NOTE: Each case must take care of freeing or passing mp. */
        switch (mp->b_datap->db_type) {
        case M_CTL:
                if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
                        /* Message too small to hold an ipsec_info_t. */
                        freemsg(mp);
                        break;
                }
                ii = (ipsec_info_t *)mp->b_rptr;

                switch (ii->ipsec_info_type) {
                case KEYSOCK_OUT_ERR:
                        esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
                        esp_keysock_no_socket(mp, espstack);
                        break;
                case KEYSOCK_IN:
                        ESP_BUMP_STAT(espstack, keysock_in);
                        esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));

                        /* Parse the message. */
                        esp_parse_pfkey(mp, espstack);
                        break;
                case KEYSOCK_HELLO:
                        sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
                            esp_ager, (void *)espstack, &espstack->esp_event,
                            SADB_SATYPE_ESP);
                        break;
                default:
                        esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
                            ii->ipsec_info_type));
                        freemsg(mp);
                        break;
                }
                break;
        case M_IOCTL:
                iocp = (struct iocblk *)mp->b_rptr;
                switch (iocp->ioc_cmd) {
                case ND_SET:
                case ND_GET:
                        if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
                                qreply(q, mp);
                                return;
                        } else {
                                iocp->ioc_error = ENOENT;
                        }
                        /* FALLTHRU */
                default:
                        /*
                         * No other ioctls are supported; return EINVAL,
                         * or ENOENT if an ND variable was not found.
                         */
                        if (iocp->ioc_error != ENOENT)
                                iocp->ioc_error = EINVAL;
                        iocp->ioc_count = 0;
                        mp->b_datap->db_type = M_IOCACK;
                        qreply(q, mp);
                        return;
                }
        default:
                esp3dbg(espstack,
                    ("Got default message, type %d, passing to IP.\n",
                    mp->b_datap->db_type));
                putnext(q, mp);
        }
}

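/*
 * For reference: the ND_GET/ND_SET ioctls handled above are what back
 * the ndd(1M) administrative interface to this module's tunables.  A
 * typical invocation from userland looks like:
 *
 *      ndd -get /dev/ipsecesp ipsecesp_debug
 *      ndd -set /dev/ipsecesp ipsecesp_debug 1
 */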
/*
 * Wrapper to allow IP to trigger an ESP association failure message
 * during inbound SA selection.
 */
void
ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
    uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
{
        netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
        ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
        ipsec_stack_t   *ipss = ns->netstack_ipsec;

        if (espstack->ipsecesp_log_unknown_spi) {
                ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
                    addr, af, espstack->ipsecesp_netstack);
        }

        ip_drop_packet(mp, B_TRUE, ira->ira_ill,
            DROPPER(ipss, ipds_esp_no_sa),
            &espstack->esp_dropper);
}

/*
 * Initialize the ESP input and output processing functions.
 */
void
ipsecesp_init_funcs(ipsa_t *sa)
{
        if (sa->ipsa_output_func == NULL)
                sa->ipsa_output_func = esp_outbound;
        if (sa->ipsa_input_func == NULL)
                sa->ipsa_input_func = esp_inbound;
}
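/*
 * Note: esp_outbound() and esp_inbound() are the per-packet ESP
 * processing entry points; ipsecesp_init_funcs() simply wires a newly
 * created SA up to them when no functions have been assigned yet.
 */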