1 /*
   2  * CDDL HEADER START
   3  *
   4  * The contents of this file are subject to the terms of the
   5  * Common Development and Distribution License (the "License").
   6  * You may not use this file except in compliance with the License.
   7  *
   8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
   9  * or http://www.opensolaris.org/os/licensing.
  10  * See the License for the specific language governing permissions
  11  * and limitations under the License.
  12  *
  13  * When distributing Covered Code, include this CDDL HEADER in each
  14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
  15  * If applicable, add the following below this CDDL HEADER, with the
  16  * fields enclosed by brackets "[]" replaced with your own identifying
  17  * information: Portions Copyright [yyyy] [name of copyright owner]
  18  *
  19  * CDDL HEADER END
  20  */
  21 /*
  22  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
  23  * Use is subject to license terms.
  24  *
  25  * Copyright 2015 Garrett D'Amore <garrett@damore.org>
  26  */
  27 
  28 #include <sys/types.h>
  29 #include <sys/stream.h>
  30 #include <sys/stropts.h>
  31 #include <sys/errno.h>
  32 #include <sys/strlog.h>
  33 #include <sys/tihdr.h>
  34 #include <sys/socket.h>
  35 #include <sys/ddi.h>
  36 #include <sys/sunddi.h>
  37 #include <sys/kmem.h>
  38 #include <sys/zone.h>
  39 #include <sys/sysmacros.h>
  40 #include <sys/cmn_err.h>
  41 #include <sys/vtrace.h>
  42 #include <sys/debug.h>
  43 #include <sys/atomic.h>
  44 #include <sys/strsun.h>
  45 #include <sys/random.h>
  46 #include <netinet/in.h>
  47 #include <net/if.h>
  48 #include <netinet/ip6.h>
  49 #include <net/pfkeyv2.h>
  50 #include <net/pfpolicy.h>
  51 
  52 #include <inet/common.h>
  53 #include <inet/mi.h>
  54 #include <inet/nd.h>
  55 #include <inet/ip.h>
  56 #include <inet/ip_impl.h>
  57 #include <inet/ip6.h>
  58 #include <inet/ip_if.h>
  59 #include <inet/ip_ndp.h>
  60 #include <inet/sadb.h>
  61 #include <inet/ipsec_info.h>
  62 #include <inet/ipsec_impl.h>
  63 #include <inet/ipsecesp.h>
  64 #include <inet/ipdrop.h>
  65 #include <inet/tcp.h>
  66 #include <sys/kstat.h>
  67 #include <sys/policy.h>
  68 #include <sys/strsun.h>
  69 #include <sys/strsubr.h>
  70 #include <inet/udp_impl.h>
  71 #include <sys/taskq.h>
  72 #include <sys/note.h>
  73 
  74 #include <sys/tsol/tnet.h>
  75 
/*
 * Table of ND variables supported by ipsecesp.  These are loaded into
 * ipsecesp_g_nd by ipsecesp_param_register() at stack initialization time.
 * All of these are alterable, within the min/max values given, at run time.
 */
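/*
 * In practice these are examined and tuned with ndd(1M), e.g.
 * "ndd -get /dev/ipsecesp ipsecesp_debug" (illustrative invocation).
 */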
  81 static  ipsecespparam_t lcl_param_arr[] = {
  82         /* min  max                     value   name */
  83         { 0,    3,                      0,      "ipsecesp_debug"},
  84         { 125,  32000, SADB_AGE_INTERVAL_DEFAULT, "ipsecesp_age_interval"},
  85         { 1,    10,                     1,      "ipsecesp_reap_delay"},
  86         { 1,    SADB_MAX_REPLAY,        64,     "ipsecesp_replay_size"},
  87         { 1,    300,                    15,     "ipsecesp_acquire_timeout"},
  88         { 1,    1800,                   90,     "ipsecesp_larval_timeout"},
  89         /* Default lifetime values for ACQUIRE messages. */
  90         { 0,    0xffffffffU,    0,      "ipsecesp_default_soft_bytes"},
  91         { 0,    0xffffffffU,    0,      "ipsecesp_default_hard_bytes"},
  92         { 0,    0xffffffffU,    24000,  "ipsecesp_default_soft_addtime"},
  93         { 0,    0xffffffffU,    28800,  "ipsecesp_default_hard_addtime"},
  94         { 0,    0xffffffffU,    0,      "ipsecesp_default_soft_usetime"},
  95         { 0,    0xffffffffU,    0,      "ipsecesp_default_hard_usetime"},
  96         { 0,    1,              0,      "ipsecesp_log_unknown_spi"},
  97         { 0,    2,              1,      "ipsecesp_padding_check"},
  98         { 0,    600,            20,     "ipsecesp_nat_keepalive_interval"},
  99 };
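/*
 * The shorthand #defines below index the per-stack copy of this array
 * (ipsecesp_params) by position, so keep their order in sync with the
 * entries above.
 */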
 100 #define ipsecesp_debug  ipsecesp_params[0].ipsecesp_param_value
 101 #define ipsecesp_age_interval ipsecesp_params[1].ipsecesp_param_value
 102 #define ipsecesp_age_int_max    ipsecesp_params[1].ipsecesp_param_max
 103 #define ipsecesp_reap_delay     ipsecesp_params[2].ipsecesp_param_value
 104 #define ipsecesp_replay_size    ipsecesp_params[3].ipsecesp_param_value
 105 #define ipsecesp_acquire_timeout        \
 106         ipsecesp_params[4].ipsecesp_param_value
 107 #define ipsecesp_larval_timeout \
 108         ipsecesp_params[5].ipsecesp_param_value
 109 #define ipsecesp_default_soft_bytes     \
 110         ipsecesp_params[6].ipsecesp_param_value
 111 #define ipsecesp_default_hard_bytes     \
 112         ipsecesp_params[7].ipsecesp_param_value
 113 #define ipsecesp_default_soft_addtime   \
 114         ipsecesp_params[8].ipsecesp_param_value
 115 #define ipsecesp_default_hard_addtime   \
 116         ipsecesp_params[9].ipsecesp_param_value
 117 #define ipsecesp_default_soft_usetime   \
 118         ipsecesp_params[10].ipsecesp_param_value
 119 #define ipsecesp_default_hard_usetime   \
 120         ipsecesp_params[11].ipsecesp_param_value
 121 #define ipsecesp_log_unknown_spi        \
 122         ipsecesp_params[12].ipsecesp_param_value
 123 #define ipsecesp_padding_check  \
 124         ipsecesp_params[13].ipsecesp_param_value
 125 /* For ipsecesp_nat_keepalive_interval, see ipsecesp.h. */
 126 
 127 #define esp0dbg(a)      printf a
 128 /* NOTE:  != 0 instead of > 0 so lint doesn't complain. */
 129 #define esp1dbg(espstack, a)    if (espstack->ipsecesp_debug != 0) printf a
 130 #define esp2dbg(espstack, a)    if (espstack->ipsecesp_debug > 1) printf a
 131 #define esp3dbg(espstack, a)    if (espstack->ipsecesp_debug > 2) printf a
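/*
 * Note that the debug macros take their printf() arguments as a single
 * parenthesized list, e.g.:
 *
 *	esp1dbg(espstack, ("bad padlen %d\n", padlen));
 */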
 132 
 133 static int ipsecesp_open(queue_t *, dev_t *, int, int, cred_t *);
 134 static int ipsecesp_close(queue_t *);
 135 static void ipsecesp_wput(queue_t *, mblk_t *);
 136 static void     *ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns);
 137 static void     ipsecesp_stack_fini(netstackid_t stackid, void *arg);
 138 static void esp_send_acquire(ipsacq_t *, mblk_t *, netstack_t *);
 139 
 140 static void esp_prepare_udp(netstack_t *, mblk_t *, ipha_t *);
 141 static void esp_outbound_finish(mblk_t *, ip_xmit_attr_t *);
 142 static void esp_inbound_restart(mblk_t *, ip_recv_attr_t *);
 143 
 144 static boolean_t esp_register_out(uint32_t, uint32_t, uint_t,
 145     ipsecesp_stack_t *, cred_t *);
 146 static boolean_t esp_strip_header(mblk_t *, boolean_t, uint32_t,
 147     kstat_named_t **, ipsecesp_stack_t *);
 148 static mblk_t *esp_submit_req_inbound(mblk_t *, ip_recv_attr_t *,
 149     ipsa_t *, uint_t);
 150 static mblk_t *esp_submit_req_outbound(mblk_t *, ip_xmit_attr_t *,
 151     ipsa_t *, uchar_t *, uint_t);
 152 
/* Settable in /etc/system */
 154 uint32_t esp_hash_size = IPSEC_DEFAULT_HASH_SIZE;
 155 
 156 static struct module_info info = {
 157         5137, "ipsecesp", 0, INFPSZ, 65536, 1024
 158 };
 159 
 160 static struct qinit rinit = {
 161         (pfi_t)putnext, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
 162         NULL
 163 };
 164 
 165 static struct qinit winit = {
 166         (pfi_t)ipsecesp_wput, NULL, ipsecesp_open, ipsecesp_close, NULL, &info,
 167         NULL
 168 };
 169 
 170 struct streamtab ipsecespinfo = {
 171         &rinit, &winit, NULL, NULL
 172 };
 173 
 174 static taskq_t *esp_taskq;
 175 
/*
 * OTOH, this one is set at open/close, and I'm D_MTQPAIR for now.
 *
 * Question:    Do I need this, given that all instances' esps->esps_wq point
 *              to IP?
 *
 * Answer:      Yes, because I need to know which queue is BOUND to
 *              IPPROTO_ESP.
 */
 185 
 186 /*
 187  * Stats.  This may eventually become a full-blown SNMP MIB once that spec
 188  * stabilizes.
 189  */
 190 
 191 typedef struct esp_kstats_s {
 192         kstat_named_t esp_stat_num_aalgs;
 193         kstat_named_t esp_stat_good_auth;
 194         kstat_named_t esp_stat_bad_auth;
 195         kstat_named_t esp_stat_bad_padding;
 196         kstat_named_t esp_stat_replay_failures;
 197         kstat_named_t esp_stat_replay_early_failures;
 198         kstat_named_t esp_stat_keysock_in;
 199         kstat_named_t esp_stat_out_requests;
 200         kstat_named_t esp_stat_acquire_requests;
 201         kstat_named_t esp_stat_bytes_expired;
 202         kstat_named_t esp_stat_out_discards;
 203         kstat_named_t esp_stat_crypto_sync;
 204         kstat_named_t esp_stat_crypto_async;
 205         kstat_named_t esp_stat_crypto_failures;
 206         kstat_named_t esp_stat_num_ealgs;
 207         kstat_named_t esp_stat_bad_decrypt;
 208         kstat_named_t esp_stat_sa_port_renumbers;
 209 } esp_kstats_t;
 210 
/*
 * espstack->esp_kstats equals espstack->esp_ksp->ks_data if
 * kstat_create_netstack for espstack->esp_ksp succeeds; when it
 * fails, esp_kstats is NULL.  Note this is done for all stack
 * instances, so it *could* fail; hence the non-NULL check in
 * ESP_BUMP_STAT and ESP_DEBUMP_STAT.
 */
 218 #define ESP_BUMP_STAT(espstack, x)                                      \
 219 do {                                                                    \
 220         if (espstack->esp_kstats != NULL)                            \
 221                 (espstack->esp_kstats->esp_stat_ ## x).value.ui64++;      \
 222 _NOTE(CONSTCOND)                                                        \
 223 } while (0)
 224 
 225 #define ESP_DEBUMP_STAT(espstack, x)                                    \
 226 do {                                                                    \
 227         if (espstack->esp_kstats != NULL)                            \
 228                 (espstack->esp_kstats->esp_stat_ ## x).value.ui64--;      \
 229 _NOTE(CONSTCOND)                                                        \
 230 } while (0)
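/*
 * Example:  ESP_BUMP_STAT(espstack, good_auth) increments
 * esp_kstats->esp_stat_good_auth if the kstats were created, and is a
 * no-op otherwise.
 */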
 231 
 232 static int      esp_kstat_update(kstat_t *, int);
 233 
 234 static boolean_t
 235 esp_kstat_init(ipsecesp_stack_t *espstack, netstackid_t stackid)
 236 {
 237         espstack->esp_ksp = kstat_create_netstack("ipsecesp", 0, "esp_stat",
 238             "net", KSTAT_TYPE_NAMED,
 239             sizeof (esp_kstats_t) / sizeof (kstat_named_t),
 240             KSTAT_FLAG_PERSISTENT, stackid);
 241 
 242         if (espstack->esp_ksp == NULL || espstack->esp_ksp->ks_data == NULL)
 243                 return (B_FALSE);
 244 
 245         espstack->esp_kstats = espstack->esp_ksp->ks_data;
 246 
 247         espstack->esp_ksp->ks_update = esp_kstat_update;
 248         espstack->esp_ksp->ks_private = (void *)(uintptr_t)stackid;
 249 
 250 #define K64 KSTAT_DATA_UINT64
 251 #define KI(x) kstat_named_init(&(espstack->esp_kstats->esp_stat_##x), #x, K64)
 252 
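        /*
         * Each KI(x) below expands to
         * kstat_named_init(&espstack->esp_kstats->esp_stat_x, "x", K64),
         * so the kstat's name matches the bare statistic name.
         */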
 253         KI(num_aalgs);
 254         KI(num_ealgs);
 255         KI(good_auth);
 256         KI(bad_auth);
 257         KI(bad_padding);
 258         KI(replay_failures);
 259         KI(replay_early_failures);
 260         KI(keysock_in);
 261         KI(out_requests);
 262         KI(acquire_requests);
 263         KI(bytes_expired);
 264         KI(out_discards);
 265         KI(crypto_sync);
 266         KI(crypto_async);
 267         KI(crypto_failures);
 268         KI(bad_decrypt);
 269         KI(sa_port_renumbers);
 270 
 271 #undef KI
 272 #undef K64
 273 
 274         kstat_install(espstack->esp_ksp);
 275 
 276         return (B_TRUE);
 277 }
 278 
 279 static int
 280 esp_kstat_update(kstat_t *kp, int rw)
 281 {
 282         esp_kstats_t *ekp;
 283         netstackid_t    stackid = (zoneid_t)(uintptr_t)kp->ks_private;
 284         netstack_t      *ns;
 285         ipsec_stack_t   *ipss;
 286 
 287         if ((kp == NULL) || (kp->ks_data == NULL))
 288                 return (EIO);
 289 
 290         if (rw == KSTAT_WRITE)
 291                 return (EACCES);
 292 
 293         ns = netstack_find_by_stackid(stackid);
 294         if (ns == NULL)
 295                 return (-1);
 296         ipss = ns->netstack_ipsec;
 297         if (ipss == NULL) {
 298                 netstack_rele(ns);
 299                 return (-1);
 300         }
 301         ekp = (esp_kstats_t *)kp->ks_data;
 302 
 303         mutex_enter(&ipss->ipsec_alg_lock);
 304         ekp->esp_stat_num_aalgs.value.ui64 =
 305             ipss->ipsec_nalgs[IPSEC_ALG_AUTH];
 306         ekp->esp_stat_num_ealgs.value.ui64 =
 307             ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
 308         mutex_exit(&ipss->ipsec_alg_lock);
 309 
 310         netstack_rele(ns);
 311         return (0);
 312 }
 313 
 314 #ifdef DEBUG
 315 /*
 316  * Debug routine, useful to see pre-encryption data.
 317  */
 318 static char *
 319 dump_msg(mblk_t *mp)
 320 {
 321         char tmp_str[3], tmp_line[256];
 322 
 323         while (mp != NULL) {
 324                 unsigned char *ptr;
 325 
 326                 printf("mblk address 0x%p, length %ld, db_ref %d "
 327                     "type %d, base 0x%p, lim 0x%p\n",
 328                     (void *) mp, (long)(mp->b_wptr - mp->b_rptr),
 329                     mp->b_datap->db_ref, mp->b_datap->db_type,
 330                     (void *)mp->b_datap->db_base, (void *)mp->b_datap->db_lim);
 331                 ptr = mp->b_rptr;
 332 
 333                 tmp_line[0] = '\0';
 334                 while (ptr < mp->b_wptr) {
 335                         uint_t diff;
 336 
 337                         diff = (ptr - mp->b_rptr);
 338                         if (!(diff & 0x1f)) {
 339                                 if (strlen(tmp_line) > 0) {
 340                                         printf("bytes: %s\n", tmp_line);
 341                                         tmp_line[0] = '\0';
 342                                 }
 343                         }
 344                         if (!(diff & 0x3))
 345                                 (void) strcat(tmp_line, " ");
 346                         (void) sprintf(tmp_str, "%02x", *ptr);
 347                         (void) strcat(tmp_line, tmp_str);
 348                         ptr++;
 349                 }
 350                 if (strlen(tmp_line) > 0)
 351                         printf("bytes: %s\n", tmp_line);
 352 
 353                 mp = mp->b_cont;
 354         }
 355 
 356         return ("\n");
 357 }
 358 
 359 #else /* DEBUG */
 360 static char *
 361 dump_msg(mblk_t *mp)
 362 {
 363         printf("Find value of mp %p.\n", mp);
 364         return ("\n");
 365 }
 366 #endif /* DEBUG */
 367 
/*
 * We don't have to lock age_interval, as only one thread will access it
 * at a time; I control the one function that does so via timeout().
 */
 372 static void
 373 esp_ager(void *arg)
 374 {
 375         ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
 376         netstack_t      *ns = espstack->ipsecesp_netstack;
 377         hrtime_t begin = gethrtime();
 378 
 379         sadb_ager(&espstack->esp_sadb.s_v4, espstack->esp_pfkey_q,
 380             espstack->ipsecesp_reap_delay, ns);
 381         sadb_ager(&espstack->esp_sadb.s_v6, espstack->esp_pfkey_q,
 382             espstack->ipsecesp_reap_delay, ns);
 383 
 384         espstack->esp_event = sadb_retimeout(begin, espstack->esp_pfkey_q,
 385             esp_ager, espstack,
 386             &espstack->ipsecesp_age_interval, espstack->ipsecesp_age_int_max,
 387             info.mi_idnum);
 388 }
 389 
 390 /*
 391  * Get an ESP NDD parameter.
 392  */
 393 /* ARGSUSED */
static int
ipsecesp_param_get(queue_t *q, mblk_t *mp, caddr_t cp, cred_t *cr)
{
 401         ipsecespparam_t *ipsecesppa = (ipsecespparam_t *)cp;
 402         uint_t value;
 403         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
 404 
 405         mutex_enter(&espstack->ipsecesp_param_lock);
 406         value = ipsecesppa->ipsecesp_param_value;
 407         mutex_exit(&espstack->ipsecesp_param_lock);
 408 
 409         (void) mi_mpprintf(mp, "%u", value);
 410         return (0);
 411 }
 412 
 413 /*
 * This routine sets an NDD variable in an ipsecespparam_t structure.
 415  */
 416 /* ARGSUSED */
static int
ipsecesp_param_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
 425         ulong_t new_value;
 426         ipsecespparam_t *ipsecesppa = (ipsecespparam_t *)cp;
 427         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
 428 
 429         /*
 430          * Fail the request if the new value does not lie within the
 431          * required bounds.
 432          */
 433         if (ddi_strtoul(value, NULL, 10, &new_value) != 0 ||
 434             new_value < ipsecesppa->ipsecesp_param_min ||
 435             new_value > ipsecesppa->ipsecesp_param_max) {
 436                 return (EINVAL);
 437         }
 438 
 439         /* Set the new value */
 440         mutex_enter(&espstack->ipsecesp_param_lock);
 441         ipsecesppa->ipsecesp_param_value = new_value;
 442         mutex_exit(&espstack->ipsecesp_param_lock);
 443         return (0);
 444 }
 445 
 446 /*
 447  * Using lifetime NDD variables, fill in an extended combination's
 448  * lifetime information.
 449  */
 450 void
 451 ipsecesp_fill_defs(sadb_x_ecomb_t *ecomb, netstack_t *ns)
 452 {
 453         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
 454 
 455         ecomb->sadb_x_ecomb_soft_bytes = espstack->ipsecesp_default_soft_bytes;
 456         ecomb->sadb_x_ecomb_hard_bytes = espstack->ipsecesp_default_hard_bytes;
 457         ecomb->sadb_x_ecomb_soft_addtime =
 458             espstack->ipsecesp_default_soft_addtime;
 459         ecomb->sadb_x_ecomb_hard_addtime =
 460             espstack->ipsecesp_default_hard_addtime;
 461         ecomb->sadb_x_ecomb_soft_usetime =
 462             espstack->ipsecesp_default_soft_usetime;
 463         ecomb->sadb_x_ecomb_hard_usetime =
 464             espstack->ipsecesp_default_hard_usetime;
 465 }
 466 
 467 /*
 468  * Initialize things for ESP at module load time.
 469  */
 470 boolean_t
 471 ipsecesp_ddi_init(void)
 472 {
 473         esp_taskq = taskq_create("esp_taskq", 1, minclsyspri,
 474             IPSEC_TASKQ_MIN, IPSEC_TASKQ_MAX, 0);
 475 
 476         /*
 477          * We want to be informed each time a stack is created or
 478          * destroyed in the kernel, so we can maintain the
 479          * set of ipsecesp_stack_t's.
 480          */
 481         netstack_register(NS_IPSECESP, ipsecesp_stack_init, NULL,
 482             ipsecesp_stack_fini);
 483 
 484         return (B_TRUE);
 485 }
 486 
/*
 * Walk through the specified param array, registering each element with
 * the named dispatch handler.
 */
 491 static boolean_t
 492 ipsecesp_param_register(IDP *ndp, ipsecespparam_t *espp, int cnt)
 493 {
 494         for (; cnt-- > 0; espp++) {
 495                 if (espp->ipsecesp_param_name != NULL &&
 496                     espp->ipsecesp_param_name[0]) {
 497                         if (!nd_load(ndp,
 498                             espp->ipsecesp_param_name,
 499                             ipsecesp_param_get, ipsecesp_param_set,
 500                             (caddr_t)espp)) {
 501                                 nd_free(ndp);
 502                                 return (B_FALSE);
 503                         }
 504                 }
 505         }
 506         return (B_TRUE);
 507 }
 508 /*
 509  * Initialize things for ESP for each stack instance
 510  */
 511 static void *
 512 ipsecesp_stack_init(netstackid_t stackid, netstack_t *ns)
 513 {
 514         ipsecesp_stack_t        *espstack;
 515         ipsecespparam_t         *espp;
 516 
 517         espstack = (ipsecesp_stack_t *)kmem_zalloc(sizeof (*espstack),
 518             KM_SLEEP);
 519         espstack->ipsecesp_netstack = ns;
 520 
 521         espp = (ipsecespparam_t *)kmem_alloc(sizeof (lcl_param_arr), KM_SLEEP);
 522         espstack->ipsecesp_params = espp;
 523         bcopy(lcl_param_arr, espp, sizeof (lcl_param_arr));
 524 
 525         (void) ipsecesp_param_register(&espstack->ipsecesp_g_nd, espp,
 526             A_CNT(lcl_param_arr));
 527 
 528         (void) esp_kstat_init(espstack, stackid);
 529 
 530         espstack->esp_sadb.s_acquire_timeout =
 531             &espstack->ipsecesp_acquire_timeout;
 532         espstack->esp_sadb.s_acqfn = esp_send_acquire;
 533         sadbp_init("ESP", &espstack->esp_sadb, SADB_SATYPE_ESP, esp_hash_size,
 534             espstack->ipsecesp_netstack);
 535 
 536         mutex_init(&espstack->ipsecesp_param_lock, NULL, MUTEX_DEFAULT, 0);
 537 
 538         ip_drop_register(&espstack->esp_dropper, "IPsec ESP");
 539         return (espstack);
 540 }
 541 
 542 /*
 543  * Destroy things for ESP at module unload time.
 544  */
 545 void
 546 ipsecesp_ddi_destroy(void)
 547 {
 548         netstack_unregister(NS_IPSECESP);
 549         taskq_destroy(esp_taskq);
 550 }
 551 
 552 /*
 553  * Destroy things for ESP for one stack instance
 554  */
 555 static void
 556 ipsecesp_stack_fini(netstackid_t stackid, void *arg)
 557 {
 558         ipsecesp_stack_t *espstack = (ipsecesp_stack_t *)arg;
 559 
 560         if (espstack->esp_pfkey_q != NULL) {
 561                 (void) quntimeout(espstack->esp_pfkey_q, espstack->esp_event);
 562         }
 563         espstack->esp_sadb.s_acqfn = NULL;
 564         espstack->esp_sadb.s_acquire_timeout = NULL;
 565         sadbp_destroy(&espstack->esp_sadb, espstack->ipsecesp_netstack);
 566         ip_drop_unregister(&espstack->esp_dropper);
 567         mutex_destroy(&espstack->ipsecesp_param_lock);
 568         nd_free(&espstack->ipsecesp_g_nd);
 569 
 570         kmem_free(espstack->ipsecesp_params, sizeof (lcl_param_arr));
 571         espstack->ipsecesp_params = NULL;
 572         kstat_delete_netstack(espstack->esp_ksp, stackid);
 573         espstack->esp_ksp = NULL;
 574         espstack->esp_kstats = NULL;
 575         kmem_free(espstack, sizeof (*espstack));
 576 }
 577 
 578 /*
 579  * ESP module open routine, which is here for keysock plumbing.
 580  * Keysock is pushed over {AH,ESP} which is an artifact from the Bad Old
 581  * Days of export control, and fears that ESP would not be allowed
 582  * to be shipped at all by default.  Eventually, keysock should
 583  * either access AH and ESP via modstubs or krtld dependencies, or
 584  * perhaps be folded in with AH and ESP into a single IPsec/netsec
 585  * module ("netsec" if PF_KEY provides more than AH/ESP keying tables).
 586  */
 587 /* ARGSUSED */
 588 static int
 589 ipsecesp_open(queue_t *q, dev_t *devp, int flag, int sflag, cred_t *credp)
 590 {
 591         netstack_t              *ns;
 592         ipsecesp_stack_t        *espstack;
 593 
 594         if (secpolicy_ip_config(credp, B_FALSE) != 0)
 595                 return (EPERM);
 596 
 597         if (q->q_ptr != NULL)
 598                 return (0);  /* Re-open of an already open instance. */
 599 
 600         if (sflag != MODOPEN)
 601                 return (EINVAL);
 602 
 603         ns = netstack_find_by_cred(credp);
 604         ASSERT(ns != NULL);
 605         espstack = ns->netstack_ipsecesp;
 606         ASSERT(espstack != NULL);
 607 
 608         q->q_ptr = espstack;
 609         WR(q)->q_ptr = q->q_ptr;
 610 
 611         qprocson(q);
 612         return (0);
 613 }
 614 
 615 /*
 616  * ESP module close routine.
 617  */
 618 static int
 619 ipsecesp_close(queue_t *q)
 620 {
 621         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
 622 
 623         /*
 624          * Clean up q_ptr, if needed.
 625          */
 626         qprocsoff(q);
 627 
 628         /* Keysock queue check is safe, because of OCEXCL perimeter. */
 629 
 630         if (q == espstack->esp_pfkey_q) {
 631                 esp1dbg(espstack,
 632                     ("ipsecesp_close:  Ummm... keysock is closing ESP.\n"));
 633                 espstack->esp_pfkey_q = NULL;
 634                 /* Detach qtimeouts. */
 635                 (void) quntimeout(q, espstack->esp_event);
 636         }
 637 
 638         netstack_rele(espstack->ipsecesp_netstack);
 639         return (0);
 640 }
 641 
 642 /*
 643  * Add a number of bytes to what the SA has protected so far.  Return
 644  * B_TRUE if the SA can still protect that many bytes.
 645  *
 646  * Caller must REFRELE the passed-in assoc.  This function must REFRELE
 647  * any obtained peer SA.
 648  */
 649 static boolean_t
 650 esp_age_bytes(ipsa_t *assoc, uint64_t bytes, boolean_t inbound)
 651 {
 652         ipsa_t *inassoc, *outassoc;
 653         isaf_t *bucket;
 654         boolean_t inrc, outrc, isv6;
 655         sadb_t *sp;
 656         int outhash;
 657         netstack_t              *ns = assoc->ipsa_netstack;
 658         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
 659 
 660         /* No peer?  No problem! */
 661         if (!assoc->ipsa_haspeer) {
 662                 return (sadb_age_bytes(espstack->esp_pfkey_q, assoc, bytes,
 663                     B_TRUE));
 664         }
 665 
        /*
         * Otherwise, we want to grab both the original assoc and its peer.
         * There might be a race for this, but if it's a real race, two
         * expire messages may occur.  We limit this by only sending the
         * expire message on one of the peers; we pick the inbound one
         * arbitrarily.
         *
         * If we need tight synchronization on the peer SA, then we need to
         * reconsider.
         */
 676 
 677         /* Use address length to select IPv6/IPv4 */
 678         isv6 = (assoc->ipsa_addrfam == AF_INET6);
 679         sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
 680 
 681         if (inbound) {
 682                 inassoc = assoc;
 683                 if (isv6) {
 684                         outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
 685                             &inassoc->ipsa_dstaddr));
 686                 } else {
 687                         outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
 688                             &inassoc->ipsa_dstaddr));
 689                 }
 690                 bucket = &sp->sdb_of[outhash];
 691                 mutex_enter(&bucket->isaf_lock);
 692                 outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
 693                     inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
 694                     inassoc->ipsa_addrfam);
 695                 mutex_exit(&bucket->isaf_lock);
 696                 if (outassoc == NULL) {
 697                         /* Q: Do we wish to set haspeer == B_FALSE? */
 698                         esp0dbg(("esp_age_bytes: "
 699                             "can't find peer for inbound.\n"));
 700                         return (sadb_age_bytes(espstack->esp_pfkey_q, inassoc,
 701                             bytes, B_TRUE));
 702                 }
 703         } else {
 704                 outassoc = assoc;
 705                 bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
 706                 mutex_enter(&bucket->isaf_lock);
 707                 inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
 708                     outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
 709                     outassoc->ipsa_addrfam);
 710                 mutex_exit(&bucket->isaf_lock);
 711                 if (inassoc == NULL) {
 712                         /* Q: Do we wish to set haspeer == B_FALSE? */
 713                         esp0dbg(("esp_age_bytes: "
 714                             "can't find peer for outbound.\n"));
 715                         return (sadb_age_bytes(espstack->esp_pfkey_q, outassoc,
 716                             bytes, B_TRUE));
 717                 }
 718         }
 719 
 720         inrc = sadb_age_bytes(espstack->esp_pfkey_q, inassoc, bytes, B_TRUE);
 721         outrc = sadb_age_bytes(espstack->esp_pfkey_q, outassoc, bytes, B_FALSE);
 722 
 723         /*
 724          * REFRELE any peer SA.
 725          *
 726          * Because of the multi-line macro nature of IPSA_REFRELE, keep
 727          * them in { }.
 728          */
 729         if (inbound) {
 730                 IPSA_REFRELE(outassoc);
 731         } else {
 732                 IPSA_REFRELE(inassoc);
 733         }
 734 
 735         return (inrc && outrc);
 736 }
 737 
/*
 * Do incoming NAT-T checksum manipulations for the packet.
 * Returns the adjusted data_mp.
 */
 742 static mblk_t *
 743 esp_fix_natt_checksums(mblk_t *data_mp, ipsa_t *assoc)
 744 {
 745         ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
 746         tcpha_t *tcpha;
 747         udpha_t *udpha;
 748         /* Initialize to our inbound cksum adjustment... */
 749         uint32_t sum = assoc->ipsa_inbound_cksum;
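        /*
         * (ipsa_inbound_cksum is taken here to be the precomputed ones-
         * complement delta compensating for the NAT device rewriting the
         * outer addresses/ports; presumably it is filled in when the SA's
         * NAT-T addresses are configured.)
         */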
 750 
 751         switch (ipha->ipha_protocol) {
 752         case IPPROTO_TCP:
 753                 tcpha = (tcpha_t *)(data_mp->b_rptr +
 754                     IPH_HDR_LENGTH(ipha));
 755 
 756 #define DOWN_SUM(x) (x) = ((x) & 0xFFFF) +   ((x) >> 16)
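/*
 * DOWN_SUM folds the carry out of the upper 16 bits back into the low
 * 16 bits, as ones-complement checksum arithmetic requires; applying it
 * twice guarantees the sum fits in 16 bits.
 */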
 757                 sum += ~ntohs(tcpha->tha_sum) & 0xFFFF;
 758                 DOWN_SUM(sum);
 759                 DOWN_SUM(sum);
 760                 tcpha->tha_sum = ~htons(sum);
 761                 break;
 762         case IPPROTO_UDP:
 763                 udpha = (udpha_t *)(data_mp->b_rptr + IPH_HDR_LENGTH(ipha));
 764 
 765                 if (udpha->uha_checksum != 0) {
                        /* Adjust if the inbound one was not zero. */
 767                         sum += ~ntohs(udpha->uha_checksum) & 0xFFFF;
 768                         DOWN_SUM(sum);
 769                         DOWN_SUM(sum);
 770                         udpha->uha_checksum = ~htons(sum);
 771                         if (udpha->uha_checksum == 0)
 772                                 udpha->uha_checksum = 0xFFFF;
 773                 }
 774 #undef DOWN_SUM
 775                 break;
 776         case IPPROTO_IP:
 777                 /*
 778                  * This case is only an issue for self-encapsulated
 779                  * packets.  So for now, fall through.
 780                  */
 781                 break;
 782         }
 783         return (data_mp);
 784 }
 785 
 786 
 787 /*
 788  * Strip ESP header, check padding, and fix IP header.
 * Returns B_TRUE on success, B_FALSE if an error occurred.
 790  */
 791 static boolean_t
 792 esp_strip_header(mblk_t *data_mp, boolean_t isv4, uint32_t ivlen,
 793     kstat_named_t **counter, ipsecesp_stack_t *espstack)
 794 {
 795         ipha_t *ipha;
 796         ip6_t *ip6h;
 797         uint_t divpoint;
 798         mblk_t *scratch;
 799         uint8_t nexthdr, padlen;
 800         uint8_t lastpad;
 801         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
 802         uint8_t *lastbyte;
 803 
 804         /*
 805          * Strip ESP data and fix IP header.
 806          *
 807          * XXX In case the beginning of esp_inbound() changes to not do a
 808          * pullup, this part of the code can remain unchanged.
 809          */
 810         if (isv4) {
 811                 ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ipha_t));
 812                 ipha = (ipha_t *)data_mp->b_rptr;
 813                 ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (esph_t) +
 814                     IPH_HDR_LENGTH(ipha));
 815                 divpoint = IPH_HDR_LENGTH(ipha);
 816         } else {
 817                 ASSERT((data_mp->b_wptr - data_mp->b_rptr) >= sizeof (ip6_t));
 818                 ip6h = (ip6_t *)data_mp->b_rptr;
 819                 divpoint = ip_hdr_length_v6(data_mp, ip6h);
 820         }
 821 
 822         scratch = data_mp;
 823         while (scratch->b_cont != NULL)
 824                 scratch = scratch->b_cont;
 825 
 826         ASSERT((scratch->b_wptr - scratch->b_rptr) >= 3);
 827 
 828         /*
 829          * "Next header" and padding length are the last two bytes in the
 830          * ESP-protected datagram, thus the explicit - 1 and - 2.
 831          * lastpad is the last byte of the padding, which can be used for
 832          * a quick check to see if the padding is correct.
 833          */
 834         lastbyte = scratch->b_wptr - 1;
 835         nexthdr = *lastbyte--;
 836         padlen = *lastbyte--;
 837 
 838         if (isv4) {
 839                 /* Fix part of the IP header. */
 840                 ipha->ipha_protocol = nexthdr;
 841                 /*
 842                  * Reality check the padlen.  The explicit - 2 is for the
 843                  * padding length and the next-header bytes.
 844                  */
 845                 if (padlen >= ntohs(ipha->ipha_length) - sizeof (ipha_t) - 2 -
 846                     sizeof (esph_t) - ivlen) {
 847                         ESP_BUMP_STAT(espstack, bad_decrypt);
 848                         ipsec_rl_strlog(espstack->ipsecesp_netstack,
 849                             info.mi_idnum, 0, 0,
 850                             SL_ERROR | SL_WARN,
 851                             "Corrupt ESP packet (padlen too big).\n");
 852                         esp1dbg(espstack, ("padlen (%d) is greater than:\n",
 853                             padlen));
 854                         esp1dbg(espstack, ("pkt len(%d) - ip hdr - esp "
 855                             "hdr - ivlen(%d) = %d.\n",
 856                             ntohs(ipha->ipha_length), ivlen,
 857                             (int)(ntohs(ipha->ipha_length) - sizeof (ipha_t) -
 858                             2 - sizeof (esph_t) - ivlen)));
 859                         *counter = DROPPER(ipss, ipds_esp_bad_padlen);
 860                         return (B_FALSE);
 861                 }
 862 
 863                 /*
 864                  * Fix the rest of the header.  The explicit - 2 is for the
 865                  * padding length and the next-header bytes.
 866                  */
 867                 ipha->ipha_length = htons(ntohs(ipha->ipha_length) - padlen -
 868                     2 - sizeof (esph_t) - ivlen);
 869                 ipha->ipha_hdr_checksum = 0;
 870                 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
 871         } else {
 872                 if (ip6h->ip6_nxt == IPPROTO_ESP) {
 873                         ip6h->ip6_nxt = nexthdr;
 874                 } else {
 875                         ip_pkt_t ipp;
 876 
 877                         bzero(&ipp, sizeof (ipp));
 878                         (void) ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp,
 879                             NULL);
 880                         if (ipp.ipp_dstopts != NULL) {
 881                                 ipp.ipp_dstopts->ip6d_nxt = nexthdr;
 882                         } else if (ipp.ipp_rthdr != NULL) {
 883                                 ipp.ipp_rthdr->ip6r_nxt = nexthdr;
 884                         } else if (ipp.ipp_hopopts != NULL) {
 885                                 ipp.ipp_hopopts->ip6h_nxt = nexthdr;
 886                         } else {
 887                                 /* Panic a DEBUG kernel. */
 888                                 ASSERT(ipp.ipp_hopopts != NULL);
 889                                 /* Otherwise, pretend it's IP + ESP. */
 890                                 cmn_err(CE_WARN, "ESP IPv6 headers wrong.\n");
 891                                 ip6h->ip6_nxt = nexthdr;
 892                         }
 893                 }
 894 
 895                 if (padlen >= ntohs(ip6h->ip6_plen) - 2 - sizeof (esph_t) -
 896                     ivlen) {
 897                         ESP_BUMP_STAT(espstack, bad_decrypt);
 898                         ipsec_rl_strlog(espstack->ipsecesp_netstack,
 899                             info.mi_idnum, 0, 0,
 900                             SL_ERROR | SL_WARN,
 901                             "Corrupt ESP packet (v6 padlen too big).\n");
 902                         esp1dbg(espstack, ("padlen (%d) is greater than:\n",
 903                             padlen));
 904                         esp1dbg(espstack,
 905                             ("pkt len(%u) - ip hdr - esp hdr - ivlen(%d) = "
 906                             "%u.\n", (unsigned)(ntohs(ip6h->ip6_plen)
 907                             + sizeof (ip6_t)), ivlen,
 908                             (unsigned)(ntohs(ip6h->ip6_plen) - 2 -
 909                             sizeof (esph_t) - ivlen)));
 910                         *counter = DROPPER(ipss, ipds_esp_bad_padlen);
 911                         return (B_FALSE);
 912                 }
 913 
 914 
 915                 /*
 916                  * Fix the rest of the header.  The explicit - 2 is for the
 917                  * padding length and the next-header bytes.  IPv6 is nice,
 918                  * because there's no hdr checksum!
 919                  */
 920                 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) - padlen -
 921                     2 - sizeof (esph_t) - ivlen);
 922         }
 923 
 924         if (espstack->ipsecesp_padding_check > 0 && padlen > 0) {
                /*
                 * Weak padding check: compare the last pad byte to the pad
                 * length; they should be equal.
                 */
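                /*
                 * With standard ESP padding the pad bytes are 1, 2, 3, ...,
                 * so, for example, a padlen of 4 means the padding is
                 * 0x01 0x02 0x03 0x04 and the last pad byte equals padlen.
                 */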
 929                 lastpad = *lastbyte--;
 930 
 931                 if (padlen != lastpad) {
 932                         ipsec_rl_strlog(espstack->ipsecesp_netstack,
 933                             info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
 934                             "Corrupt ESP packet (lastpad != padlen).\n");
 935                         esp1dbg(espstack,
 936                             ("lastpad (%d) not equal to padlen (%d):\n",
 937                             lastpad, padlen));
 938                         ESP_BUMP_STAT(espstack, bad_padding);
 939                         *counter = DROPPER(ipss, ipds_esp_bad_padding);
 940                         return (B_FALSE);
 941                 }
 942 
                /*
                 * Strong padding check: check all pad bytes to see that
                 * they're ascending.  Go backwards using a descending counter
                 * to verify.  padlen == 1 is checked by the previous block,
                 * so only bother if we have more than 1 byte of padding.
                 * Consequently, start the check one byte before the location
                 * of "lastpad".
                 */
 951                 if (espstack->ipsecesp_padding_check > 1) {
 952                         /*
 953                          * This assert may have to become an if and a pullup
 954                          * if we start accepting multi-dblk mblks. For now,
 955                          * though, any packet here will have been pulled up in
 956                          * esp_inbound.
 957                          */
 958                         ASSERT(MBLKL(scratch) >= lastpad + 3);
 959 
 960                         /*
 961                          * Use "--lastpad" because we already checked the very
 962                          * last pad byte previously.
 963                          */
 964                         while (--lastpad != 0) {
 965                                 if (lastpad != *lastbyte) {
 966                                         ipsec_rl_strlog(
 967                                             espstack->ipsecesp_netstack,
 968                                             info.mi_idnum, 0, 0,
 969                                             SL_ERROR | SL_WARN, "Corrupt ESP "
 970                                             "packet (bad padding).\n");
 971                                         esp1dbg(espstack,
 972                                             ("padding not in correct"
 973                                             " format:\n"));
 974                                         ESP_BUMP_STAT(espstack, bad_padding);
 975                                         *counter = DROPPER(ipss,
 976                                             ipds_esp_bad_padding);
 977                                         return (B_FALSE);
 978                                 }
 979                                 lastbyte--;
 980                         }
 981                 }
 982         }
 983 
 984         /* Trim off the padding. */
 985         ASSERT(data_mp->b_cont == NULL);
 986         data_mp->b_wptr -= (padlen + 2);
 987 
 988         /*
 989          * Remove the ESP header.
 990          *
 991          * The above assertions about data_mp's size will make this work.
 992          *
 993          * XXX  Question:  If I send up and get back a contiguous mblk,
 994          * would it be quicker to bcopy over, or keep doing the dupb stuff?
 995          * I go with copying for now.
 996          */
 997 
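        /*
         * The copy loops below slide the outer IP (and any extension)
         * headers forward over the ESP header and IV, copying from the last
         * header byte downward so the overlapping move is safe; b_rptr is
         * then advanced past the vacated space.
         */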
 998         if (IS_P2ALIGNED(data_mp->b_rptr, sizeof (uint32_t)) &&
 999             IS_P2ALIGNED(ivlen, sizeof (uint32_t))) {
1000                 uint8_t *start = data_mp->b_rptr;
1001                 uint32_t *src, *dst;
1002 
1003                 src = (uint32_t *)(start + divpoint);
1004                 dst = (uint32_t *)(start + divpoint + sizeof (esph_t) + ivlen);
1005 
1006                 ASSERT(IS_P2ALIGNED(dst, sizeof (uint32_t)) &&
1007                     IS_P2ALIGNED(src, sizeof (uint32_t)));
1008 
1009                 do {
1010                         src--;
1011                         dst--;
1012                         *dst = *src;
1013                 } while (src != (uint32_t *)start);
1014 
1015                 data_mp->b_rptr = (uchar_t *)dst;
1016         } else {
1017                 uint8_t *start = data_mp->b_rptr;
1018                 uint8_t *src, *dst;
1019 
1020                 src = start + divpoint;
1021                 dst = src + sizeof (esph_t) + ivlen;
1022 
1023                 do {
1024                         src--;
1025                         dst--;
1026                         *dst = *src;
1027                 } while (src != start);
1028 
1029                 data_mp->b_rptr = dst;
1030         }
1031 
1032         esp2dbg(espstack, ("data_mp after inbound ESP adjustment:\n"));
1033         esp2dbg(espstack, (dump_msg(data_mp)));
1034 
1035         return (B_TRUE);
1036 }
1037 
1038 /*
1039  * Updating use times can be tricky business if the ipsa_haspeer flag is
1040  * set.  This function is called once in an SA's lifetime.
1041  *
1042  * Caller has to REFRELE "assoc" which is passed in.  This function has
1043  * to REFRELE any peer SA that is obtained.
1044  */
1045 static void
1046 esp_set_usetime(ipsa_t *assoc, boolean_t inbound)
1047 {
1048         ipsa_t *inassoc, *outassoc;
1049         isaf_t *bucket;
1050         sadb_t *sp;
1051         int outhash;
1052         boolean_t isv6;
1053         netstack_t              *ns = assoc->ipsa_netstack;
1054         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
1055 
1056         /* No peer?  No problem! */
1057         if (!assoc->ipsa_haspeer) {
1058                 sadb_set_usetime(assoc);
1059                 return;
1060         }
1061 
1062         /*
1063          * Otherwise, we want to grab both the original assoc and its peer.
1064          * There might be a race for this, but if it's a real race, the times
1065          * will be out-of-synch by at most a second, and since our time
1066          * granularity is a second, this won't be a problem.
1067          *
1068          * If we need tight synchronization on the peer SA, then we need to
1069          * reconsider.
1070          */
1071 
1072         /* Use address length to select IPv6/IPv4 */
1073         isv6 = (assoc->ipsa_addrfam == AF_INET6);
1074         sp = isv6 ? &espstack->esp_sadb.s_v6 : &espstack->esp_sadb.s_v4;
1075 
1076         if (inbound) {
1077                 inassoc = assoc;
1078                 if (isv6) {
1079                         outhash = OUTBOUND_HASH_V6(sp, *((in6_addr_t *)
1080                             &inassoc->ipsa_dstaddr));
1081                 } else {
1082                         outhash = OUTBOUND_HASH_V4(sp, *((ipaddr_t *)
1083                             &inassoc->ipsa_dstaddr));
1084                 }
1085                 bucket = &sp->sdb_of[outhash];
1086                 mutex_enter(&bucket->isaf_lock);
1087                 outassoc = ipsec_getassocbyspi(bucket, inassoc->ipsa_spi,
1088                     inassoc->ipsa_srcaddr, inassoc->ipsa_dstaddr,
1089                     inassoc->ipsa_addrfam);
1090                 mutex_exit(&bucket->isaf_lock);
1091                 if (outassoc == NULL) {
1092                         /* Q: Do we wish to set haspeer == B_FALSE? */
1093                         esp0dbg(("esp_set_usetime: "
1094                             "can't find peer for inbound.\n"));
1095                         sadb_set_usetime(inassoc);
1096                         return;
1097                 }
1098         } else {
1099                 outassoc = assoc;
1100                 bucket = INBOUND_BUCKET(sp, outassoc->ipsa_spi);
1101                 mutex_enter(&bucket->isaf_lock);
1102                 inassoc = ipsec_getassocbyspi(bucket, outassoc->ipsa_spi,
1103                     outassoc->ipsa_srcaddr, outassoc->ipsa_dstaddr,
1104                     outassoc->ipsa_addrfam);
1105                 mutex_exit(&bucket->isaf_lock);
1106                 if (inassoc == NULL) {
1107                         /* Q: Do we wish to set haspeer == B_FALSE? */
1108                         esp0dbg(("esp_set_usetime: "
1109                             "can't find peer for outbound.\n"));
1110                         sadb_set_usetime(outassoc);
1111                         return;
1112                 }
1113         }
1114 
1115         /* Update usetime on both. */
1116         sadb_set_usetime(inassoc);
1117         sadb_set_usetime(outassoc);
1118 
1119         /*
1120          * REFRELE any peer SA.
1121          *
1122          * Because of the multi-line macro nature of IPSA_REFRELE, keep
1123          * them in { }.
1124          */
1125         if (inbound) {
1126                 IPSA_REFRELE(outassoc);
1127         } else {
1128                 IPSA_REFRELE(inassoc);
1129         }
1130 }
1131 
/*
 * Handle ESP inbound data for IPv4 and IPv6.
 * Returns NULL if data_mp was consumed (e.g. dropped on an early replay
 * failure); otherwise returns the result of handing the packet to the
 * crypto framework via esp_submit_req_inbound().
 */
1137 mblk_t *
1138 esp_inbound(mblk_t *data_mp, void *arg, ip_recv_attr_t *ira)
1139 {
1140         esph_t *esph = (esph_t *)arg;
1141         ipsa_t *ipsa = ira->ira_ipsec_esp_sa;
1142         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
1143         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1144         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1145 
1146         /*
1147          * We may wish to check replay in-range-only here as an optimization.
1148          * Include the reality check of ipsa->ipsa_replay >
1149          * ipsa->ipsa_replay_wsize for times when it's the first N packets,
1150          * where N == ipsa->ipsa_replay_wsize.
1151          *
1152          * Another check that may come here later is the "collision" check.
1153          * If legitimate packets flow quickly enough, this won't be a problem,
1154          * but collisions may cause authentication algorithm crunching to
1155          * take place when it doesn't need to.
1156          */
1157         if (!sadb_replay_peek(ipsa, esph->esph_replay)) {
1158                 ESP_BUMP_STAT(espstack, replay_early_failures);
1159                 IP_ESP_BUMP_STAT(ipss, in_discards);
1160                 ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1161                     DROPPER(ipss, ipds_esp_early_replay),
1162                     &espstack->esp_dropper);
1163                 BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1164                 return (NULL);
1165         }
1166 
1167         /*
1168          * Adjust the IP header's payload length to reflect the removal
1169          * of the ICV.
1170          */
1171         if (!(ira->ira_flags & IRAF_IS_IPV4)) {
1172                 ip6_t *ip6h = (ip6_t *)data_mp->b_rptr;
1173                 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) -
1174                     ipsa->ipsa_mac_len);
1175         } else {
1176                 ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1177                 ipha->ipha_length = htons(ntohs(ipha->ipha_length) -
1178                     ipsa->ipsa_mac_len);
1179         }
1180 
1181         /* submit the request to the crypto framework */
1182         return (esp_submit_req_inbound(data_mp, ira, ipsa,
1183             (uint8_t *)esph - data_mp->b_rptr));
1184 }
1185 
1186 /*
1187  * Perform the really difficult work of inserting the proposed situation.
1188  * Called while holding the algorithm lock.
1189  */
1190 static void
1191 esp_insert_prop(sadb_prop_t *prop, ipsacq_t *acqrec, uint_t combs,
1192     netstack_t *ns)
1193 {
1194         sadb_comb_t *comb = (sadb_comb_t *)(prop + 1);
1195         ipsec_action_t *ap;
1196         ipsec_prot_t *prot;
1197         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1198         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1199 
1200         ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1201 
1202         prop->sadb_prop_exttype = SADB_EXT_PROPOSAL;
1203         prop->sadb_prop_len = SADB_8TO64(sizeof (sadb_prop_t));
1204         *(uint32_t *)(&prop->sadb_prop_replay) = 0;      /* Quick zero-out! */
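        /*
         * The 32-bit store above clears sadb_prop_replay and (assuming the
         * usual PF_KEY v2 layout) the adjacent reserved bytes in one shot.
         */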
1205 
1206         prop->sadb_prop_replay = espstack->ipsecesp_replay_size;
1207 
        /*
         * Based upon algorithm properties, and what-not, prioritize the
         * proposal according to the ordering of the ESP algorithms in the
         * alternatives of the policy rule or socket that was placed in the
         * acquire record.
         *
         * For each action in the policy list:
         *   Add a combination.  If we've hit the limit, return.
         */
1217 
1218         for (ap = acqrec->ipsacq_act; ap != NULL;
1219             ap = ap->ipa_next) {
1220                 ipsec_alginfo_t *ealg = NULL;
1221                 ipsec_alginfo_t *aalg = NULL;
1222 
1223                 if (ap->ipa_act.ipa_type != IPSEC_POLICY_APPLY)
1224                         continue;
1225 
1226                 prot = &ap->ipa_act.ipa_apply;
1227 
1228                 if (!(prot->ipp_use_esp))
1229                         continue;
1230 
1231                 if (prot->ipp_esp_auth_alg != 0) {
1232                         aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
1233                             [prot->ipp_esp_auth_alg];
1234                         if (aalg == NULL || !ALG_VALID(aalg))
1235                                 continue;
1236                 }
1237 
1238                 ASSERT(prot->ipp_encr_alg > 0);
1239                 ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
1240                     [prot->ipp_encr_alg];
1241                 if (ealg == NULL || !ALG_VALID(ealg))
1242                         continue;
1243 
1244                 comb->sadb_comb_flags = 0;
1245                 comb->sadb_comb_reserved = 0;
1246                 comb->sadb_comb_encrypt = ealg->alg_id;
1247                 comb->sadb_comb_encrypt_minbits =
1248                     MAX(prot->ipp_espe_minbits, ealg->alg_ef_minbits);
1249                 comb->sadb_comb_encrypt_maxbits =
1250                     MIN(prot->ipp_espe_maxbits, ealg->alg_ef_maxbits);
1251 
1252                 if (aalg == NULL) {
1253                         comb->sadb_comb_auth = 0;
1254                         comb->sadb_comb_auth_minbits = 0;
1255                         comb->sadb_comb_auth_maxbits = 0;
1256                 } else {
1257                         comb->sadb_comb_auth = aalg->alg_id;
1258                         comb->sadb_comb_auth_minbits =
1259                             MAX(prot->ipp_espa_minbits, aalg->alg_ef_minbits);
1260                         comb->sadb_comb_auth_maxbits =
1261                             MIN(prot->ipp_espa_maxbits, aalg->alg_ef_maxbits);
1262                 }
1263 
1264                 /*
1265                  * The following may be based on algorithm
1266                  * properties, but in the meantime, we just pick
1267                  * some good, sensible numbers.  Key mgmt. can
1268                  * (and perhaps should) be the place to finalize
1269                  * such decisions.
1270                  */
1271 
1272                 /*
1273                  * No limits on allocations, since we really don't
1274                  * support that concept currently.
1275                  */
1276                 comb->sadb_comb_soft_allocations = 0;
1277                 comb->sadb_comb_hard_allocations = 0;
1278 
                /*
                 * These may want to come from the policy rule.
                 */
1282                 comb->sadb_comb_soft_bytes =
1283                     espstack->ipsecesp_default_soft_bytes;
1284                 comb->sadb_comb_hard_bytes =
1285                     espstack->ipsecesp_default_hard_bytes;
1286                 comb->sadb_comb_soft_addtime =
1287                     espstack->ipsecesp_default_soft_addtime;
1288                 comb->sadb_comb_hard_addtime =
1289                     espstack->ipsecesp_default_hard_addtime;
1290                 comb->sadb_comb_soft_usetime =
1291                     espstack->ipsecesp_default_soft_usetime;
1292                 comb->sadb_comb_hard_usetime =
1293                     espstack->ipsecesp_default_hard_usetime;
1294 
1295                 prop->sadb_prop_len += SADB_8TO64(sizeof (*comb));
1296                 if (--combs == 0)
1297                         break;  /* out of space.. */
1298                 comb++;
1299         }
1300 }
1301 
1302 /*
1303  * Prepare and actually send the SADB_ACQUIRE message to PF_KEY.
1304  */
1305 static void
1306 esp_send_acquire(ipsacq_t *acqrec, mblk_t *extended, netstack_t *ns)
1307 {
1308         uint_t combs;
1309         sadb_msg_t *samsg;
1310         sadb_prop_t *prop;
1311         mblk_t *pfkeymp, *msgmp;
1312         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1313         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1314 
1315         ESP_BUMP_STAT(espstack, acquire_requests);
1316 
1317         if (espstack->esp_pfkey_q == NULL) {
1318                 mutex_exit(&acqrec->ipsacq_lock);
1319                 return;
1320         }
1321 
1322         /* Set up ACQUIRE. */
1323         pfkeymp = sadb_setup_acquire(acqrec, SADB_SATYPE_ESP,
1324             ns->netstack_ipsec);
1325         if (pfkeymp == NULL) {
1326                 esp0dbg(("sadb_setup_acquire failed.\n"));
1327                 mutex_exit(&acqrec->ipsacq_lock);
1328                 return;
1329         }
1330         ASSERT(MUTEX_HELD(&ipss->ipsec_alg_lock));
1331         combs = ipss->ipsec_nalgs[IPSEC_ALG_AUTH] *
1332             ipss->ipsec_nalgs[IPSEC_ALG_ENCR];
1333         msgmp = pfkeymp->b_cont;
1334         samsg = (sadb_msg_t *)(msgmp->b_rptr);
1335 
1336         /* Insert proposal here. */
1337 
1338         prop = (sadb_prop_t *)(((uint64_t *)samsg) + samsg->sadb_msg_len);
1339         esp_insert_prop(prop, acqrec, combs, ns);
1340         samsg->sadb_msg_len += prop->sadb_prop_len;
1341         msgmp->b_wptr += SADB_64TO8(samsg->sadb_msg_len);
1342 
1343         mutex_exit(&ipss->ipsec_alg_lock);
1344 
1345         /*
1346          * Must mutex_exit() before sending PF_KEY message up, in
1347          * order to avoid recursive mutex_enter() if there are no registered
1348          * listeners.
1349          *
1350          * Once I've sent the message, I'm cool anyway.
1351          */
1352         mutex_exit(&acqrec->ipsacq_lock);
1353         if (extended != NULL) {
1354                 putnext(espstack->esp_pfkey_q, extended);
1355         }
1356         putnext(espstack->esp_pfkey_q, pfkeymp);
1357 }
1358 
1359 /* XXX refactor me */
1360 /*
1361  * Handle the SADB_GETSPI message.  Create a larval SA.
1362  */
1363 static void
1364 esp_getspi(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
1365 {
1366         ipsa_t *newbie, *target;
1367         isaf_t *outbound, *inbound;
1368         int rc, diagnostic;
1369         sadb_sa_t *assoc;
1370         keysock_out_t *kso;
1371         uint32_t newspi;
1372 
1373         /*
1374          * Randomly generate a proposed SPI value
1375          */
1376         (void) random_get_pseudo_bytes((uint8_t *)&newspi, sizeof (uint32_t));
1377         newbie = sadb_getspi(ksi, newspi, &diagnostic,
1378             espstack->ipsecesp_netstack, IPPROTO_ESP);
1379 
1380         if (newbie == NULL) {
1381                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM, diagnostic,
1382                     ksi->ks_in_serial);
1383                 return;
1384         } else if (newbie == (ipsa_t *)-1) {
1385                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
1386                     ksi->ks_in_serial);
1387                 return;
1388         }
1389 
1390         /*
1391          * XXX - We may randomly collide.  We really should recover from this.
1392          *       Unfortunately, that could require spending way-too-much-time
1393          *       in here.  For now, let the user retry.
1394          */
1395 
1396         if (newbie->ipsa_addrfam == AF_INET6) {
1397                 outbound = OUTBOUND_BUCKET_V6(&espstack->esp_sadb.s_v6,
1398                     *(uint32_t *)(newbie->ipsa_dstaddr));
1399                 inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v6,
1400                     newbie->ipsa_spi);
1401         } else {
1402                 ASSERT(newbie->ipsa_addrfam == AF_INET);
1403                 outbound = OUTBOUND_BUCKET_V4(&espstack->esp_sadb.s_v4,
1404                     *(uint32_t *)(newbie->ipsa_dstaddr));
1405                 inbound = INBOUND_BUCKET(&espstack->esp_sadb.s_v4,
1406                     newbie->ipsa_spi);
1407         }
1408 
1409         mutex_enter(&outbound->isaf_lock);
1410         mutex_enter(&inbound->isaf_lock);
1411 
1412         /*
1413          * Check for collisions (i.e. did sadb_getspi() return with something
1414          * that already exists?).
1415          *
1416          * Try outbound first.  Even though SADB_GETSPI is traditionally
1417          * for inbound SAs, you never know what a user might do.
1418          */
1419         target = ipsec_getassocbyspi(outbound, newbie->ipsa_spi,
1420             newbie->ipsa_srcaddr, newbie->ipsa_dstaddr, newbie->ipsa_addrfam);
1421         if (target == NULL) {
1422                 target = ipsec_getassocbyspi(inbound, newbie->ipsa_spi,
1423                     newbie->ipsa_srcaddr, newbie->ipsa_dstaddr,
1424                     newbie->ipsa_addrfam);
1425         }
1426 
1427         /*
1428          * I don't have collisions elsewhere!
1429          * (Nor will I because I'm still holding inbound/outbound locks.)
1430          */
1431 
1432         if (target != NULL) {
1433                 rc = EEXIST;
1434                 IPSA_REFRELE(target);
1435         } else {
1436                 /*
1437                  * sadb_insertassoc() also checks for collisions, so
1438                  * if there's a colliding entry, rc will be set
1439                  * to EEXIST.
1440                  */
1441                 rc = sadb_insertassoc(newbie, inbound);
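                /*
                 * The larval SA hard-expires unless key management
                 * completes it (e.g. with an SADB_UPDATE) within
                 * ipsecesp_larval_timeout seconds.
                 */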
1442                 newbie->ipsa_hardexpiretime = gethrestime_sec();
1443                 newbie->ipsa_hardexpiretime +=
1444                     espstack->ipsecesp_larval_timeout;
1445         }
1446 
1447         /*
1448          * Can exit outbound mutex.  Hold inbound until we're done
1449          * with newbie.
1450          */
1451         mutex_exit(&outbound->isaf_lock);
1452 
1453         if (rc != 0) {
1454                 mutex_exit(&inbound->isaf_lock);
1455                 IPSA_REFRELE(newbie);
1456                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, rc,
1457                     SADB_X_DIAGNOSTIC_NONE, ksi->ks_in_serial);
1458                 return;
1459         }
1460 
1461 
1462         /* Can write here because I'm still holding the bucket lock. */
1463         newbie->ipsa_type = SADB_SATYPE_ESP;
1464 
1465         /*
1466          * Construct successful return message. We have one thing going
1467          * for us in PF_KEY v2.  That's the fact that
1468          *      sizeof (sadb_spirange_t) == sizeof (sadb_sa_t)
1469          */
1470         assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SPIRANGE];
1471         assoc->sadb_sa_exttype = SADB_EXT_SA;
1472         assoc->sadb_sa_spi = newbie->ipsa_spi;
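        /*
         * The 64-bit store below clears sadb_sa_replay, sadb_sa_state,
         * sadb_sa_auth, sadb_sa_encrypt, and sadb_sa_flags in one shot;
         * they occupy the eight bytes that follow the SPI in a PF_KEY v2
         * sadb_sa_t.
         */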
1473         *((uint64_t *)(&assoc->sadb_sa_replay)) = 0;
1474         mutex_exit(&inbound->isaf_lock);
1475 
1476         /* Convert KEYSOCK_IN to KEYSOCK_OUT. */
1477         kso = (keysock_out_t *)ksi;
1478         kso->ks_out_len = sizeof (*kso);
1479         kso->ks_out_serial = ksi->ks_in_serial;
1480         kso->ks_out_type = KEYSOCK_OUT;
1481 
1482         /*
1483          * Can safely putnext() to esp_pfkey_q, because this is a turnaround
1484          * from the esp_pfkey_q.
1485          */
1486         putnext(espstack->esp_pfkey_q, mp);
1487 }
1488 
1489 /*
1490  * Insert the ESP header into a packet.  Duplicate an mblk, and insert a newly
1491  * allocated mblk with the ESP header in between the two.
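 *
 * For illustration, with divpoint bytes of pre-ESP headers, a chain such as
 *
 *        [hdrs|payload]->[more payload]
 *
 * becomes
 *
 *        [hdrs]->[ESP]->[payload]->[more payload]
 *
 * with the first mblk split via dupb() when divpoint falls inside it.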
1492  */
1493 static boolean_t
1494 esp_insert_esp(mblk_t *mp, mblk_t *esp_mp, uint_t divpoint,
1495     ipsecesp_stack_t *espstack)
1496 {
1497         mblk_t *split_mp = mp;
1498         uint_t wheretodiv = divpoint;
1499 
1500         while ((split_mp->b_wptr - split_mp->b_rptr) < wheretodiv) {
1501                 wheretodiv -= (split_mp->b_wptr - split_mp->b_rptr);
1502                 split_mp = split_mp->b_cont;
1503                 ASSERT(split_mp != NULL);
1504         }
1505 
1506         if (split_mp->b_wptr - split_mp->b_rptr != wheretodiv) {
1507                 mblk_t *scratch;
1508 
1509                 /* "scratch" is the 2nd half, split_mp is the first. */
1510                 scratch = dupb(split_mp);
1511                 if (scratch == NULL) {
1512                         esp1dbg(espstack,
1513                             ("esp_insert_esp: can't allocate scratch.\n"));
1514                         return (B_FALSE);
1515                 }
1516                 /* NOTE:  dupb() doesn't set b_cont appropriately. */
1517                 scratch->b_cont = split_mp->b_cont;
1518                 scratch->b_rptr += wheretodiv;
1519                 split_mp->b_wptr = split_mp->b_rptr + wheretodiv;
1520                 split_mp->b_cont = scratch;
1521         }
1522         /*
1523          * At this point, split_mp is exactly "wheretodiv" bytes long, and
1524          * holds the end of the pre-ESP part of the datagram.
1525          */
1526         esp_mp->b_cont = split_mp->b_cont;
1527         split_mp->b_cont = esp_mp;
1528 
1529         return (B_TRUE);
1530 }
1531 
1532 /*
1533  * Section 7 of RFC 3947 says:
1534  *
1535  * 7.  Recovering from the Expiring NAT Mappings
1536  *
1537  *    There are cases where NAT box decides to remove mappings that are still
1538  *    alive (for example, when the keepalive interval is too long, or when the
1539  *    NAT box is rebooted).  To recover from this, ends that are NOT behind
1540  *    NAT SHOULD use the last valid UDP encapsulated IKE or IPsec packet from
1541  *    the other end to determine which IP and port addresses should be used.
1542  *    The host behind dynamic NAT MUST NOT do this, as otherwise it opens a
1543  *    DoS attack possibility because the IP address or port of the other host
1544  *    will not change (it is not behind NAT).
1545  *
1546  *    Keepalives cannot be used for these purposes, as they are not
1547  *    authenticated, but any IKE authenticated IKE packet or ESP packet can be
1548  *    used to detect whether the IP address or the port has changed.
1549  *
1550  * The following function will check an SA and its explicitly-set pair to see
1551  * if the NAT-T remote port matches the received packet (which must have
1552  * passed ESP authentication, see esp_in_done() for the caller context).  If
1553  * there is a mismatch, the SAs are updated.  It is not important if we race
1554  * with a transmitting thread: at worst it will merely emit a packet that is
1555  * most likely to be dropped.
1556  *
1557  * "ports" are ordered src,dst, and assoc is an inbound SA, where src should
1558  * match ipsa_remote_nat_port and dst should match ipsa_local_nat_port.
1559  */
1560 #ifdef _LITTLE_ENDIAN
1561 #define FIRST_16(x) ((x) & 0xFFFF)
1562 #define NEXT_16(x) (((x) >> 16) & 0xFFFF)
1563 #else
1564 #define FIRST_16(x) (((x) >> 16) & 0xFFFF)
1565 #define NEXT_16(x) ((x) & 0xFFFF)
1566 #endif
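/*
 * FIRST_16() yields the 16 bits that come first in memory when the two
 * network-order ports are packed into a uint32_t (the source port, still
 * in network byte order); NEXT_16() yields the following 16 bits (the
 * destination port).  The endian-specific definitions above exist only to
 * undo the host byte order of the uint32_t load.
 */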
1567 static void
1568 esp_port_freshness(uint32_t ports, ipsa_t *assoc)
1569 {
1570         uint16_t remote = FIRST_16(ports);
1571         uint16_t local = NEXT_16(ports);
1572         ipsa_t *outbound_peer;
1573         isaf_t *bucket;
1574         ipsecesp_stack_t *espstack = assoc->ipsa_netstack->netstack_ipsecesp;
1575 
1576         /* We found a conn_t, therefore local != 0. */
1577         ASSERT(local != 0);
1578         /* Assume an IPv4 SA. */
1579         ASSERT(assoc->ipsa_addrfam == AF_INET);
1580 
1581         /*
1582          * On-the-wire rport == 0 means something's very wrong.
1583          * An unpaired SA is also useless to us.
1584          * If we are behind the NAT, don't bother.
1585          * A zero remote NAT port defaults to 4500, so check that too.
1586          * And, of course, if the ports already match, we don't need to
1587          * bother.
1588          */
1589         if (remote == 0 || assoc->ipsa_otherspi == 0 ||
1590             (assoc->ipsa_flags & IPSA_F_BEHIND_NAT) ||
1591             (assoc->ipsa_remote_nat_port == 0 &&
1592             remote == htons(IPPORT_IKE_NATT)) ||
1593             remote == assoc->ipsa_remote_nat_port)
1594                 return;
1595 
1596         /* Try and snag the peer.   NOTE:  Assume IPv4 for now. */
1597         bucket = OUTBOUND_BUCKET_V4(&(espstack->esp_sadb.s_v4),
1598             assoc->ipsa_srcaddr[0]);
1599         mutex_enter(&bucket->isaf_lock);
1600         outbound_peer = ipsec_getassocbyspi(bucket, assoc->ipsa_otherspi,
1601             assoc->ipsa_dstaddr, assoc->ipsa_srcaddr, AF_INET);
1602         mutex_exit(&bucket->isaf_lock);
1603 
1604         /* We probably lost a race to a deleting or expiring thread. */
1605         if (outbound_peer == NULL)
1606                 return;
1607 
1608         /*
1609          * Hold the mutexes for both SAs so we don't race another inbound
1610          * thread.  A lock-entry order shouldn't matter, since all other
1611          * per-ipsa locks are individually held-then-released.
1612          *
1613          * Luckily, this has nothing to do with the remote-NAT address,
1614          * so we don't have to re-scribble the cached-checksum differential.
1615          */
1616         mutex_enter(&outbound_peer->ipsa_lock);
1617         mutex_enter(&assoc->ipsa_lock);
1618         outbound_peer->ipsa_remote_nat_port = assoc->ipsa_remote_nat_port =
1619             remote;
1620         mutex_exit(&assoc->ipsa_lock);
1621         mutex_exit(&outbound_peer->ipsa_lock);
1622         IPSA_REFRELE(outbound_peer);
1623         ESP_BUMP_STAT(espstack, sa_port_renumbers);
1624 }
1625 /*
1626  * Finish processing of an inbound ESP packet after processing by the
1627  * crypto framework.
1628  * - Remove the ESP header.
1629  * - Send packet back to IP.
1630  * If authentication was performed on the packet, this function is called
1631  * only if the authentication succeeded.
1632  * On success returns data_mp with the ESP header and padding removed;
1633  * returns NULL if the packet was dropped or otherwise consumed.
1634  */
1635 static mblk_t *
1636 esp_in_done(mblk_t *data_mp, ip_recv_attr_t *ira, ipsec_crypto_t *ic)
1637 {
1638         ipsa_t *assoc;
1639         uint_t espstart;
1640         uint32_t ivlen = 0;
1641         uint_t processed_len;
1642         esph_t *esph;
1643         kstat_named_t *counter;
1644         boolean_t is_natt;
1645         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
1646         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1647         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1648 
1649         assoc = ira->ira_ipsec_esp_sa;
1650         ASSERT(assoc != NULL);
1651 
1652         is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
1653 
1654         /* get the pointer to the ESP header */
1655         if (assoc->ipsa_encr_alg == SADB_EALG_NULL) {
1656                 /* authentication-only ESP */
1657                 espstart = ic->ic_crypto_data.cd_offset;
1658                 processed_len = ic->ic_crypto_data.cd_length;
1659         } else {
1660                 /* encryption present */
1661                 ivlen = assoc->ipsa_iv_len;
1662                 if (assoc->ipsa_auth_alg == SADB_AALG_NONE) {
1663                         /* encryption-only ESP */
1664                         espstart = ic->ic_crypto_data.cd_offset -
1665                             sizeof (esph_t) - assoc->ipsa_iv_len;
1666                         processed_len = ic->ic_crypto_data.cd_length +
1667                             ivlen;
1668                 } else {
1669                         /* encryption with authentication */
1670                         espstart = ic->ic_crypto_dual_data.dd_offset1;
1671                         processed_len = ic->ic_crypto_dual_data.dd_len2 +
1672                             ivlen;
1673                 }
1674         }
1675 
1676         esph = (esph_t *)(data_mp->b_rptr + espstart);
1677 
1678         if (assoc->ipsa_auth_alg != IPSA_AALG_NONE ||
1679             (assoc->ipsa_flags & IPSA_F_COMBINED)) {
1680                 /*
1681                  * Authentication passed if we reach this point.
1682                  * Packets with authentication will have the ICV
1683                  * after the crypto data. Adjust b_wptr before
1684                  * making padlen checks.
1685                  */
1686                 ESP_BUMP_STAT(espstack, good_auth);
1687                 data_mp->b_wptr -= assoc->ipsa_mac_len;
1688 
1689                 /*
1690                  * Check replay window here!
1691                  * For right now, assume keysock will set the replay window
1692                  * size to zero for SAs that have an unspecified sender.
1693                  * This may change...
1694                  */
1695 
1696                 if (!sadb_replay_check(assoc, esph->esph_replay)) {
1697                         /*
1698                          * Log the event.  Do not print the replay
1699                          * sequence number that failed: syslog could not
1700                          * collate the error messages, and printing it
1701                          * would open a denial-of-service vector.
1703                          */
1704                         ipsec_assocfailure(info.mi_idnum, 0, 0,
1705                             SL_ERROR | SL_WARN,
1706                             "Replay failed for ESP spi 0x%x, dst %s.\n",
1707                             assoc->ipsa_spi, assoc->ipsa_dstaddr,
1708                             assoc->ipsa_addrfam, espstack->ipsecesp_netstack);
1709                         ESP_BUMP_STAT(espstack, replay_failures);
1710                         counter = DROPPER(ipss, ipds_esp_replay);
1711                         goto drop_and_bail;
1712                 }
1713 
1714                 if (is_natt) {
1715                         ASSERT(ira->ira_flags & IRAF_ESP_UDP_PORTS);
1716                         ASSERT(ira->ira_esp_udp_ports != 0);
1717                         esp_port_freshness(ira->ira_esp_udp_ports, assoc);
1718                 }
1719         }
1720 
1721         esp_set_usetime(assoc, B_TRUE);
1722 
1723         if (!esp_age_bytes(assoc, processed_len, B_TRUE)) {
1724                 /* The ipsa has hit hard expiration, LOG and AUDIT. */
1725                 ipsec_assocfailure(info.mi_idnum, 0, 0,
1726                     SL_ERROR | SL_WARN,
1727                     "ESP association 0x%x, dst %s had bytes expire.\n",
1728                     assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1729                     espstack->ipsecesp_netstack);
1730                 ESP_BUMP_STAT(espstack, bytes_expired);
1731                 counter = DROPPER(ipss, ipds_esp_bytes_expire);
1732                 goto drop_and_bail;
1733         }
1734 
1735         /*
1736          * Remove ESP header and padding from packet.  I hope the compiler
1737          * spews "branch, predict taken" code for this.
1738          */
1739 
1740         if (esp_strip_header(data_mp, (ira->ira_flags & IRAF_IS_IPV4),
1741             ivlen, &counter, espstack)) {
1742 
1743                 if (is_system_labeled() && assoc->ipsa_tsl != NULL) {
1744                         if (!ip_recv_attr_replace_label(ira, assoc->ipsa_tsl)) {
1745                                 ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
1746                                     DROPPER(ipss, ipds_ah_nomem),
1747                                     &espstack->esp_dropper);
1748                                 BUMP_MIB(ira->ira_ill->ill_ip_mib,
1749                                     ipIfStatsInDiscards);
1750                                 return (NULL);
1751                         }
1752                 }
1753                 if (is_natt)
1754                         return (esp_fix_natt_checksums(data_mp, assoc));
1755 
1756                 if (assoc->ipsa_state == IPSA_STATE_IDLE) {
1757                         /*
1758                          * Cluster buffering case.  Tell caller that we're
1759                          * handling the packet.
1760                          */
1761                         sadb_buf_pkt(assoc, data_mp, ira);
1762                         return (NULL);
1763                 }
1764 
1765                 return (data_mp);
1766         }
1767 
1768         esp1dbg(espstack, ("esp_in_done: esp_strip_header() failed\n"));
1769 drop_and_bail:
1770         IP_ESP_BUMP_STAT(ipss, in_discards);
1771         ip_drop_packet(data_mp, B_TRUE, ira->ira_ill, counter,
1772             &espstack->esp_dropper);
1773         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1774         return (NULL);
1775 }
1776 
1777 /*
1778  * Called upon failing the inbound ICV check. The message passed as
1779  * argument is freed.
1780  */
1781 static void
1782 esp_log_bad_auth(mblk_t *mp, ip_recv_attr_t *ira)
1783 {
1784         ipsa_t          *assoc = ira->ira_ipsec_esp_sa;
1785         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
1786         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
1787         ipsec_stack_t   *ipss = ns->netstack_ipsec;
1788 
1789         /*
1790          * Log the event.  Don't print to the console, to avoid a
1791          * potential denial-of-service attack.
1792          */
1793         ESP_BUMP_STAT(espstack, bad_auth);
1794 
1795         ipsec_assocfailure(info.mi_idnum, 0, 0, SL_ERROR | SL_WARN,
1796             "ESP Authentication failed for spi 0x%x, dst %s.\n",
1797             assoc->ipsa_spi, assoc->ipsa_dstaddr, assoc->ipsa_addrfam,
1798             espstack->ipsecesp_netstack);
1799 
1800         IP_ESP_BUMP_STAT(ipss, in_discards);
1801         ip_drop_packet(mp, B_TRUE, ira->ira_ill,
1802             DROPPER(ipss, ipds_esp_bad_auth),
1803             &espstack->esp_dropper);
1804 }
1805 
1806 
1807 /*
1808  * Invoked for outbound packets after ESP processing. If the packet
1809  * also requires AH, performs the AH SA selection and AH processing.
1810  * Returns data_mp if AH processing was not needed or if it was
1811  * performed successfully.  Returns NULL, with the passed mblk consumed
1812  * or queued, if AH processing was required but could not be completed
1813  * here (for example, the packet was queued pending an SADB_ACQUIRE or
1814  * handed off for asynchronous AH processing).
1815  */
1816 static mblk_t *
1817 esp_do_outbound_ah(mblk_t *data_mp, ip_xmit_attr_t *ixa)
1818 {
1819         ipsec_action_t *ap;
1820 
1821         ap = ixa->ixa_ipsec_action;
1822         if (ap == NULL) {
1823                 ipsec_policy_t *pp = ixa->ixa_ipsec_policy;
1824                 ap = pp->ipsp_act;
1825         }
1826 
1827         if (!ap->ipa_want_ah)
1828                 return (data_mp);
1829 
1830         /*
1831          * Normally the AH SA would have already been put in place
1832          * but it could have been flushed so we need to look for it.
1833          */
1834         if (ixa->ixa_ipsec_ah_sa == NULL) {
1835                 if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_AH)) {
1836                         sadb_acquire(data_mp, ixa, B_TRUE, B_FALSE);
1837                         return (NULL);
1838                 }
1839         }
1840         ASSERT(ixa->ixa_ipsec_ah_sa != NULL);
1841 
1842         data_mp = ixa->ixa_ipsec_ah_sa->ipsa_output_func(data_mp, ixa);
1843         return (data_mp);
1844 }
1845 
1846 
1847 /*
1848  * Kernel crypto framework callback invoked after completion of async
1849  * crypto requests for outbound packets.
1850  */
1851 static void
1852 esp_kcf_callback_outbound(void *arg, int status)
1853 {
1854         mblk_t          *mp = (mblk_t *)arg;
1855         mblk_t          *async_mp;
1856         netstack_t      *ns;
1857         ipsec_stack_t   *ipss;
1858         ipsecesp_stack_t *espstack;
1859         mblk_t          *data_mp;
1860         ip_xmit_attr_t  ixas;
1861         ipsec_crypto_t  *ic;
1862         ill_t           *ill;
1863 
1864         /*
1865          * First remove the ipsec_crypto_t mblk
1866          * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1867          */
1868         async_mp = ipsec_remove_crypto_data(mp, &ic);
1869         ASSERT(async_mp != NULL);
1870 
1871         /*
1872          * Extract the ip_xmit_attr_t from the first mblk.
1873          * This verifies that the netstack and ill are still around; they
1874          * could have vanished while the crypto framework was doing its work.
1875          * On successful return we have an nce_t, and the ill/ipst can't
1876          * disappear until we do the nce_refrele in ixa_cleanup().
1877          */
1878         data_mp = async_mp->b_cont;
1879         async_mp->b_cont = NULL;
1880         if (!ip_xmit_attr_from_mblk(async_mp, &ixas)) {
1881                 /* Disappeared on us - no ill/ipst for MIB */
1882                 /* We have nowhere to do stats since ixa_ipst could be NULL */
1883                 if (ixas.ixa_nce != NULL) {
1884                         ill = ixas.ixa_nce->nce_ill;
1885                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1886                         ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
1887                 }
1888                 freemsg(data_mp);
1889                 goto done;
1890         }
1891         ns = ixas.ixa_ipst->ips_netstack;
1892         espstack = ns->netstack_ipsecesp;
1893         ipss = ns->netstack_ipsec;
1894         ill = ixas.ixa_nce->nce_ill;
1895 
1896         if (status == CRYPTO_SUCCESS) {
1897                 /*
1898                  * If an ICV was computed, it was stored by the
1899                  * crypto framework at the end of the packet.
1900                  */
1901                 ipha_t *ipha = (ipha_t *)data_mp->b_rptr;
1902 
1903                 esp_set_usetime(ixas.ixa_ipsec_esp_sa, B_FALSE);
1904                 /* NAT-T packet. */
1905                 if (IPH_HDR_VERSION(ipha) == IP_VERSION &&
1906                     ipha->ipha_protocol == IPPROTO_UDP)
1907                         esp_prepare_udp(ns, data_mp, ipha);
1908 
1909                 /* do AH processing if needed */
1910                 data_mp = esp_do_outbound_ah(data_mp, &ixas);
1911                 if (data_mp == NULL)
1912                         goto done;
1913 
1914                 (void) ip_output_post_ipsec(data_mp, &ixas);
1915         } else {
1916                 /* Outbound shouldn't see invalid MAC */
1917                 ASSERT(status != CRYPTO_INVALID_MAC);
1918 
1919                 esp1dbg(espstack,
1920                     ("esp_kcf_callback_outbound: crypto failed with 0x%x\n",
1921                     status));
1922                 ESP_BUMP_STAT(espstack, crypto_failures);
1923                 ESP_BUMP_STAT(espstack, out_discards);
1924                 ip_drop_packet(data_mp, B_FALSE, ill,
1925                     DROPPER(ipss, ipds_esp_crypto_failed),
1926                     &espstack->esp_dropper);
1927                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
1928         }
1929 done:
1930         ixa_cleanup(&ixas);
1931         (void) ipsec_free_crypto_data(mp);
1932 }
1933 
1934 /*
1935  * Kernel crypto framework callback invoked after completion of async
1936  * crypto requests for inbound packets.
1937  */
1938 static void
1939 esp_kcf_callback_inbound(void *arg, int status)
1940 {
1941         mblk_t          *mp = (mblk_t *)arg;
1942         mblk_t          *async_mp;
1943         netstack_t      *ns;
1944         ipsecesp_stack_t *espstack;
1945         ipsec_stack_t   *ipss;
1946         mblk_t          *data_mp;
1947         ip_recv_attr_t  iras;
1948         ipsec_crypto_t  *ic;
1949 
1950         /*
1951          * First remove the ipsec_crypto_t mblk
1952          * Note that we need to ipsec_free_crypto_data(mp) once done with ic.
1953          */
1954         async_mp = ipsec_remove_crypto_data(mp, &ic);
1955         ASSERT(async_mp != NULL);
1956 
1957         /*
1958          * Extract the ip_recv_attr_t from the first mblk.
1959          * This verifies that the netstack and ill are still around; they
1960          * could have vanished while the crypto framework was doing its work.
1961          */
1962         data_mp = async_mp->b_cont;
1963         async_mp->b_cont = NULL;
1964         if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
1965                 /* The ill or ip_stack_t disappeared on us */
1966                 ip_drop_input("ip_recv_attr_from_mblk", data_mp, NULL);
1967                 freemsg(data_mp);
1968                 goto done;
1969         }
1970 
1971         ns = iras.ira_ill->ill_ipst->ips_netstack;
1972         espstack = ns->netstack_ipsecesp;
1973         ipss = ns->netstack_ipsec;
1974 
1975         if (status == CRYPTO_SUCCESS) {
1976                 data_mp = esp_in_done(data_mp, &iras, ic);
1977                 if (data_mp == NULL)
1978                         goto done;
1979 
1980                 /* finish IPsec processing */
1981                 ip_input_post_ipsec(data_mp, &iras);
1982         } else if (status == CRYPTO_INVALID_MAC) {
1983                 esp_log_bad_auth(data_mp, &iras);
1984         } else {
1985                 esp1dbg(espstack,
1986                     ("esp_kcf_callback: crypto failed with 0x%x\n",
1987                     status));
1988                 ESP_BUMP_STAT(espstack, crypto_failures);
1989                 IP_ESP_BUMP_STAT(ipss, in_discards);
1990                 ip_drop_packet(data_mp, B_TRUE, iras.ira_ill,
1991                     DROPPER(ipss, ipds_esp_crypto_failed),
1992                     &espstack->esp_dropper);
1993                 BUMP_MIB(iras.ira_ill->ill_ip_mib, ipIfStatsInDiscards);
1994         }
1995 done:
1996         ira_cleanup(&iras, B_TRUE);
1997         (void) ipsec_free_crypto_data(mp);
1998 }
1999 
2000 /*
2001  * Invoked on crypto framework failure during inbound and outbound processing.
2002  */
2003 static void
2004 esp_crypto_failed(mblk_t *data_mp, boolean_t is_inbound, int kef_rc,
2005     ill_t *ill, ipsecesp_stack_t *espstack)
2006 {
2007         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
2008 
2009         esp1dbg(espstack, ("crypto failed for %s ESP with 0x%x\n",
2010             is_inbound ? "inbound" : "outbound", kef_rc));
2011         ip_drop_packet(data_mp, is_inbound, ill,
2012             DROPPER(ipss, ipds_esp_crypto_failed),
2013             &espstack->esp_dropper);
2014         ESP_BUMP_STAT(espstack, crypto_failures);
2015         if (is_inbound)
2016                 IP_ESP_BUMP_STAT(ipss, in_discards);
2017         else
2018                 ESP_BUMP_STAT(espstack, out_discards);
2019 }
2020 
2021 /*
2022  * A statement-equivalent macro; _cr MUST point to a modifiable
2023  * crypto_call_req_t.
2024  */
2025 #define ESP_INIT_CALLREQ(_cr, _mp, _callback)                           \
2026         (_cr)->cr_flag = CRYPTO_SKIP_REQID|CRYPTO_ALWAYS_QUEUE;      \
2027         (_cr)->cr_callback_arg = (_mp);                              \
2028         (_cr)->cr_callback_func = (_callback)
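
/*
 * ESP_INIT_CALLREQ is only used by the submit paths below: when the SA
 * forces asynchronous processing (IPSA_F_ASYNC), callrp points at a
 * stack-allocated crypto_call_req_t initialized with this macro and the
 * crypto framework call normally returns CRYPTO_QUEUED (CRYPTO_ALWAYS_QUEUE
 * is set); when callrp is passed as NULL, the call completes synchronously.
 */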
2029 
2030 #define ESP_INIT_CRYPTO_MAC(mac, icvlen, icvbuf) {                      \
2031         (mac)->cd_format = CRYPTO_DATA_RAW;                          \
2032         (mac)->cd_offset = 0;                                                \
2033         (mac)->cd_length = icvlen;                                   \
2034         (mac)->cd_raw.iov_base = (char *)icvbuf;                     \
2035         (mac)->cd_raw.iov_len = icvlen;                                      \
2036 }
2037 
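/*
 * ESP_INIT_CRYPTO_DATA describes a region to the crypto framework as
 * CRYPTO_DATA_RAW (an iovec over a single mblk) when the whole
 * [off, off + len) range fits in the first mblk, and otherwise as
 * CRYPTO_DATA_MBLK so the framework walks the b_cont chain itself.
 */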
2038 #define ESP_INIT_CRYPTO_DATA(data, mp, off, len) {                      \
2039         if (MBLKL(mp) >= (len) + (off)) {                            \
2040                 (data)->cd_format = CRYPTO_DATA_RAW;                 \
2041                 (data)->cd_raw.iov_base = (char *)(mp)->b_rptr;           \
2042                 (data)->cd_raw.iov_len = MBLKL(mp);                  \
2043                 (data)->cd_offset = off;                             \
2044         } else {                                                        \
2045                 (data)->cd_format = CRYPTO_DATA_MBLK;                        \
2046                 (data)->cd_mp = mp;                                  \
2047                 (data)->cd_offset = off;                             \
2048         }                                                               \
2049         (data)->cd_length = len;                                     \
2050 }
2051 
2052 #define ESP_INIT_CRYPTO_DUAL_DATA(data, mp, off1, len1, off2, len2) {   \
2053         (data)->dd_format = CRYPTO_DATA_MBLK;                                \
2054         (data)->dd_mp = mp;                                          \
2055         (data)->dd_len1 = len1;                                              \
2056         (data)->dd_offset1 = off1;                                   \
2057         (data)->dd_len2 = len2;                                              \
2058         (data)->dd_offset2 = off2;                                   \
2059 }
2060 
2061 /*
2062  * Returns data_mp if successfully completed the request. Returns
2063  * NULL if it failed (and increments InDiscards) or if it is pending.
2064  */
2065 static mblk_t *
2066 esp_submit_req_inbound(mblk_t *esp_mp, ip_recv_attr_t *ira,
2067     ipsa_t *assoc, uint_t esph_offset)
2068 {
2069         uint_t auth_offset, msg_len, auth_len;
2070         crypto_call_req_t call_req, *callrp;
2071         mblk_t *mp;
2072         esph_t *esph_ptr;
2073         int kef_rc;
2074         uint_t icv_len = assoc->ipsa_mac_len;
2075         crypto_ctx_template_t auth_ctx_tmpl;
2076         boolean_t do_auth, do_encr, force;
2077         uint_t encr_offset, encr_len;
2078         uint_t iv_len = assoc->ipsa_iv_len;
2079         crypto_ctx_template_t encr_ctx_tmpl;
2080         ipsec_crypto_t  *ic, icstack;
2081         uchar_t *iv_ptr;
2082         netstack_t *ns = ira->ira_ill->ill_ipst->ips_netstack;
2083         ipsec_stack_t *ipss = ns->netstack_ipsec;
2084         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2085 
2086         do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2087         do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2088         force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2089 
2090 #ifdef IPSEC_LATENCY_TEST
2091         kef_rc = CRYPTO_SUCCESS;
2092 #else
2093         kef_rc = CRYPTO_FAILED;
2094 #endif
2095 
2096         /*
2097          * An inbound packet is of the form:
2098          * [IP,options,ESP,IV,data,pad,ICV]
2099          */
2100         esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2101         iv_ptr = (uchar_t *)(esph_ptr + 1);
2102         /* Packet length starting at IP header ending after ESP ICV. */
2103         msg_len = MBLKL(esp_mp);
2104 
2105         encr_offset = esph_offset + sizeof (esph_t) + iv_len;
2106         encr_len = msg_len - encr_offset;
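        /*
         * As an illustrative example (algorithms assumed, not implied by
         * this code): with a 20-byte IPv4 header, AES-CBC (16-byte IV) and
         * HMAC-SHA1-96 (12-byte ICV), esph_offset is 20, the IV starts at
         * offset 28, encr_offset is 44, and encr_len covers everything from
         * the end of the IV to the end of the message, ICV included; the
         * ICV is excluded again where required below.
         */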
2107 
2108         /*
2109          * Counter mode algs need a nonce.  This is set up in
2110          * sadb_common_add().  If for some reason we are using an SA which
2111          * does not have a nonce, we must fail here.
2112          */
2113         if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2114             (assoc->ipsa_nonce == NULL)) {
2115                 ip_drop_packet(esp_mp, B_TRUE, ira->ira_ill,
2116                     DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2117                 return (NULL);
2118         }
2119 
2120         if (force) {
2121                 /* We are doing asynch; allocate mblks to hold state */
2122                 if ((mp = ip_recv_attr_to_mblk(ira)) == NULL ||
2123                     (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2124                         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
2125                         ip_drop_input("ipIfStatsInDiscards", esp_mp,
2126                             ira->ira_ill);
2127                         return (NULL);
2128                 }
2129                 linkb(mp, esp_mp);
2130                 callrp = &call_req;
2131                 ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_inbound);
2132         } else {
2133                 /*
2134                  * If we know we are going to do sync then ipsec_crypto_t
2135                  * should be on the stack.
2136                  */
2137                 ic = &icstack;
2138                 bzero(ic, sizeof (*ic));
2139                 callrp = NULL;
2140         }
2141 
2142         if (do_auth) {
2143                 /* authentication context template */
2144                 IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2145                     auth_ctx_tmpl);
2146 
2147                 /* ICV to be verified */
2148                 ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2149                     icv_len, esp_mp->b_wptr - icv_len);
2150 
2151                 /* authentication starts at the ESP header */
2152                 auth_offset = esph_offset;
2153                 auth_len = msg_len - auth_offset - icv_len;
2154                 if (!do_encr) {
2155                         /* authentication only */
2156                         /* initialize input data argument */
2157                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2158                             esp_mp, auth_offset, auth_len);
2159 
2160                         /* call the crypto framework */
2161                         kef_rc = crypto_mac_verify(&assoc->ipsa_amech,
2162                             &ic->ic_crypto_data,
2163                             &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2164                             &ic->ic_crypto_mac, callrp);
2165                 }
2166         }
2167 
2168         if (do_encr) {
2169                 /* encryption template */
2170                 IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2171                     encr_ctx_tmpl);
2172 
2173                 /* Call the nonce update function. Also passes in IV */
2174                 (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, encr_len,
2175                     iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2176 
2177                 if (!do_auth) {
2178                         /* decryption only */
2179                         /* initialize input data argument */
2180                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2181                             esp_mp, encr_offset, encr_len);
2182 
2183                         /* call the crypto framework */
2184                         kef_rc = crypto_decrypt((crypto_mechanism_t *)
2185                             &ic->ic_cmm, &ic->ic_crypto_data,
2186                             &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2187                             NULL, callrp);
2188                 }
2189         }
2190 
2191         if (do_auth && do_encr) {
2192                 /* dual operation */
2193                 /* initialize input data argument */
2194                 ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2195                     esp_mp, auth_offset, auth_len,
2196                     encr_offset, encr_len - icv_len);
2197 
2198                 /* specify IV */
2199                 ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2200 
2201                 /* call the framework */
2202                 kef_rc = crypto_mac_verify_decrypt(&assoc->ipsa_amech,
2203                     &assoc->ipsa_emech, &ic->ic_crypto_dual_data,
2204                     &assoc->ipsa_kcfauthkey, &assoc->ipsa_kcfencrkey,
2205                     auth_ctx_tmpl, encr_ctx_tmpl, &ic->ic_crypto_mac,
2206                     NULL, callrp);
2207         }
2208 
2209         switch (kef_rc) {
2210         case CRYPTO_SUCCESS:
2211                 ESP_BUMP_STAT(espstack, crypto_sync);
2212                 esp_mp = esp_in_done(esp_mp, ira, ic);
2213                 if (force) {
2214                         /* Free mp after we are done with ic */
2215                         mp = ipsec_free_crypto_data(mp);
2216                         (void) ip_recv_attr_free_mblk(mp);
2217                 }
2218                 return (esp_mp);
2219         case CRYPTO_QUEUED:
2220                 /* esp_kcf_callback_inbound() will be invoked on completion */
2221                 ESP_BUMP_STAT(espstack, crypto_async);
2222                 return (NULL);
2223         case CRYPTO_INVALID_MAC:
2224                 if (force) {
2225                         mp = ipsec_free_crypto_data(mp);
2226                         esp_mp = ip_recv_attr_free_mblk(mp);
2227                 }
2228                 ESP_BUMP_STAT(espstack, crypto_sync);
2229                 BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
2230                 esp_log_bad_auth(esp_mp, ira);
2231                 /* esp_mp was passed to ip_drop_packet */
2232                 return (NULL);
2233         }
2234 
2235         if (force) {
2236                 mp = ipsec_free_crypto_data(mp);
2237                 esp_mp = ip_recv_attr_free_mblk(mp);
2238         }
2239         BUMP_MIB(ira->ira_ill->ill_ip_mib, ipIfStatsInDiscards);
2240         esp_crypto_failed(esp_mp, B_TRUE, kef_rc, ira->ira_ill, espstack);
2241         /* esp_mp was passed to ip_drop_packet */
2242         return (NULL);
2243 }
2244 
2245 /*
2246  * Compute the IP and UDP checksums -- common code for both keepalives and
2247  * actual ESP-in-UDP packets.  Be flexible with multiple mblks because ESP
2248  * uses mblk-insertion to insert the UDP header.
2249  * TODO - If there is an easy way to prep a packet for HW checksums, make
2250  * it happen here.
2251  * Note that this is used both before calling ip_output_simple and in the
2252  * esp datapath.  The former could use IXAF_SET_ULP_CKSUM but not the
2253  * latter.
2254  */
2255 static void
2256 esp_prepare_udp(netstack_t *ns, mblk_t *mp, ipha_t *ipha)
2257 {
2258         int offset;
2259         uint32_t cksum;
2260         uint16_t *arr;
2261         mblk_t *udpmp = mp;
2262         uint_t hlen = IPH_HDR_LENGTH(ipha);
2263 
2264         ASSERT(MBLKL(mp) >= sizeof (ipha_t));
2265 
2266         ipha->ipha_hdr_checksum = 0;
2267         ipha->ipha_hdr_checksum = ip_csum_hdr(ipha);
2268 
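        /*
         * Software UDP checksum: seed the sum with the pseudo-header
         * (protocol, addresses, and UDP length), fold in the UDP header
         * and payload with IP_CSUM(), then walk the mblk chain to find
         * the checksum field, which may not live in the first mblk
         * because of the mblk-insertion mentioned above.
         */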
2269         if (ns->netstack_udp->us_do_checksum) {
2270                 ASSERT(MBLKL(udpmp) >= sizeof (udpha_t));
2271                 /* arr points to the IP header. */
2272                 arr = (uint16_t *)ipha;
2273                 IP_STAT(ns->netstack_ip, ip_out_sw_cksum);
2274                 IP_STAT_UPDATE(ns->netstack_ip, ip_out_sw_cksum_bytes,
2275                     ntohs(htons(ipha->ipha_length) - hlen));
2276                 /* arr[6-9] are the IP addresses. */
2277                 cksum = IP_UDP_CSUM_COMP + arr[6] + arr[7] + arr[8] + arr[9] +
2278                     ntohs(htons(ipha->ipha_length) - hlen);
2279                 cksum = IP_CSUM(mp, hlen, cksum);
2280                 offset = hlen + UDP_CHECKSUM_OFFSET;
2281                 while (offset >= MBLKL(udpmp)) {
2282                         offset -= MBLKL(udpmp);
2283                         udpmp = udpmp->b_cont;
2284                 }
2285                 /* arr points to the UDP header's checksum field. */
2286                 arr = (uint16_t *)(udpmp->b_rptr + offset);
2287                 *arr = cksum;
2288         }
2289 }
2290 
2291 /*
2292  * taskq handler so we can send the NAT-T keepalive on a separate thread.
2293  */
2294 static void
2295 actually_send_keepalive(void *arg)
2296 {
2297         mblk_t *mp = (mblk_t *)arg;
2298         ip_xmit_attr_t ixas;
2299         netstack_t      *ns;
2300         netstackid_t    stackid;
2301 
2302         stackid = (netstackid_t)(uintptr_t)mp->b_prev;
2303         mp->b_prev = NULL;
2304         ns = netstack_find_by_stackid(stackid);
2305         if (ns == NULL) {
2306                 /* Disappeared */
2307                 ip_drop_output("ipIfStatsOutDiscards", mp, NULL);
2308                 freemsg(mp);
2309                 return;
2310         }
2311 
2312         bzero(&ixas, sizeof (ixas));
2313         ixas.ixa_zoneid = ALL_ZONES;
2314         ixas.ixa_cred = kcred;
2315         ixas.ixa_cpid = NOPID;
2316         ixas.ixa_tsl = NULL;
2317         ixas.ixa_ipst = ns->netstack_ip;
2318         /* No ULP checksum; done by esp_prepare_udp */
2319         ixas.ixa_flags = (IXAF_IS_IPV4 | IXAF_NO_IPSEC | IXAF_VERIFY_SOURCE);
2320 
2321         (void) ip_output_simple(mp, &ixas);
2322         ixa_cleanup(&ixas);
2323         netstack_rele(ns);
2324 }
2325 
2326 /*
2327  * Send a one-byte UDP NAT-T keepalive.
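 *
 * Per RFC 3948, the NAT-keepalive is a UDP packet sent between the NAT-T
 * ports whose payload is the single octet 0xFF; it is sent in the clear,
 * outside of any SA.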
2328  */
2329 void
2330 ipsecesp_send_keepalive(ipsa_t *assoc)
2331 {
2332         mblk_t          *mp;
2333         ipha_t          *ipha;
2334         udpha_t         *udpha;
2335         netstack_t      *ns = assoc->ipsa_netstack;
2336 
2337         ASSERT(MUTEX_NOT_HELD(&assoc->ipsa_lock));
2338 
2339         mp = allocb(sizeof (ipha_t) + sizeof (udpha_t) + 1, BPRI_HI);
2340         if (mp == NULL)
2341                 return;
2342         ipha = (ipha_t *)mp->b_rptr;
2343         ipha->ipha_version_and_hdr_length = IP_SIMPLE_HDR_VERSION;
2344         ipha->ipha_type_of_service = 0;
2345         ipha->ipha_length = htons(sizeof (ipha_t) + sizeof (udpha_t) + 1);
2346         /* Use the low-16 of the SPI so we have some clue where it came from. */
2347         ipha->ipha_ident = *(((uint16_t *)(&assoc->ipsa_spi)) + 1);
2348         ipha->ipha_fragment_offset_and_flags = 0;  /* Too small to fragment! */
2349         ipha->ipha_ttl = 0xFF;
2350         ipha->ipha_protocol = IPPROTO_UDP;
2351         ipha->ipha_hdr_checksum = 0;
2352         ipha->ipha_src = assoc->ipsa_srcaddr[0];
2353         ipha->ipha_dst = assoc->ipsa_dstaddr[0];
2354         udpha = (udpha_t *)(ipha + 1);
2355         udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2356             assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2357         udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2358             assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2359         udpha->uha_length = htons(sizeof (udpha_t) + 1);
2360         udpha->uha_checksum = 0;
2361         mp->b_wptr = (uint8_t *)(udpha + 1);
2362         *(mp->b_wptr++) = 0xFF;
2363 
2364         esp_prepare_udp(ns, mp, ipha);
2365 
2366         /*
2367          * We're holding an isaf_t bucket lock, so pawn off the actual
2368          * packet transmission to another thread.  Just in case syncq
2369          * processing causes a same-bucket packet to be processed.
2370          */
2371         mp->b_prev = (mblk_t *)(uintptr_t)ns->netstack_stackid;
2372 
2373         if (taskq_dispatch(esp_taskq, actually_send_keepalive, mp,
2374             TQ_NOSLEEP) == 0) {
2375                 /* Assume no memory if taskq_dispatch() fails. */
2376                 mp->b_prev = NULL;
2377                 ip_drop_packet(mp, B_FALSE, NULL,
2378                     DROPPER(ns->netstack_ipsec, ipds_esp_nomem),
2379                     &ns->netstack_ipsecesp->esp_dropper);
2380         }
2381 }
2382 
2383 /*
2384  * Returns data_mp if it successfully completed the request.  Returns
2385  * NULL if it failed (and increments InDiscards) or if it is pending.
2386  */
2387 static mblk_t *
2388 esp_submit_req_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa, ipsa_t *assoc,
2389     uchar_t *icv_buf, uint_t payload_len)
2390 {
2391         uint_t auth_len;
2392         crypto_call_req_t call_req, *callrp;
2393         mblk_t *esp_mp;
2394         esph_t *esph_ptr;
2395         mblk_t *mp;
2396         int kef_rc = CRYPTO_FAILED;
2397         uint_t icv_len = assoc->ipsa_mac_len;
2398         crypto_ctx_template_t auth_ctx_tmpl;
2399         boolean_t do_auth, do_encr, force;
2400         uint_t iv_len = assoc->ipsa_iv_len;
2401         crypto_ctx_template_t encr_ctx_tmpl;
2402         boolean_t is_natt = ((assoc->ipsa_flags & IPSA_F_NATT) != 0);
2403         size_t esph_offset = (is_natt ? UDPH_SIZE : 0);
2404         netstack_t      *ns = ixa->ixa_ipst->ips_netstack;
2405         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2406         ipsec_crypto_t  *ic, icstack;
2407         uchar_t         *iv_ptr;
2408         crypto_data_t   *cd_ptr = NULL;
2409         ill_t           *ill = ixa->ixa_nce->nce_ill;
2410         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2411 
2412         esp3dbg(espstack, ("esp_submit_req_outbound:%s",
2413             is_natt ? "natt" : "not natt"));
2414 
2415         do_encr = assoc->ipsa_encr_alg != SADB_EALG_NULL;
2416         do_auth = assoc->ipsa_auth_alg != SADB_AALG_NONE;
2417         force = (assoc->ipsa_flags & IPSA_F_ASYNC);
2418 
2419 #ifdef IPSEC_LATENCY_TEST
2420         kef_rc = CRYPTO_SUCCESS;
2421 #else
2422         kef_rc = CRYPTO_FAILED;
2423 #endif
2424 
2425         /*
2426          * Outbound IPsec packets are of the form:
2427          * [IP,options] -> [ESP,IV] -> [data] -> [pad,ICV]
2428          * unless it's NATT, then it's
2429          * [IP,options] -> [udp][ESP,IV] -> [data] -> [pad,ICV]
2430          * Get a pointer to the mblk containing the ESP header.
2431          */
2432         ASSERT(data_mp->b_cont != NULL);
2433         esp_mp = data_mp->b_cont;
2434         esph_ptr = (esph_t *)(esp_mp->b_rptr + esph_offset);
2435         iv_ptr = (uchar_t *)(esph_ptr + 1);
2436 
2437         /*
2438          * Counter mode algs need a nonce.  This is set up in
2439          * sadb_common_add().  If for some reason we are using an SA which
2440          * does not have a nonce, we must fail here.
2441          */
2442         if ((assoc->ipsa_flags & IPSA_F_COUNTERMODE) &&
2443             (assoc->ipsa_nonce == NULL)) {
2444                 ip_drop_packet(data_mp, B_FALSE, NULL,
2445                     DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2446                 return (NULL);
2447         }
2448 
2449         if (force) {
2450                 /* We are doing asynch; allocate mblks to hold state */
2451                 if ((mp = ip_xmit_attr_to_mblk(ixa)) == NULL ||
2452                     (mp = ipsec_add_crypto_data(mp, &ic)) == NULL) {
2453                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2454                         ip_drop_output("ipIfStatsOutDiscards", data_mp, ill);
2455                         freemsg(data_mp);
2456                         return (NULL);
2457                 }
2458 
2459                 linkb(mp, data_mp);
2460                 callrp = &call_req;
2461                 ESP_INIT_CALLREQ(callrp, mp, esp_kcf_callback_outbound);
2462         } else {
2463                 /*
2464                  * If we know we are going to do sync then ipsec_crypto_t
2465                  * should be on the stack.
2466                  */
2467                 ic = &icstack;
2468                 bzero(ic, sizeof (*ic));
2469                 callrp = NULL;
2470         }
2471 
2472 
2473         if (do_auth) {
2474                 /* authentication context template */
2475                 IPSEC_CTX_TMPL(assoc, ipsa_authtmpl, IPSEC_ALG_AUTH,
2476                     auth_ctx_tmpl);
2477 
2478                 /* where to store the computed mac */
2479                 ESP_INIT_CRYPTO_MAC(&ic->ic_crypto_mac,
2480                     icv_len, icv_buf);
2481 
2482                 /* authentication starts at the ESP header */
2483                 auth_len = payload_len + iv_len + sizeof (esph_t);
2484                 if (!do_encr) {
2485                         /* authentication only */
2486                         /* initialize input data argument */
2487                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2488                             esp_mp, esph_offset, auth_len);
2489 
2490                         /* call the crypto framework */
2491                         kef_rc = crypto_mac(&assoc->ipsa_amech,
2492                             &ic->ic_crypto_data,
2493                             &assoc->ipsa_kcfauthkey, auth_ctx_tmpl,
2494                             &ic->ic_crypto_mac, callrp);
2495                 }
2496         }
2497 
2498         if (do_encr) {
2499                 /* encryption context template */
2500                 IPSEC_CTX_TMPL(assoc, ipsa_encrtmpl, IPSEC_ALG_ENCR,
2501                     encr_ctx_tmpl);
2502                 /* Call the nonce update function. */
2503                 (assoc->ipsa_noncefunc)(assoc, (uchar_t *)esph_ptr, payload_len,
2504                     iv_ptr, &ic->ic_cmm, &ic->ic_crypto_data);
2505 
2506                 if (!do_auth) {
2507                         /* encryption only, skip mblk that contains ESP hdr */
2508                         /* initialize input data argument */
2509                         ESP_INIT_CRYPTO_DATA(&ic->ic_crypto_data,
2510                             esp_mp->b_cont, 0, payload_len);
2511 
2512                         /*
2513                          * For combined mode ciphers, the ciphertext is the
2514                          * same size as the clear text and the ICV follows
2515                          * the ciphertext.  To convince the KCF to allow
2516                          * in-line encryption with an ICV, point
2517                          * ic_crypto_mac at the same buffer as the data.
2518                          * The calling function needs to ensure the buffer
2519                          * is large enough to include the ICV.
2520                          *
2521                          * The IV has already been written to the packet
2522                          * buffer; the nonce setup function copied it into
2523                          * the params struct for the cipher to use.
2524                          */
2525                         if (assoc->ipsa_flags & IPSA_F_COMBINED) {
2526                                 bcopy(&ic->ic_crypto_data,
2527                                     &ic->ic_crypto_mac,
2528                                     sizeof (crypto_data_t));
2529                                 ic->ic_crypto_mac.cd_length =
2530                                     payload_len + icv_len;
2531                                 cd_ptr = &ic->ic_crypto_mac;
2532                         }
2533 
2534                         /* call the crypto framework */
2535                         kef_rc = crypto_encrypt((crypto_mechanism_t *)
2536                             &ic->ic_cmm, &ic->ic_crypto_data,
2537                             &assoc->ipsa_kcfencrkey, encr_ctx_tmpl,
2538                             cd_ptr, callrp);
2539 
2540                 }
2541         }
2542 
2543         if (do_auth && do_encr) {
2544                 /*
2545                  * Encryption and authentication:
2546                  * Pass the pointer to the mblk chain starting at the ESP
2547                  * header to the framework. Skip the ESP header mblk
2548                  * for encryption, which is reflected by an encryption
2549                  * offset equal to the length of that mblk. Start
2550                  * the authentication at the ESP header, i.e. use an
2551                  * authentication offset of zero.
2552                  */
2553                 ESP_INIT_CRYPTO_DUAL_DATA(&ic->ic_crypto_dual_data,
2554                     esp_mp, MBLKL(esp_mp), payload_len, esph_offset, auth_len);
2555 
2556                 /* specify IV */
2557                 ic->ic_crypto_dual_data.dd_miscdata = (char *)iv_ptr;
2558 
2559                 /* call the framework */
2560                 kef_rc = crypto_encrypt_mac(&assoc->ipsa_emech,
2561                     &assoc->ipsa_amech, NULL,
2562                     &assoc->ipsa_kcfencrkey, &assoc->ipsa_kcfauthkey,
2563                     encr_ctx_tmpl, auth_ctx_tmpl,
2564                     &ic->ic_crypto_dual_data,
2565                     &ic->ic_crypto_mac, callrp);
2566         }
2567 
2568         switch (kef_rc) {
2569         case CRYPTO_SUCCESS:
2570                 ESP_BUMP_STAT(espstack, crypto_sync);
2571                 esp_set_usetime(assoc, B_FALSE);
2572                 if (force) {
2573                         mp = ipsec_free_crypto_data(mp);
2574                         data_mp = ip_xmit_attr_free_mblk(mp);
2575                 }
2576                 if (is_natt)
2577                         esp_prepare_udp(ns, data_mp, (ipha_t *)data_mp->b_rptr);
2578                 return (data_mp);
2579         case CRYPTO_QUEUED:
2580                 /* esp_kcf_callback_outbound() will be invoked on completion */
2581                 ESP_BUMP_STAT(espstack, crypto_async);
2582                 return (NULL);
2583         }
2584 
2585         if (force) {
2586                 mp = ipsec_free_crypto_data(mp);
2587                 data_mp = ip_xmit_attr_free_mblk(mp);
2588         }
2589         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2590         esp_crypto_failed(data_mp, B_FALSE, kef_rc, NULL, espstack);
2591         /* data_mp was passed to ip_drop_packet */
2592         return (NULL);
2593 }
2594 
2595 /*
2596  * Handle outbound IPsec processing for IPv4 and IPv6
2597  *
2598  * Returns data_mp if successfully completed the request. Returns
2599  * NULL if it failed (and increments InDiscards) or if it is pending.
2600  */
2601 static mblk_t *
2602 esp_outbound(mblk_t *data_mp, ip_xmit_attr_t *ixa)
2603 {
2604         mblk_t *espmp, *tailmp;
2605         ipha_t *ipha;
2606         ip6_t *ip6h;
2607         esph_t *esph_ptr, *iv_ptr;
2608         uint_t af;
2609         uint8_t *nhp;
2610         uintptr_t divpoint, datalen, adj, padlen, i, alloclen;
2611         uintptr_t esplen = sizeof (esph_t);
2612         uint8_t protocol;
2613         ipsa_t *assoc;
2614         uint_t iv_len, block_size, mac_len = 0;
2615         uchar_t *icv_buf;
2616         udpha_t *udpha;
2617         boolean_t is_natt = B_FALSE;
2618         netstack_t      *ns = ixa->ixa_ipst->ips_netstack;
2619         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2620         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2621         ill_t           *ill = ixa->ixa_nce->nce_ill;
2622         boolean_t       need_refrele = B_FALSE;
2623 
2624         ESP_BUMP_STAT(espstack, out_requests);
2625 
2626         /*
2627          * <sigh> We have to copy the message here, because TCP (for example)
2628          * keeps a dupb() of the message lying around for retransmission.
2629          * Since ESP changes the whole of the datagram, we have to create our
2630          * own copy lest we clobber TCP's data.  Since we have to copy anyway,
2631          * we might as well make use of msgpullup() and get the mblk into one
2632          * contiguous piece!
2633          */
2634         tailmp = msgpullup(data_mp, -1);
2635         if (tailmp == NULL) {
2636                 esp0dbg(("esp_outbound: msgpullup() failed, "
2637                     "dropping packet.\n"));
2638                 ip_drop_packet(data_mp, B_FALSE, ill,
2639                     DROPPER(ipss, ipds_esp_nomem),
2640                     &espstack->esp_dropper);
2641                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2642                 return (NULL);
2643         }
2644         freemsg(data_mp);
2645         data_mp = tailmp;
2646 
2647         assoc = ixa->ixa_ipsec_esp_sa;
2648         ASSERT(assoc != NULL);
2649 
2650         /*
2651          * Get the outer IP header in shape to escape this system.
2652          */
2653         if (is_system_labeled() && (assoc->ipsa_otsl != NULL)) {
2654                 /*
2655                  * Need to update packet with any CIPSO option and update
2656                  * ixa_tsl to capture the new label.
2657                  * We allocate a separate ixa for that purpose.
2658                  */
2659                 ixa = ip_xmit_attr_duplicate(ixa);
2660                 if (ixa == NULL) {
2661                         ip_drop_packet(data_mp, B_FALSE, ill,
2662                             DROPPER(ipss, ipds_esp_nomem),
2663                             &espstack->esp_dropper);
2664                         return (NULL);
2665                 }
2666                 need_refrele = B_TRUE;
2667 
2668                 label_hold(assoc->ipsa_otsl);
2669                 ip_xmit_attr_replace_tsl(ixa, assoc->ipsa_otsl);
2670 
2671                 data_mp = sadb_whack_label(data_mp, assoc, ixa,
2672                     DROPPER(ipss, ipds_esp_nomem), &espstack->esp_dropper);
2673                 if (data_mp == NULL) {
2674                         /* Packet dropped by sadb_whack_label */
2675                         ixa_refrele(ixa);
2676                         return (NULL);
2677                 }
2678         }
2679 
2680         /*
2681          * Reality check....
2682          */
2683         ipha = (ipha_t *)data_mp->b_rptr;  /* So we can call esp_acquire(). */
2684 
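             /*
              * From here on, divpoint marks where the ESP header will be
              * inserted (just past the IP header and any extension headers
              * that must stay outside ESP), datalen is the number of payload
              * bytes that ESP will cover, and nhp points at the
              * next-header/protocol field that is later overwritten with
              * IPPROTO_ESP (or IPPROTO_UDP for NAT-T).
              */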
2685         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2686                 ASSERT(IPH_HDR_VERSION(ipha) == IPV4_VERSION);
2687 
2688                 af = AF_INET;
2689                 divpoint = IPH_HDR_LENGTH(ipha);
2690                 datalen = ntohs(ipha->ipha_length) - divpoint;
2691                 nhp = (uint8_t *)&ipha->ipha_protocol;
2692         } else {
2693                 ip_pkt_t ipp;
2694 
2695                 ASSERT(IPH_HDR_VERSION(ipha) == IPV6_VERSION);
2696 
2697                 af = AF_INET6;
2698                 ip6h = (ip6_t *)ipha;
2699                 bzero(&ipp, sizeof (ipp));
2700                 divpoint = ip_find_hdr_v6(data_mp, ip6h, B_FALSE, &ipp, NULL);
2701                 if (ipp.ipp_dstopts != NULL &&
2702                     ipp.ipp_dstopts->ip6d_nxt != IPPROTO_ROUTING) {
2703                         /*
2704                          * Destination options are tricky.  If we get in here,
2705                          * then we have a terminal header following the
2706                          * destination options.  We need to adjust backwards
2707                          * so we insert ESP BEFORE the destination options
2708                          * bag.  (So that the dstopts get encrypted!)
2709                          *
2710                          * Since this is for outbound packets only, we know
2711                          * that non-terminal destination options only precede
2712                          * routing headers.
2713                          */
2714                         divpoint -= ipp.ipp_dstoptslen;
2715                 }
2716                 datalen = ntohs(ip6h->ip6_plen) + sizeof (ip6_t) - divpoint;
2717 
2718                 if (ipp.ipp_rthdr != NULL) {
2719                         nhp = &ipp.ipp_rthdr->ip6r_nxt;
2720                 } else if (ipp.ipp_hopopts != NULL) {
2721                         nhp = &ipp.ipp_hopopts->ip6h_nxt;
2722                 } else {
2723                         ASSERT(divpoint == sizeof (ip6_t));
2724                         /* It's probably IP + ESP. */
2725                         nhp = &ip6h->ip6_nxt;
2726                 }
2727         }
2728 
2729         mac_len = assoc->ipsa_mac_len;
2730 
2731         if (assoc->ipsa_flags & IPSA_F_NATT) {
2732                 /* wedge in UDP header */
2733                 is_natt = B_TRUE;
2734                 esplen += UDPH_SIZE;
2735         }
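             /*
              * For NAT-T the UDP header is emitted in front of the ESP header
              * in the same mblk, giving the on-the-wire layout
              * IP | UDP | ESP | payload described by RFC 3948 (UDP
              * encapsulation of ESP).
              */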
2736 
2737         /*
2738          * Set up ESP header and encryption padding for ENCR PI request.
2739          */
2740 
2741         /* Determine the padding length.  Pad to 4 bytes when not encrypting. */
2742         if (assoc->ipsa_encr_alg != SADB_EALG_NULL) {
2743                 iv_len = assoc->ipsa_iv_len;
2744                 block_size = assoc->ipsa_datalen;
2745 
2746                 /*
2747                  * Pad the data to the length of the cipher block size.
2748                  * Include the two additional bytes (hence the - 2) for the
2749                  * padding length and the next header.  Take this into account
2750                  * when calculating the actual length of the padding.
2751                  */
2752                 ASSERT(ISP2(iv_len));
2753                 padlen = ((unsigned)(block_size - datalen - 2)) &
2754                     (block_size - 1);
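                     /*
                      * For example (illustrative): with a 16-byte block cipher
                      * and a datalen of 100 bytes, padlen = (16 - 100 - 2) & 15
                      * = 10, so datalen + padlen + 2 = 112, a multiple of the
                      * block size.
                      */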
2755         } else {
2756                 iv_len = 0;
2757                 padlen = ((unsigned)(sizeof (uint32_t) - datalen - 2)) &
2758                     (sizeof (uint32_t) - 1);
2759         }
2760 
2761         /* Allocate ESP header and IV. */
2762         esplen += iv_len;
2763 
2764         /*
2765          * Update association byte-count lifetimes.  Don't forget to take
2766          * into account the padding length and next-header (hence the + 2).
2767          *
2768          * Use the amount of data fed into the "encryption algorithm".  This
2769          * is the IV, the data length, the padding length, and the final two
2770          * bytes (padlen, and next-header).
2771          *
2772          */
2773 
2774         if (!esp_age_bytes(assoc, datalen + padlen + iv_len + 2, B_FALSE)) {
2775                 ip_drop_packet(data_mp, B_FALSE, ill,
2776                     DROPPER(ipss, ipds_esp_bytes_expire),
2777                     &espstack->esp_dropper);
2778                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2779                 if (need_refrele)
2780                         ixa_refrele(ixa);
2781                 return (NULL);
2782         }
2783 
2784         espmp = allocb(esplen, BPRI_HI);
2785         if (espmp == NULL) {
2786                 ESP_BUMP_STAT(espstack, out_discards);
2787                 esp1dbg(espstack, ("esp_outbound: can't allocate espmp.\n"));
2788                 ip_drop_packet(data_mp, B_FALSE, ill,
2789                     DROPPER(ipss, ipds_esp_nomem),
2790                     &espstack->esp_dropper);
2791                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2792                 if (need_refrele)
2793                         ixa_refrele(ixa);
2794                 return (NULL);
2795         }
2796         espmp->b_wptr += esplen;
2797         esph_ptr = (esph_t *)espmp->b_rptr;
2798 
2799         if (is_natt) {
2800                 esp3dbg(espstack, ("esp_outbound: NATT"));
2801 
2802                 udpha = (udpha_t *)espmp->b_rptr;
2803                 udpha->uha_src_port = (assoc->ipsa_local_nat_port != 0) ?
2804                     assoc->ipsa_local_nat_port : htons(IPPORT_IKE_NATT);
2805                 udpha->uha_dst_port = (assoc->ipsa_remote_nat_port != 0) ?
2806                     assoc->ipsa_remote_nat_port : htons(IPPORT_IKE_NATT);
2807                 /*
2808                  * Set the checksum to 0, so that the esp_prepare_udp() call
2809                  * can do the right thing.
2810                  */
2811                 udpha->uha_checksum = 0;
2812                 esph_ptr = (esph_t *)(udpha + 1);
2813         }
2814 
2815         esph_ptr->esph_spi = assoc->ipsa_spi;
2816 
2817         esph_ptr->esph_replay = htonl(atomic_inc_32_nv(&assoc->ipsa_replay));
2818         if (esph_ptr->esph_replay == 0 && assoc->ipsa_replay_wsize != 0) {
2819                 /*
2820                  * XXX We have replay counter wrapping.
2821                  * We probably want to nuke this SA (and its peer).
2822                  */
2823                 ipsec_assocfailure(info.mi_idnum, 0, 0,
2824                     SL_ERROR | SL_CONSOLE | SL_WARN,
2825                     "Outbound ESP SA (0x%x, %s) has wrapped sequence.\n",
2826                     esph_ptr->esph_spi, assoc->ipsa_dstaddr, af,
2827                     espstack->ipsecesp_netstack);
2828 
2829                 ESP_BUMP_STAT(espstack, out_discards);
2830                 sadb_replay_delete(assoc);
2831                 ip_drop_packet(data_mp, B_FALSE, ill,
2832                     DROPPER(ipss, ipds_esp_replay),
2833                     &espstack->esp_dropper);
2834                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2835                 if (need_refrele)
2836                         ixa_refrele(ixa);
2837                 return (NULL);
2838         }
2839 
2840         iv_ptr = (esph_ptr + 1);
2841         /*
2842          * iv_ptr points to the location, just past the ESP header, where
2843          * the IV will be written.  The mblk holding it becomes part of
2844          * the mblk chain that makes up the packet.
2845          *
2846          * For counter mode algorithms, the IV is a 64-bit quantity; it
2847          * must NEVER repeat in the lifetime of the SA, otherwise an
2848          * attacker who had recorded enough packets might be able to
2849          * determine some clear text.
2850          *
2851          * To ensure this does not happen, the IV is stored in the SA and
2852          * incremented for each packet, the IV is then copied into the
2853          * "packet" for transmission to the receiving system. The IV will
2854          * also be copied into the nonce, when the packet is encrypted.
2855          *
2856          * CBC mode algorithms use a random IV for each packet. We do not
2857          * require the highest quality random bits, but for best security
2858          * with CBC mode ciphers, the value must be unlikely to repeat and
2859          * must not be known in advance to an adversary capable of influencing
2860          * the clear text.
2861          */
2862         if (!update_iv((uint8_t *)iv_ptr, espstack->esp_pfkey_q, assoc,
2863             espstack)) {
2864                 ip_drop_packet(data_mp, B_FALSE, ill,
2865                     DROPPER(ipss, ipds_esp_iv_wrap), &espstack->esp_dropper);
2866                 if (need_refrele)
2867                         ixa_refrele(ixa);
2868                 return (NULL);
2869         }
2870 
2871         /* Fix the IP header. */
2872         alloclen = padlen + 2 + mac_len;
2873         adj = alloclen + (espmp->b_wptr - espmp->b_rptr);
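             /*
              * adj is the total growth of the datagram: the ESP header, the
              * IV and (for NAT-T) the UDP header in front, plus the trailer
              * (alloclen = padding + pad-length byte + next-header byte +
              * ICV) at the tail.
              */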
2874 
2875         protocol = *nhp;
2876 
2877         if (ixa->ixa_flags & IXAF_IS_IPV4) {
2878                 ipha->ipha_length = htons(ntohs(ipha->ipha_length) + adj);
2879                 if (is_natt) {
2880                         *nhp = IPPROTO_UDP;
2881                         udpha->uha_length = htons(ntohs(ipha->ipha_length) -
2882                             IPH_HDR_LENGTH(ipha));
2883                 } else {
2884                         *nhp = IPPROTO_ESP;
2885                 }
2886                 ipha->ipha_hdr_checksum = 0;
2887                 ipha->ipha_hdr_checksum = (uint16_t)ip_csum_hdr(ipha);
2888         } else {
2889                 ip6h->ip6_plen = htons(ntohs(ip6h->ip6_plen) + adj);
2890                 *nhp = IPPROTO_ESP;
2891         }
2892 
2893         /* I've got the two ESP mblks, now insert them. */
2894 
2895         esp2dbg(espstack, ("data_mp before outbound ESP adjustment:\n"));
2896         esp2dbg(espstack, (dump_msg(data_mp)));
2897 
2898         if (!esp_insert_esp(data_mp, espmp, divpoint, espstack)) {
2899                 ESP_BUMP_STAT(espstack, out_discards);
2900                 /* NOTE:  esp_insert_esp() only fails if there's no memory. */
2901                 ip_drop_packet(data_mp, B_FALSE, ill,
2902                     DROPPER(ipss, ipds_esp_nomem),
2903                     &espstack->esp_dropper);
2904                 freeb(espmp);
2905                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2906                 if (need_refrele)
2907                         ixa_refrele(ixa);
2908                 return (NULL);
2909         }
2910 
2911         /* Append padding (and leave room for ICV). */
2912         for (tailmp = data_mp; tailmp->b_cont != NULL; tailmp = tailmp->b_cont)
2913                 ;
2914         if (tailmp->b_wptr + alloclen > tailmp->b_datap->db_lim) {
2915                 tailmp->b_cont = allocb(alloclen, BPRI_HI);
2916                 if (tailmp->b_cont == NULL) {
2917                         ESP_BUMP_STAT(espstack, out_discards);
2918                         esp0dbg(("esp_outbound:  Can't allocate tailmp.\n"));
2919                         ip_drop_packet(data_mp, B_FALSE, ill,
2920                             DROPPER(ipss, ipds_esp_nomem),
2921                             &espstack->esp_dropper);
2922                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
2923                         if (need_refrele)
2924                                 ixa_refrele(ixa);
2925                         return (NULL);
2926                 }
2927                 tailmp = tailmp->b_cont;
2928         }
2929 
2930         /*
2931          * If there's padding, N bytes of padding must be of the form 0x1,
2932          * 0x2, 0x3... 0xN.
2933          */
2934         for (i = 0; i < padlen; ) {
2935                 i++;
2936                 *tailmp->b_wptr++ = i;
2937         }
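             /*
              * The loop leaves i == padlen, which is exactly the value the
              * ESP trailer's pad-length byte must carry; the saved original
              * next-header value follows it.
              */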
2938         *tailmp->b_wptr++ = i;
2939         *tailmp->b_wptr++ = protocol;
2940 
2941         esp2dbg(espstack, ("data_Mp before encryption:\n"));
2942         esp2dbg(espstack, (dump_msg(data_mp)));
2943 
2944         /*
2945          * Okay.  I've set up the pre-encryption ESP.  Let's do it!
2946          */
2947 
2948         if (mac_len > 0) {
2949                 ASSERT(tailmp->b_wptr + mac_len <= tailmp->b_datap->db_lim);
2950                 icv_buf = tailmp->b_wptr;
2951                 tailmp->b_wptr += mac_len;
2952         } else {
2953                 icv_buf = NULL;
2954         }
2955 
2956         data_mp = esp_submit_req_outbound(data_mp, ixa, assoc, icv_buf,
2957             datalen + padlen + 2);
2958         if (need_refrele)
2959                 ixa_refrele(ixa);
2960         return (data_mp);
2961 }
2962 
2963 /*
2964  * IP calls this to validate the ICMP errors that
2965  * we got from the network.
2966  */
2967 mblk_t *
2968 ipsecesp_icmp_error(mblk_t *data_mp, ip_recv_attr_t *ira)
2969 {
2970         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
2971         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
2972         ipsec_stack_t   *ipss = ns->netstack_ipsec;
2973 
2974         /*
2975          * Unless we get an entire packet back, this function is useless.
2976          * Why?
2977          *
2978          * 1.)  Partial packets are useless, because the "next header"
2979          *      is at the end of the decrypted ESP packet.  Without the
2980          *      whole packet, we can't recover it.
2981          *
2982          * 2.)  If we ever use a stateful cipher, such as a stream or a
2983          *      one-time pad, we can't do anything.
2984          *
2985          * Since the chances of getting an entire packet back are very
2986          * small, we discard here.
2987          */
2988         IP_ESP_BUMP_STAT(ipss, in_discards);
2989         ip_drop_packet(data_mp, B_TRUE, ira->ira_ill,
2990             DROPPER(ipss, ipds_esp_icmp),
2991             &espstack->esp_dropper);
2992         return (NULL);
2993 }
2994 
2995 /*
2996  * Construct an SADB_REGISTER message with the current algorithms.
2997  * This function gets called when 'ipsecalgs -s' is run or when
2998  * in.iked (or other KMD) starts.
2999  */
3000 static boolean_t
3001 esp_register_out(uint32_t sequence, uint32_t pid, uint_t serial,
3002     ipsecesp_stack_t *espstack, cred_t *cr)
3003 {
3004         mblk_t *pfkey_msg_mp, *keysock_out_mp;
3005         sadb_msg_t *samsg;
3006         sadb_supported_t *sasupp_auth = NULL;
3007         sadb_supported_t *sasupp_encr = NULL;
3008         sadb_alg_t *saalg;
3009         uint_t allocsize = sizeof (*samsg);
3010         uint_t i, numalgs_snap;
3011         int current_aalgs;
3012         ipsec_alginfo_t **authalgs;
3013         uint_t num_aalgs;
3014         int current_ealgs;
3015         ipsec_alginfo_t **encralgs;
3016         uint_t num_ealgs;
3017         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3018         sadb_sens_t *sens;
3019         size_t sens_len = 0;
3020         sadb_ext_t *nextext;
3021         ts_label_t *sens_tsl = NULL;
3022 
3023         /* Allocate the KEYSOCK_OUT. */
3024         keysock_out_mp = sadb_keysock_out(serial);
3025         if (keysock_out_mp == NULL) {
3026                 esp0dbg(("esp_register_out: couldn't allocate mblk.\n"));
3027                 return (B_FALSE);
3028         }
3029 
3030         if (is_system_labeled() && (cr != NULL)) {
3031                 sens_tsl = crgetlabel(cr);
3032                 if (sens_tsl != NULL) {
3033                         sens_len = sadb_sens_len_from_label(sens_tsl);
3034                         allocsize += sens_len;
3035                 }
3036         }
3037 
3038         /*
3039          * Allocate the PF_KEY message that follows KEYSOCK_OUT.
3040          */
3041 
3042         mutex_enter(&ipss->ipsec_alg_lock);
3043         /*
3044          * Fill in the SADB_REGISTER message's algorithm descriptors.  Hold
3045          * the lock while filling them in.
3046          *
3047          * Return only valid algorithms, so the number of algorithms
3048          * to send up may be less than the number of algorithm entries
3049          * in the table.
3050          */
3051         authalgs = ipss->ipsec_alglists[IPSEC_ALG_AUTH];
3052         for (num_aalgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
3053                 if (authalgs[i] != NULL && ALG_VALID(authalgs[i]))
3054                         num_aalgs++;
3055 
3056         if (num_aalgs != 0) {
3057                 allocsize += (num_aalgs * sizeof (*saalg));
3058                 allocsize += sizeof (*sasupp_auth);
3059         }
3060         encralgs = ipss->ipsec_alglists[IPSEC_ALG_ENCR];
3061         for (num_ealgs = 0, i = 0; i < IPSEC_MAX_ALGS; i++)
3062                 if (encralgs[i] != NULL && ALG_VALID(encralgs[i]))
3063                         num_ealgs++;
3064 
3065         if (num_ealgs != 0) {
3066                 allocsize += (num_ealgs * sizeof (*saalg));
3067                 allocsize += sizeof (*sasupp_encr);
3068         }
3069         keysock_out_mp->b_cont = allocb(allocsize, BPRI_HI);
3070         if (keysock_out_mp->b_cont == NULL) {
3071                 mutex_exit(&ipss->ipsec_alg_lock);
3072                 freemsg(keysock_out_mp);
3073                 return (B_FALSE);
3074         }
3075         pfkey_msg_mp = keysock_out_mp->b_cont;
3076         pfkey_msg_mp->b_wptr += allocsize;
3077 
3078         nextext = (sadb_ext_t *)(pfkey_msg_mp->b_rptr + sizeof (*samsg));
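             /*
              * The PF_KEY message is laid out contiguously in pfkey_msg_mp:
              * sadb_msg_t, then (if present) an SADB_EXT_SUPPORTED_AUTH
              * extension followed by one sadb_alg_t per valid auth algorithm,
              * then the same for SADB_EXT_SUPPORTED_ENCRYPT, and finally an
              * optional sensitivity extension.  nextext tracks where the next
              * extension begins.
              */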
3079 
3080         if (num_aalgs != 0) {
3081                 sasupp_auth = (sadb_supported_t *)nextext;
3082                 saalg = (sadb_alg_t *)(sasupp_auth + 1);
3083 
3084                 ASSERT(((ulong_t)saalg & 0x7) == 0);
3085 
3086                 numalgs_snap = 0;
3087                 for (i = 0;
3088                     ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_aalgs));
3089                     i++) {
3090                         if (authalgs[i] == NULL || !ALG_VALID(authalgs[i]))
3091                                 continue;
3092 
3093                         saalg->sadb_alg_id = authalgs[i]->alg_id;
3094                         saalg->sadb_alg_ivlen = 0;
3095                         saalg->sadb_alg_minbits = authalgs[i]->alg_ef_minbits;
3096                         saalg->sadb_alg_maxbits = authalgs[i]->alg_ef_maxbits;
3097                         saalg->sadb_x_alg_increment =
3098                             authalgs[i]->alg_increment;
3099                         saalg->sadb_x_alg_saltbits = SADB_8TO1(
3100                             authalgs[i]->alg_saltlen);
3101                         numalgs_snap++;
3102                         saalg++;
3103                 }
3104                 ASSERT(numalgs_snap == num_aalgs);
3105 #ifdef DEBUG
3106                 /*
3107                  * Reality check to make sure I snagged all of the
3108                  * algorithms.
3109                  */
3110                 for (; i < IPSEC_MAX_ALGS; i++) {
3111                         if (authalgs[i] != NULL && ALG_VALID(authalgs[i])) {
3112                                 cmn_err(CE_PANIC, "esp_register_out()! "
3113                                     "Missed aalg #%d.\n", i);
3114                         }
3115                 }
3116 #endif /* DEBUG */
3117                 nextext = (sadb_ext_t *)saalg;
3118         }
3119 
3120         if (num_ealgs != 0) {
3121                 sasupp_encr = (sadb_supported_t *)nextext;
3122                 saalg = (sadb_alg_t *)(sasupp_encr + 1);
3123 
3124                 numalgs_snap = 0;
3125                 for (i = 0;
3126                     ((i < IPSEC_MAX_ALGS) && (numalgs_snap < num_ealgs)); i++) {
3127                         if (encralgs[i] == NULL || !ALG_VALID(encralgs[i]))
3128                                 continue;
3129                         saalg->sadb_alg_id = encralgs[i]->alg_id;
3130                         saalg->sadb_alg_ivlen = encralgs[i]->alg_ivlen;
3131                         saalg->sadb_alg_minbits = encralgs[i]->alg_ef_minbits;
3132                         saalg->sadb_alg_maxbits = encralgs[i]->alg_ef_maxbits;
3133                         /*
3134                          * We could advertise the ICV length, except there
3135                          * is not a value in sadb_x_algb to do this.
3136                          * is no field in sadb_alg_t to carry it.
3137                          */
3138                         saalg->sadb_x_alg_increment =
3139                             encralgs[i]->alg_increment;
3140                         saalg->sadb_x_alg_saltbits =
3141                             SADB_8TO1(encralgs[i]->alg_saltlen);
3142 
3143                         numalgs_snap++;
3144                         saalg++;
3145                 }
3146                 ASSERT(numalgs_snap == num_ealgs);
3147 #ifdef DEBUG
3148                 /*
3149                  * Reality check to make sure I snagged all of the
3150                  * algorithms.
3151                  */
3152                 for (; i < IPSEC_MAX_ALGS; i++) {
3153                         if (encralgs[i] != NULL && ALG_VALID(encralgs[i])) {
3154                                 cmn_err(CE_PANIC, "esp_register_out()! "
3155                                     "Missed ealg #%d.\n", i);
3156                         }
3157                 }
3158 #endif /* DEBUG */
3159                 nextext = (sadb_ext_t *)saalg;
3160         }
3161 
3162         current_aalgs = num_aalgs;
3163         current_ealgs = num_ealgs;
3164 
3165         mutex_exit(&ipss->ipsec_alg_lock);
3166 
3167         if (sens_tsl != NULL) {
3168                 sens = (sadb_sens_t *)nextext;
3169                 sadb_sens_from_label(sens, SADB_EXT_SENSITIVITY,
3170                     sens_tsl, sens_len);
3171 
3172                 nextext = (sadb_ext_t *)(((uint8_t *)sens) + sens_len);
3173         }
3174 
3175         /* Now fill the rest of the SADB_REGISTER message. */
3176 
3177         samsg = (sadb_msg_t *)pfkey_msg_mp->b_rptr;
3178         samsg->sadb_msg_version = PF_KEY_V2;
3179         samsg->sadb_msg_type = SADB_REGISTER;
3180         samsg->sadb_msg_errno = 0;
3181         samsg->sadb_msg_satype = SADB_SATYPE_ESP;
3182         samsg->sadb_msg_len = SADB_8TO64(allocsize);
3183         samsg->sadb_msg_reserved = 0;
3184         /*
3185          * Assume the caller has sufficient sequence/pid number info.  If this is
3186          * one I generated myself (e.g. for a new algorithm), sequence is moot.
3187          */
3188         samsg->sadb_msg_seq = sequence;
3189         samsg->sadb_msg_pid = pid;
3190 
3191         if (sasupp_auth != NULL) {
3192                 sasupp_auth->sadb_supported_len = SADB_8TO64(
3193                     sizeof (*sasupp_auth) + sizeof (*saalg) * current_aalgs);
3194                 sasupp_auth->sadb_supported_exttype = SADB_EXT_SUPPORTED_AUTH;
3195                 sasupp_auth->sadb_supported_reserved = 0;
3196         }
3197 
3198         if (sasupp_encr != NULL) {
3199                 sasupp_encr->sadb_supported_len = SADB_8TO64(
3200                     sizeof (*sasupp_encr) + sizeof (*saalg) * current_ealgs);
3201                 sasupp_encr->sadb_supported_exttype =
3202                     SADB_EXT_SUPPORTED_ENCRYPT;
3203                 sasupp_encr->sadb_supported_reserved = 0;
3204         }
3205 
3206         if (espstack->esp_pfkey_q != NULL)
3207                 putnext(espstack->esp_pfkey_q, keysock_out_mp);
3208         else {
3209                 freemsg(keysock_out_mp);
3210                 return (B_FALSE);
3211         }
3212 
3213         return (B_TRUE);
3214 }
3215 
3216 /*
3217  * Invoked when the algorithm table changes. Causes SADB_REGISTER
3218  * messages containing the current list of algorithms to be
3219  * sent up to the ESP listeners.
3220  */
3221 void
3222 ipsecesp_algs_changed(netstack_t *ns)
3223 {
3224         ipsecesp_stack_t        *espstack = ns->netstack_ipsecesp;
3225 
3226         /*
3227          * Time to send a PF_KEY SADB_REGISTER message to ESP listeners
3228          * everywhere.  (The function itself checks for NULL esp_pfkey_q.)
3229          */
3230         (void) esp_register_out(0, 0, 0, espstack, NULL);
3231 }
3232 
3233 /*
3234  * Stub function that taskq_dispatch() invokes to take the mblk (in arg)
3235  * and feed it back into ESP and IP processing.
3236  */
3237 static void
3238 inbound_task(void *arg)
3239 {
3240         mblk_t          *mp = (mblk_t *)arg;
3241         mblk_t          *async_mp;
3242         ip_recv_attr_t  iras;
3243 
3244         async_mp = mp;
3245         mp = async_mp->b_cont;
3246         async_mp->b_cont = NULL;
3247         if (!ip_recv_attr_from_mblk(async_mp, &iras)) {
3248                 /* The ill or ip_stack_t disappeared on us */
3249                 ip_drop_input("ip_recv_attr_from_mblk", mp, NULL);
3250                 freemsg(mp);
3251                 goto done;
3252         }
3253 
3254         esp_inbound_restart(mp, &iras);
3255 done:
3256         ira_cleanup(&iras, B_TRUE);
3257 }
3258 
3259 /*
3260  * Restart ESP after the SA has been added.
3261  */
3262 static void
3263 esp_inbound_restart(mblk_t *mp, ip_recv_attr_t *ira)
3264 {
3265         esph_t          *esph;
3266         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
3267         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3268 
3269         esp2dbg(espstack, ("in ESP inbound_task"));
3270         ASSERT(espstack != NULL);
3271 
3272         mp = ipsec_inbound_esp_sa(mp, ira, &esph);
3273         if (mp == NULL)
3274                 return;
3275 
3276         ASSERT(esph != NULL);
3277         ASSERT(ira->ira_flags & IRAF_IPSEC_SECURE);
3278         ASSERT(ira->ira_ipsec_esp_sa != NULL);
3279 
3280         mp = ira->ira_ipsec_esp_sa->ipsa_input_func(mp, esph, ira);
3281         if (mp == NULL) {
3282                 /*
3283                  * Either it failed or is pending. In the former case
3284                  * ipIfStatsInDiscards was increased.
3285                  */
3286                 return;
3287         }
3288 
3289         ip_input_post_ipsec(mp, ira);
3290 }
3291 
3292 /*
3293  * Now that weak-key passed, actually ADD the security association, and
3294  * send back a reply ADD message.
3295  */
3296 static int
3297 esp_add_sa_finish(mblk_t *mp, sadb_msg_t *samsg, keysock_in_t *ksi,
3298     int *diagnostic, ipsecesp_stack_t *espstack)
3299 {
3300         isaf_t *primary = NULL, *secondary;
3301         boolean_t clone = B_FALSE, is_inbound = B_FALSE;
3302         ipsa_t *larval = NULL;
3303         ipsacq_t *acqrec;
3304         iacqf_t *acq_bucket;
3305         mblk_t *acq_msgs = NULL;
3306         int rc;
3307         mblk_t *lpkt;
3308         int error;
3309         ipsa_query_t sq;
3310         ipsec_stack_t   *ipss = espstack->ipsecesp_netstack->netstack_ipsec;
3311 
3312         /*
3313          * Locate the appropriate table(s).
3314          */
3315         sq.spp = &espstack->esp_sadb;    /* XXX */
3316         error = sadb_form_query(ksi, IPSA_Q_SA|IPSA_Q_DST,
3317             IPSA_Q_SA|IPSA_Q_DST|IPSA_Q_INBOUND|IPSA_Q_OUTBOUND,
3318             &sq, diagnostic);
3319         if (error)
3320                 return (error);
3321 
3322         /*
3323          * Use the direction flags provided by the KMD to determine
3324          * if the inbound or outbound table should be the primary
3325          * for this SA. If these flags were absent then make this
3326          * decision based on the addresses.
3327          */
3328         if (sq.assoc->sadb_sa_flags & IPSA_F_INBOUND) {
3329                 primary = sq.inbound;
3330                 secondary = sq.outbound;
3331                 is_inbound = B_TRUE;
3332                 if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND)
3333                         clone = B_TRUE;
3334         } else if (sq.assoc->sadb_sa_flags & IPSA_F_OUTBOUND) {
3335                 primary = sq.outbound;
3336                 secondary = sq.inbound;
3337         }
3338 
3339         if (primary == NULL) {
3340                 /*
3341                  * The KMD did not set a direction flag; determine which
3342                  * table to insert the SA into based on the addresses.
3343                  */
3344                 switch (ksi->ks_in_dsttype) {
3345                 case KS_IN_ADDR_MBCAST:
3346                         clone = B_TRUE; /* All mcast SAs can be bidirectional */
3347                         sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3348                         /* FALLTHRU */
3349                 /*
3350                  * If the source address is either one of mine, or unspecified
3351                  * (which is best summed up by saying "not 'not mine'"),
3352                  * then the association is potentially bi-directional,
3353                  * in that it can be used for inbound traffic and outbound
3354                  * traffic.  The best example of such an SA is a multicast
3355                  * SA (which allows me to receive the outbound traffic).
3356                  */
3357                 case KS_IN_ADDR_ME:
3358                         sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3359                         primary = sq.inbound;
3360                         secondary = sq.outbound;
3361                         if (ksi->ks_in_srctype != KS_IN_ADDR_NOTME)
3362                                 clone = B_TRUE;
3363                         is_inbound = B_TRUE;
3364                         break;
3365                 /*
3366                  * If the source address is literally not mine (either
3367                  * unspecified or not mine), then this SA may have an
3368                  * address that WILL be mine after some configuration.
3369                  * We pay the price for this by making it a bi-directional
3370                  * SA.
3371                  */
3372                 case KS_IN_ADDR_NOTME:
3373                         sq.assoc->sadb_sa_flags |= IPSA_F_OUTBOUND;
3374                         primary = sq.outbound;
3375                         secondary = sq.inbound;
3376                         if (ksi->ks_in_srctype != KS_IN_ADDR_ME) {
3377                                 sq.assoc->sadb_sa_flags |= IPSA_F_INBOUND;
3378                                 clone = B_TRUE;
3379                         }
3380                         break;
3381                 default:
3382                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_DST;
3383                         return (EINVAL);
3384                 }
3385         }
3386 
3387         /*
3388          * Find an ACQUIRE list entry if possible.  If we've added an SA that
3389          * suits the needs of an ACQUIRE list entry, we can eliminate the
3390          * ACQUIRE list entry and transmit the enqueued packets.  Use the
3391          * high-bit of the sequence number to queue it.  Key off destination
3392          * addr, and change acqrec's state.
3393          */
3394 
3395         if (samsg->sadb_msg_seq & IACQF_LOWEST_SEQ) {
3396                 acq_bucket = &(sq.sp->sdb_acq[sq.outhash]);
3397                 mutex_enter(&acq_bucket->iacqf_lock);
3398                 for (acqrec = acq_bucket->iacqf_ipsacq; acqrec != NULL;
3399                     acqrec = acqrec->ipsacq_next) {
3400                         mutex_enter(&acqrec->ipsacq_lock);
3401                         /*
3402                          * Q:  I only check sequence.  Should I check dst?
3403                          * A: Yes, check dest because those are the packets
3404                          *    that are queued up.
3405                          */
3406                         if (acqrec->ipsacq_seq == samsg->sadb_msg_seq &&
3407                             IPSA_ARE_ADDR_EQUAL(sq.dstaddr,
3408                             acqrec->ipsacq_dstaddr, acqrec->ipsacq_addrfam))
3409                                 break;
3410                         mutex_exit(&acqrec->ipsacq_lock);
3411                 }
3412                 if (acqrec != NULL) {
3413                         /*
3414                          * AHA!  I found an ACQUIRE record for this SA.
3415                          * Grab the msg list, and free the acquire record.
3416                          * I already am holding the lock for this record,
3417                          * so all I have to do is free it.
3418                          */
3419                         acq_msgs = acqrec->ipsacq_mp;
3420                         acqrec->ipsacq_mp = NULL;
3421                         mutex_exit(&acqrec->ipsacq_lock);
3422                         sadb_destroy_acquire(acqrec,
3423                             espstack->ipsecesp_netstack);
3424                 }
3425                 mutex_exit(&acq_bucket->iacqf_lock);
3426         }
3427 
3428         /*
3429          * Check the PF_KEY message type to see if this is an update.  If so,
3430          * find the entry in the larval list (if it is there).
3431          */
3432         if (samsg->sadb_msg_type == SADB_UPDATE) {
3433                 mutex_enter(&sq.inbound->isaf_lock);
3434                 larval = ipsec_getassocbyspi(sq.inbound, sq.assoc->sadb_sa_spi,
3435                     ALL_ZEROES_PTR, sq.dstaddr, sq.dst->sin_family);
3436                 mutex_exit(&sq.inbound->isaf_lock);
3437 
3438                 if ((larval == NULL) ||
3439                     (larval->ipsa_state != IPSA_STATE_LARVAL)) {
3440                         *diagnostic = SADB_X_DIAGNOSTIC_SA_NOTFOUND;
3441                         if (larval != NULL) {
3442                                 IPSA_REFRELE(larval);
3443                         }
3444                         esp0dbg(("Larval update, but larval disappeared.\n"));
3445                         return (ESRCH);
3446                 } /* Else sadb_common_add unlinks it for me! */
3447         }
3448 
3449         if (larval != NULL) {
3450                 /*
3451                  * Hold again, because sadb_common_add() consumes a reference,
3452                  * and we don't want to clear_lpkt() without a reference.
3453                  */
3454                 IPSA_REFHOLD(larval);
3455         }
3456 
3457         rc = sadb_common_add(espstack->esp_pfkey_q,
3458             mp, samsg, ksi, primary, secondary, larval, clone, is_inbound,
3459             diagnostic, espstack->ipsecesp_netstack, &espstack->esp_sadb);
3460 
3461         if (larval != NULL) {
3462                 if (rc == 0) {
3463                         lpkt = sadb_clear_lpkt(larval);
3464                         if (lpkt != NULL) {
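                                     /*
                                      * taskq_dispatch() returns 0 on failure,
                                      * so rc becomes nonzero if the buffered
                                      * larval packet could not be
                                      * re-dispatched; the ACQUIRE packets
                                      * queued below are then dropped.
                                      */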
3465                                 rc = !taskq_dispatch(esp_taskq, inbound_task,
3466                                     lpkt, TQ_NOSLEEP);
3467                         }
3468                 }
3469                 IPSA_REFRELE(larval);
3470         }
3471 
3472         /*
3473          * How much more stack will I create with all of these
3474          * esp_outbound() calls?
3475          */
3476 
3477         /* Handle the packets queued waiting for the SA */
3478         while (acq_msgs != NULL) {
3479                 mblk_t          *asyncmp;
3480                 mblk_t          *data_mp;
3481                 ip_xmit_attr_t  ixas;
3482                 ill_t           *ill;
3483 
3484                 asyncmp = acq_msgs;
3485                 acq_msgs = acq_msgs->b_next;
3486                 asyncmp->b_next = NULL;
3487 
3488                 /*
3489                  * Extract the ip_xmit_attr_t from the first mblk.
3490                  * Verify that the netstack and ill are still around; they could
3491                  * have vanished while in.iked was doing its work.
3492                  * On successful return we have a nce_t and the ill/ipst can't
3493                  * disappear until we do the nce_refrele in ixa_cleanup().
3494                  */
3495                 data_mp = asyncmp->b_cont;
3496                 asyncmp->b_cont = NULL;
3497                 if (!ip_xmit_attr_from_mblk(asyncmp, &ixas)) {
3498                         ESP_BUMP_STAT(espstack, out_discards);
3499                         ip_drop_packet(data_mp, B_FALSE, NULL,
3500                             DROPPER(ipss, ipds_sadb_acquire_timeout),
3501                             &espstack->esp_dropper);
3502                 } else if (rc != 0) {
3503                         ill = ixas.ixa_nce->nce_ill;
3504                         ESP_BUMP_STAT(espstack, out_discards);
3505                         ip_drop_packet(data_mp, B_FALSE, ill,
3506                             DROPPER(ipss, ipds_sadb_acquire_timeout),
3507                             &espstack->esp_dropper);
3508                         BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3509                 } else {
3510                         esp_outbound_finish(data_mp, &ixas);
3511                 }
3512                 ixa_cleanup(&ixas);
3513         }
3514 
3515         return (rc);
3516 }
3517 
3518 /*
3519  * Process one of the queued messages (from ipsacq_mp) once the SA
3520  * has been added.
3521  */
3522 static void
3523 esp_outbound_finish(mblk_t *data_mp, ip_xmit_attr_t *ixa)
3524 {
3525         netstack_t      *ns = ixa->ixa_ipst->ips_netstack;
3526         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3527         ipsec_stack_t   *ipss = ns->netstack_ipsec;
3528         ill_t           *ill = ixa->ixa_nce->nce_ill;
3529 
3530         if (!ipsec_outbound_sa(data_mp, ixa, IPPROTO_ESP)) {
3531                 ESP_BUMP_STAT(espstack, out_discards);
3532                 ip_drop_packet(data_mp, B_FALSE, ill,
3533                     DROPPER(ipss, ipds_sadb_acquire_timeout),
3534                     &espstack->esp_dropper);
3535                 BUMP_MIB(ill->ill_ip_mib, ipIfStatsOutDiscards);
3536                 return;
3537         }
3538 
3539         data_mp = esp_outbound(data_mp, ixa);
3540         if (data_mp == NULL)
3541                 return;
3542 
3543         /* do AH processing if needed */
3544         data_mp = esp_do_outbound_ah(data_mp, ixa);
3545         if (data_mp == NULL)
3546                 return;
3547 
3548         (void) ip_output_post_ipsec(data_mp, ixa);
3549 }
3550 
3551 /*
3552  * Add new ESP security association.  This may become a generic AH/ESP
3553  * routine eventually.
3554  */
3555 static int
3556 esp_add_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic, netstack_t *ns)
3557 {
3558         sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3559         sadb_address_t *srcext =
3560             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3561         sadb_address_t *dstext =
3562             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3563         sadb_address_t *isrcext =
3564             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_SRC];
3565         sadb_address_t *idstext =
3566             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_INNER_DST];
3567         sadb_address_t *nttext_loc =
3568             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_LOC];
3569         sadb_address_t *nttext_rem =
3570             (sadb_address_t *)ksi->ks_in_extv[SADB_X_EXT_ADDRESS_NATT_REM];
3571         sadb_key_t *akey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_AUTH];
3572         sadb_key_t *ekey = (sadb_key_t *)ksi->ks_in_extv[SADB_EXT_KEY_ENCRYPT];
3573         struct sockaddr_in *src, *dst;
3574         struct sockaddr_in *natt_loc, *natt_rem;
3575         struct sockaddr_in6 *natt_loc6, *natt_rem6;
3576         sadb_lifetime_t *soft =
3577             (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_SOFT];
3578         sadb_lifetime_t *hard =
3579             (sadb_lifetime_t *)ksi->ks_in_extv[SADB_EXT_LIFETIME_HARD];
3580         sadb_lifetime_t *idle =
3581             (sadb_lifetime_t *)ksi->ks_in_extv[SADB_X_EXT_LIFETIME_IDLE];
3582         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
3583         ipsec_stack_t   *ipss = ns->netstack_ipsec;
3584 
3587         /* I need certain extensions present for an ADD message. */
3588         if (srcext == NULL) {
3589                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_SRC;
3590                 return (EINVAL);
3591         }
3592         if (dstext == NULL) {
3593                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3594                 return (EINVAL);
3595         }
3596         if (isrcext == NULL && idstext != NULL) {
3597                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_SRC;
3598                 return (EINVAL);
3599         }
3600         if (isrcext != NULL && idstext == NULL) {
3601                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_INNER_DST;
3602                 return (EINVAL);
3603         }
3604         if (assoc == NULL) {
3605                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3606                 return (EINVAL);
3607         }
3608         if (ekey == NULL && assoc->sadb_sa_encrypt != SADB_EALG_NULL) {
3609                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_EKEY;
3610                 return (EINVAL);
3611         }
3612 
3613         src = (struct sockaddr_in *)(srcext + 1);
3614         dst = (struct sockaddr_in *)(dstext + 1);
3615         natt_loc = (struct sockaddr_in *)(nttext_loc + 1);
3616         natt_loc6 = (struct sockaddr_in6 *)(nttext_loc + 1);
3617         natt_rem = (struct sockaddr_in *)(nttext_rem + 1);
3618         natt_rem6 = (struct sockaddr_in6 *)(nttext_rem + 1);
3619 
3620         /* Sundry ADD-specific reality checks. */
3621         /* XXX STATS :  Logging/stats here? */
3622 
3623         if ((assoc->sadb_sa_state != SADB_SASTATE_MATURE) &&
3624             (assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE_ELSEWHERE)) {
3625                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_SASTATE;
3626                 return (EINVAL);
3627         }
3628         if (assoc->sadb_sa_encrypt == SADB_EALG_NONE) {
3629                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3630                 return (EINVAL);
3631         }
3632 
3633 #ifndef IPSEC_LATENCY_TEST
3634         if (assoc->sadb_sa_encrypt == SADB_EALG_NULL &&
3635             assoc->sadb_sa_auth == SADB_AALG_NONE) {
3636                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3637                 return (EINVAL);
3638         }
3639 #endif
3640 
3641         if (assoc->sadb_sa_flags & ~espstack->esp_sadb.s_addflags) {
3642                 *diagnostic = SADB_X_DIAGNOSTIC_BAD_SAFLAGS;
3643                 return (EINVAL);
3644         }
3645 
3646         if ((*diagnostic = sadb_hardsoftchk(hard, soft, idle)) != 0) {
3647                 return (EINVAL);
3648         }
3649         ASSERT(src->sin_family == dst->sin_family);
3650 
3651         if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_LOC) {
3652                 if (nttext_loc == NULL) {
3653                         *diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_LOC;
3654                         return (EINVAL);
3655                 }
3656 
3657                 if (natt_loc->sin_family == AF_INET6 &&
3658                     !IN6_IS_ADDR_V4MAPPED(&natt_loc6->sin6_addr)) {
3659                         *diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_LOC;
3660                         return (EINVAL);
3661                 }
3662         }
3663 
3664         if (assoc->sadb_sa_flags & SADB_X_SAFLAGS_NATT_REM) {
3665                 if (nttext_rem == NULL) {
3666                         *diagnostic = SADB_X_DIAGNOSTIC_MISSING_NATT_REM;
3667                         return (EINVAL);
3668                 }
3669                 if (natt_rem->sin_family == AF_INET6 &&
3670                     !IN6_IS_ADDR_V4MAPPED(&natt_rem6->sin6_addr)) {
3671                         *diagnostic = SADB_X_DIAGNOSTIC_MALFORMED_NATT_REM;
3672                         return (EINVAL);
3673                 }
3674         }
3675 
3677         /* Stuff I don't support, for now.  XXX Diagnostic? */
3678         if (ksi->ks_in_extv[SADB_EXT_LIFETIME_CURRENT] != NULL)
3679                 return (EOPNOTSUPP);
3680 
3681         if ((*diagnostic = sadb_labelchk(ksi)) != 0)
3682                 return (EINVAL);
3683 
3684         /*
3685          * XXX Policy :  I'm not checking identities at this time,
3686          * but if I did, I'd do them here, before I sent
3687          * the weak key check up to the algorithm.
3688          */
3689 
3690         mutex_enter(&ipss->ipsec_alg_lock);
3691 
3692         /*
3693          * First locate the authentication algorithm.
3694          */
3695 #ifdef IPSEC_LATENCY_TEST
3696         if (akey != NULL && assoc->sadb_sa_auth != SADB_AALG_NONE) {
3697 #else
3698         if (akey != NULL) {
3699 #endif
3700                 ipsec_alginfo_t *aalg;
3701 
3702                 aalg = ipss->ipsec_alglists[IPSEC_ALG_AUTH]
3703                     [assoc->sadb_sa_auth];
3704                 if (aalg == NULL || !ALG_VALID(aalg)) {
3705                         mutex_exit(&ipss->ipsec_alg_lock);
3706                         esp1dbg(espstack, ("Couldn't find auth alg #%d.\n",
3707                             assoc->sadb_sa_auth));
3708                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_AALG;
3709                         return (EINVAL);
3710                 }
3711 
3712                 /*
3713                  * Sanity check key sizes.
3714                  * Note: It's not possible to use SADB_AALG_NONE because
3715                  * this auth_alg is not defined with ALG_FLAG_VALID. If this
3716                  * ever changes, the same check for SADB_AALG_NONE and
3717                  * an auth_key != NULL should be made here (see below).
3718                  */
3719                 if (!ipsec_valid_key_size(akey->sadb_key_bits, aalg)) {
3720                         mutex_exit(&ipss->ipsec_alg_lock);
3721                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_AKEYBITS;
3722                         return (EINVAL);
3723                 }
3724                 ASSERT(aalg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3725 
3726                 /* check key and fix parity if needed */
3727                 if (ipsec_check_key(aalg->alg_mech_type, akey, B_TRUE,
3728                     diagnostic) != 0) {
3729                         mutex_exit(&ipss->ipsec_alg_lock);
3730                         return (EINVAL);
3731                 }
3732         }
3733 
3734         /*
3735          * Then locate the encryption algorithm.
3736          */
3737         if (ekey != NULL) {
3738                 uint_t keybits;
3739                 ipsec_alginfo_t *ealg;
3740 
3741                 ealg = ipss->ipsec_alglists[IPSEC_ALG_ENCR]
3742                     [assoc->sadb_sa_encrypt];
3743                 if (ealg == NULL || !ALG_VALID(ealg)) {
3744                         mutex_exit(&ipss->ipsec_alg_lock);
3745                         esp1dbg(espstack, ("Couldn't find encr alg #%d.\n",
3746                             assoc->sadb_sa_encrypt));
3747                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_EALG;
3748                         return (EINVAL);
3749                 }
3750 
3751                 /*
3752                  * Sanity check key sizes. If the encryption algorithm is
3753                  * SADB_EALG_NULL but the encryption key is NOT
3754                  * NULL then complain.
3755                  *
3756                  * The keying material includes salt bits, if required by the
3757                  * algorithm, and optionally the initial IV; check the
3758                  * length of what's left.
3759                  */
3760                 keybits = ekey->sadb_key_bits;
3761                 keybits -= ekey->sadb_key_reserved;
3762                 keybits -= SADB_8TO1(ealg->alg_saltlen);
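                     /*
                      * Illustrative example (assuming an AES-GCM SA as defined
                      * by RFC 4106): the key extension would typically carry
                      * 128 key bits plus 32 salt bits, so keybits works out to
                      * 128 once the salt is subtracted.
                      */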
3763                 if ((assoc->sadb_sa_encrypt == SADB_EALG_NULL) ||
3764                     (!ipsec_valid_key_size(keybits, ealg))) {
3765                         mutex_exit(&ipss->ipsec_alg_lock);
3766                         *diagnostic = SADB_X_DIAGNOSTIC_BAD_EKEYBITS;
3767                         return (EINVAL);
3768                 }
3769                 ASSERT(ealg->alg_mech_type != CRYPTO_MECHANISM_INVALID);
3770 
3771                 /* check key */
3772                 if (ipsec_check_key(ealg->alg_mech_type, ekey, B_FALSE,
3773                     diagnostic) != 0) {
3774                         mutex_exit(&ipss->ipsec_alg_lock);
3775                         return (EINVAL);
3776                 }
3777         }
3778         mutex_exit(&ipss->ipsec_alg_lock);
3779 
3780         return (esp_add_sa_finish(mp, (sadb_msg_t *)mp->b_cont->b_rptr, ksi,
3781             diagnostic, espstack));
3782 }
3783 
3784 /*
3785  * Update a security association.  Updates come in two varieties.  The first
3786  * is an update of lifetimes on a non-larval SA.  The second is an update of
3787  * a larval SA, which ends up looking a lot more like an add.
3788  */
3789 static int
3790 esp_update_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3791     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3792 {
3793         sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3794         mblk_t    *buf_pkt;
3795         int rcode;
3796 
3797         sadb_address_t *dstext =
3798             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3799 
3800         if (dstext == NULL) {
3801                 *diagnostic = SADB_X_DIAGNOSTIC_MISSING_DST;
3802                 return (EINVAL);
3803         }
3804 
3805         rcode = sadb_update_sa(mp, ksi, &buf_pkt, &espstack->esp_sadb,
3806             diagnostic, espstack->esp_pfkey_q, esp_add_sa,
3807             espstack->ipsecesp_netstack, sadb_msg_type);
3808 
3809         if ((assoc->sadb_sa_state != SADB_X_SASTATE_ACTIVE) ||
3810             (rcode != 0)) {
3811                 return (rcode);
3812         }
3813 
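             /*
              * We only get here if the update succeeded and the SA is now
              * ACTIVE; HANDLE_BUF_PKT re-dispatches any packet that was
              * buffered against this SA (dropping it if the dispatch fails).
              */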
3814         HANDLE_BUF_PKT(esp_taskq, espstack->ipsecesp_netstack->netstack_ipsec,
3815             espstack->esp_dropper, buf_pkt);
3816 
3817         return (rcode);
3818 }
3819 
3820 /* XXX refactor me */
3821 /*
3822  * Delete a security association.  This is REALLY likely to be code common to
3823  * both AH and ESP.  Find the association, then unlink it.
3824  */
3825 static int
3826 esp_del_sa(mblk_t *mp, keysock_in_t *ksi, int *diagnostic,
3827     ipsecesp_stack_t *espstack, uint8_t sadb_msg_type)
3828 {
3829         sadb_sa_t *assoc = (sadb_sa_t *)ksi->ks_in_extv[SADB_EXT_SA];
3830         sadb_address_t *dstext =
3831             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_DST];
3832         sadb_address_t *srcext =
3833             (sadb_address_t *)ksi->ks_in_extv[SADB_EXT_ADDRESS_SRC];
3834         struct sockaddr_in *sin;
3835 
3836         if (assoc == NULL) {
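                     /*
                      * No SA extension was supplied, so treat this as a
                      * request to purge every SA keyed by the given source
                      * or destination address.
                      */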
3837                 if (dstext != NULL) {
3838                         sin = (struct sockaddr_in *)(dstext + 1);
3839                 } else if (srcext != NULL) {
3840                         sin = (struct sockaddr_in *)(srcext + 1);
3841                 } else {
3842                         *diagnostic = SADB_X_DIAGNOSTIC_MISSING_SA;
3843                         return (EINVAL);
3844                 }
3845                 return (sadb_purge_sa(mp, ksi,
3846                     (sin->sin_family == AF_INET6) ? &espstack->esp_sadb.s_v6 :
3847                     &espstack->esp_sadb.s_v4, diagnostic,
3848                     espstack->esp_pfkey_q));
3849         }
3850 
3851         return (sadb_delget_sa(mp, ksi, &espstack->esp_sadb, diagnostic,
3852             espstack->esp_pfkey_q, sadb_msg_type));
3853 }
3854 
3855 /* XXX refactor me */
3856 /*
3857  * Convert the entire contents of all of ESP's SA tables into PF_KEY SADB_DUMP
3858  * messages.
3859  */
3860 static void
3861 esp_dump(mblk_t *mp, keysock_in_t *ksi, ipsecesp_stack_t *espstack)
3862 {
3863         int error;
3864         sadb_msg_t *samsg;
3865 
3866         /*
3867          * Dump each fanout, bailing if error is non-zero.
3868          */
3869 
3870         error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3871             &espstack->esp_sadb.s_v4);
3872         if (error != 0)
3873                 goto bail;
3874 
3875         error = sadb_dump(espstack->esp_pfkey_q, mp, ksi,
3876             &espstack->esp_sadb.s_v6);
3877 bail:
3878         ASSERT(mp->b_cont != NULL);
3879         samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
3880         samsg->sadb_msg_errno = (uint8_t)error;
3881         sadb_pfkey_echo(espstack->esp_pfkey_q, mp,
3882             (sadb_msg_t *)mp->b_cont->b_rptr, ksi, NULL);
3883 }
3884 
3885 /*
3886  * First-cut reality check for an inbound PF_KEY message.
3887  */
3888 static boolean_t
3889 esp_pfkey_reality_failures(mblk_t *mp, keysock_in_t *ksi,
3890     ipsecesp_stack_t *espstack)
3891 {
3892         int diagnostic;
3893 
3894         if (ksi->ks_in_extv[SADB_EXT_PROPOSAL] != NULL) {
3895                 diagnostic = SADB_X_DIAGNOSTIC_PROP_PRESENT;
3896                 goto badmsg;
3897         }
3898         if (ksi->ks_in_extv[SADB_EXT_SUPPORTED_AUTH] != NULL ||
3899             ksi->ks_in_extv[SADB_EXT_SUPPORTED_ENCRYPT] != NULL) {
3900                 diagnostic = SADB_X_DIAGNOSTIC_SUPP_PRESENT;
3901                 goto badmsg;
3902         }
3903         return (B_FALSE);       /* False ==> no failures */
3904 
3905 badmsg:
3906         sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL, diagnostic,
3907             ksi->ks_in_serial);
3908         return (B_TRUE);        /* True ==> failures */
3909 }
3910 
3911 /*
3912  * ESP parsing of PF_KEY messages.  Keysock did most of the really silly
3913  * error cases.  What I receive is a fully-formed, syntactically legal
3914  * PF_KEY message.  I then need to check semantics...
3915  *
3916  * This code may become common to AH and ESP.  Stay tuned.
3917  *
3918  * I also make the assumption that db_ref's are cool.  If this assumption
3919  * is wrong, this means that someone other than keysock or me has been
3920  * mucking with PF_KEY messages.
3921  */
3922 static void
3923 esp_parse_pfkey(mblk_t *mp, ipsecesp_stack_t *espstack)
3924 {
3925         mblk_t *msg = mp->b_cont;
3926         sadb_msg_t *samsg;
3927         keysock_in_t *ksi;
3928         int error;
3929         int diagnostic = SADB_X_DIAGNOSTIC_NONE;
3930 
3931         ASSERT(msg != NULL);
3932 
3933         samsg = (sadb_msg_t *)msg->b_rptr;
3934         ksi = (keysock_in_t *)mp->b_rptr;
3935 
3936         /*
3937          * If applicable, convert unspecified AF_INET6 to unspecified
3938          * AF_INET.  And do other address reality checks.
3939          */
3940         if (!sadb_addrfix(ksi, espstack->esp_pfkey_q, mp,
3941             espstack->ipsecesp_netstack) ||
3942             esp_pfkey_reality_failures(mp, ksi, espstack)) {
3943                 return;
3944         }
3945 
3946         switch (samsg->sadb_msg_type) {
3947         case SADB_ADD:
3948                 error = esp_add_sa(mp, ksi, &diagnostic,
3949                     espstack->ipsecesp_netstack);
3950                 if (error != 0) {
3951                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3952                             diagnostic, ksi->ks_in_serial);
3953                 }
3954                 /* else esp_add_sa() took care of things. */
3955                 break;
3956         case SADB_DELETE:
3957         case SADB_X_DELPAIR:
3958         case SADB_X_DELPAIR_STATE:
3959                 error = esp_del_sa(mp, ksi, &diagnostic, espstack,
3960                     samsg->sadb_msg_type);
3961                 if (error != 0) {
3962                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3963                             diagnostic, ksi->ks_in_serial);
3964                 }
3965                 /* Else esp_del_sa() took care of things. */
3966                 break;
3967         case SADB_GET:
3968                 error = sadb_delget_sa(mp, ksi, &espstack->esp_sadb,
3969                     &diagnostic, espstack->esp_pfkey_q, samsg->sadb_msg_type);
3970                 if (error != 0) {
3971                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
3972                             diagnostic, ksi->ks_in_serial);
3973                 }
3974                 /* Else sadb_delget_sa() took care of things. */
3975                 break;
3976         case SADB_FLUSH:
3977                 sadbp_flush(&espstack->esp_sadb, espstack->ipsecesp_netstack);
3978                 sadb_pfkey_echo(espstack->esp_pfkey_q, mp, samsg, ksi, NULL);
3979                 break;
3980         case SADB_REGISTER:
3981                 /*
3982                  * Check for extensions (there should be none), extract
3983                  * the fields, call esp_register_out(), then either free
3984                  * the message or report an error.
3985                  *
3986                  * Keysock takes care of the PF_KEY bookkeeping for this.
3987                  */
3988                 if (esp_register_out(samsg->sadb_msg_seq, samsg->sadb_msg_pid,
3989                     ksi->ks_in_serial, espstack, msg_getcred(mp, NULL))) {
3990                         freemsg(mp);
3991                 } else {
3992                         /*
3993                          * The only way this path is reached is a memory
3994                          * allocation failure; esp_register_out() does not
3995                          * return B_FALSE for lack of esp_pfkey_q in wput().
3996                          */
3997                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, ENOMEM,
3998                             diagnostic, ksi->ks_in_serial);
3999                 }
4000                 break;
4001         case SADB_UPDATE:
4002         case SADB_X_UPDATEPAIR:
4003                 /*
4004                  * Find a larval SA; if there isn't one, find a full SA
4005                  * and apply the stricter update rules.
4006                  */
4007                 error = esp_update_sa(mp, ksi, &diagnostic, espstack,
4008                     samsg->sadb_msg_type);
4009                 if (error != 0) {
4010                         sadb_pfkey_error(espstack->esp_pfkey_q, mp, error,
4011                             diagnostic, ksi->ks_in_serial);
4012                 }
4013                 /* else esp_update_sa() took care of things. */
4014                 break;
4015         case SADB_GETSPI:
4016                 /*
4017                  * Reserve a new larval entry.
4018                  */
4019                 esp_getspi(mp, ksi, espstack);
4020                 break;
4021         case SADB_ACQUIRE:
4022                 /*
4023                  * An inbound ACQUIRE is most likely an error report; find the
4024                  * larval SA and/or ACQUIRE record and kill it (them).  Inbound
4025                  * ACQUIRE messages should only have the base header.
4026                  */
4027                 sadb_in_acquire(samsg, &espstack->esp_sadb,
4028                     espstack->esp_pfkey_q, espstack->ipsecesp_netstack);
4029                 freemsg(mp);
4030                 break;
4031         case SADB_DUMP:
4032                 /*
4033                  * Dump all entries.
4034                  */
4035                 esp_dump(mp, ksi, espstack);
4036                 /* esp_dump will take care of the return message, etc. */
4037                 break;
4038         case SADB_EXPIRE:
4039                 /* Should never reach me. */
4040                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, EOPNOTSUPP,
4041                     diagnostic, ksi->ks_in_serial);
4042                 break;
4043         default:
4044                 sadb_pfkey_error(espstack->esp_pfkey_q, mp, EINVAL,
4045                     SADB_X_DIAGNOSTIC_UNKNOWN_MSG, ksi->ks_in_serial);
4046                 break;
4047         }
4048 }
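
     /*
      * For illustration only (not part of this module):  a minimal,
      * hypothetical userland sketch of the kind of message that arrives
      * here as KEYSOCK_IN.  Assuming the RFC 2367 PF_KEY interface (see
      * <net/pfkeyv2.h>), a base-header-only SADB_FLUSH such as the one
      * below would be dispatched to sadbp_flush() above.  sadb_msg_len is
      * in units of 64-bit words; headers and error handling are omitted.
      *
      *     int s = socket(PF_KEY, SOCK_RAW, PF_KEY_V2);
      *     struct sadb_msg msg;
      *
      *     bzero(&msg, sizeof (msg));
      *     msg.sadb_msg_version = PF_KEY_V2;
      *     msg.sadb_msg_type = SADB_FLUSH;
      *     msg.sadb_msg_satype = SADB_SATYPE_ESP;
      *     msg.sadb_msg_len = sizeof (msg) / 8;
      *     msg.sadb_msg_seq = 1;
      *     msg.sadb_msg_pid = getpid();
      *     (void) write(s, &msg, sizeof (msg));
      */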
4049 
4050 /*
4051  * Handle the case where PF_KEY says it can't find a keysock for one of
4052  * our ACQUIRE messages.
4053  */
4054 static void
4055 esp_keysock_no_socket(mblk_t *mp, ipsecesp_stack_t *espstack)
4056 {
4057         sadb_msg_t *samsg;
4058         keysock_out_err_t *kse = (keysock_out_err_t *)mp->b_rptr;
4059 
4060         if (mp->b_cont == NULL) {
4061                 freemsg(mp);
4062                 return;
4063         }
4064         samsg = (sadb_msg_t *)mp->b_cont->b_rptr;
4065 
4066         /*
4067          * If keysock can't find any registered listeners, delete the
4068          * ACQUIRE record immediately and handle errors.
4069          */
4070         if (samsg->sadb_msg_type == SADB_ACQUIRE) {
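                     /*
                      * Record the error and trim the length down to the base
                      * header before handing it to sadb_in_acquire().
                      */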
4071                 samsg->sadb_msg_errno = kse->ks_err_errno;
4072                 samsg->sadb_msg_len = SADB_8TO64(sizeof (*samsg));
4073                 /*
4074                  * Use the write-side of the esp_pfkey_q
4075                  */
4076                 sadb_in_acquire(samsg, &espstack->esp_sadb,
4077                     WR(espstack->esp_pfkey_q), espstack->ipsecesp_netstack);
4078         }
4079 
4080         freemsg(mp);
4081 }
4082 
4083 /*
4084  * ESP module write put routine.
4085  */
4086 static void
4087 ipsecesp_wput(queue_t *q, mblk_t *mp)
4088 {
4089         ipsec_info_t *ii;
4090         struct iocblk *iocp;
4091         ipsecesp_stack_t        *espstack = (ipsecesp_stack_t *)q->q_ptr;
4092 
4093         esp3dbg(espstack, ("In esp_wput().\n"));
4094 
4095         /* NOTE: Each case must take care of freeing or passing mp. */
4096         switch (mp->b_datap->db_type) {
4097         case M_CTL:
4098                 if ((mp->b_wptr - mp->b_rptr) < sizeof (ipsec_info_t)) {
4099                         /* Message too small. */
4100                         freemsg(mp);
4101                         break;
4102                 }
4103                 ii = (ipsec_info_t *)mp->b_rptr;
4104 
4105                 switch (ii->ipsec_info_type) {
4106                 case KEYSOCK_OUT_ERR:
4107                         esp1dbg(espstack, ("Got KEYSOCK_OUT_ERR message.\n"));
4108                         esp_keysock_no_socket(mp, espstack);
4109                         break;
4110                 case KEYSOCK_IN:
4111                         ESP_BUMP_STAT(espstack, keysock_in);
4112                         esp3dbg(espstack, ("Got KEYSOCK_IN message.\n"));
4113 
4114                         /* Parse the message. */
4115                         esp_parse_pfkey(mp, espstack);
4116                         break;
4117                 case KEYSOCK_HELLO:
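                             /*
                              * Keysock's initial hello:  record the PF_KEY
                              * queue and arm the esp_ager() timer.
                              */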
4118                         sadb_keysock_hello(&espstack->esp_pfkey_q, q, mp,
4119                             esp_ager, (void *)espstack, &espstack->esp_event,
4120                             SADB_SATYPE_ESP);
4121                         break;
4122                 default:
4123                         esp2dbg(espstack, ("Got M_CTL from above of 0x%x.\n",
4124                             ii->ipsec_info_type));
4125                         freemsg(mp);
4126                         break;
4127                 }
4128                 break;
4129         case M_IOCTL:
4130                 iocp = (struct iocblk *)mp->b_rptr;
4131                 switch (iocp->ioc_cmd) {
4132                 case ND_SET:
4133                 case ND_GET:
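                             /*
                              * These service the ndd(1M)-style tunables held
                              * in ipsecesp_g_nd (e.g., something along the
                              * lines of "ndd -get /dev/ipsecesp <tunable>").
                              */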
4134                         if (nd_getset(q, espstack->ipsecesp_g_nd, mp)) {
4135                                 qreply(q, mp);
4136                                 return;
4137                         } else {
4138                                 iocp->ioc_error = ENOENT;
4139                         }
4140                         /* FALLTHRU */
4141                 default:
4142                         /* We don't support any other ioctls. */
4143 
4144                         /* Return EINVAL unless ENOENT was set above. */
4145                         if (iocp->ioc_error != ENOENT)
4146                                 iocp->ioc_error = EINVAL;
4147                         iocp->ioc_count = 0;
4148                         mp->b_datap->db_type = M_IOCACK;
4149                         qreply(q, mp);
4150                         return;
4151                 }
4152         default:
4153                 esp3dbg(espstack,
4154                     ("Got default message, type %d, passing to IP.\n",
4155                     mp->b_datap->db_type));
4156                 putnext(q, mp);
4157         }
4158 }
4159 
4160 /*
4161  * Wrapper to allow IP to trigger an ESP association failure message
4162  * during inbound SA selection.
4163  */
4164 void
4165 ipsecesp_in_assocfailure(mblk_t *mp, char level, ushort_t sl, char *fmt,
4166     uint32_t spi, void *addr, int af, ip_recv_attr_t *ira)
4167 {
4168         netstack_t      *ns = ira->ira_ill->ill_ipst->ips_netstack;
4169         ipsecesp_stack_t *espstack = ns->netstack_ipsecesp;
4170         ipsec_stack_t   *ipss = ns->netstack_ipsec;
4171 
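             /*
              * Log the failure only if ipsecesp_log_unknown_spi says to;
              * the packet is dropped either way.
              */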
4172         if (espstack->ipsecesp_log_unknown_spi) {
4173                 ipsec_assocfailure(info.mi_idnum, 0, level, sl, fmt, spi,
4174                     addr, af, espstack->ipsecesp_netstack);
4175         }
4176 
4177         ip_drop_packet(mp, B_TRUE, ira->ira_ill,
4178             DROPPER(ipss, ipds_esp_no_sa),
4179             &espstack->esp_dropper);
4180 }
4181 
4182 /*
4183  * Initialize the ESP input and output processing functions.
4184  */
4185 void
4186 ipsecesp_init_funcs(ipsa_t *sa)
4187 {
4188         if (sa->ipsa_output_func == NULL)
4189                 sa->ipsa_output_func = esp_outbound;
4190         if (sa->ipsa_input_func == NULL)
4191                 sa->ipsa_input_func = esp_inbound;
4192 }