10472 Limit number of multicast NCEs
Reviewed by: Cody Peter Mello <melloc@writev.io>
Reviewed by: Jason King <jason.king@joyent.com>
Reviewed by: Robert Mustacchi <rm@joyent.com>
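The change caps the number of multicast nce_t entries per ill with two global tunables (ip_max_ill_mcast_nces and ip_ill_mcast_reclaim) and, when the cap is hit, evicts the oldest multicast entries from the tail of ill_nce. Because eviction happens while the caller holds ill_lock, the evicted entries are parked on a caller-supplied "graveyard" list and released only after the lock is dropped (nce_graveyard_free()). Below is a minimal user-level sketch of that collect-under-lock, free-outside-lock pattern; the entry type, cache, and tiny limits are illustrative stand-ins for nce_t/ill_nce rather than the kernel interfaces, and the kernel code releases references instead of freeing memory directly.

/*
 * Sketch of the "graveyard" pattern: when an insertion would exceed the cap,
 * the oldest entries are evicted onto a caller-supplied list while the lock
 * is held, and they are released only after the lock has been dropped.
 */
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>
#include <sys/queue.h>

typedef struct entry {
        int id;
        TAILQ_ENTRY(entry) link;
} entry_t;

TAILQ_HEAD(entry_list, entry);

static struct entry_list cache = TAILQ_HEAD_INITIALIZER(cache);
static pthread_mutex_t cache_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned cache_count;

static unsigned cache_max = 4;          /* cf. ip_max_ill_mcast_nces */
static unsigned cache_reclaim = 2;      /* cf. ip_ill_mcast_reclaim */

/* Insert with cache_lock held; evicted entries are parked on 'graveyard'. */
static int
cache_insert_locked(int id, struct entry_list *graveyard)
{
        entry_t *e, *oldest;
        unsigned reclaimed = 0;

        if (cache_count >= cache_max) {
                if (cache_reclaim == 0)
                        return (-1);    /* cap hit, reclaiming disabled */
                /* Evict from the tail: new entries are head-inserted. */
                while (reclaimed < cache_reclaim &&
                    (oldest = TAILQ_LAST(&cache, entry_list)) != NULL) {
                        TAILQ_REMOVE(&cache, oldest, link);
                        cache_count--;
                        /* Defer the free: just park it on the graveyard. */
                        TAILQ_INSERT_TAIL(graveyard, oldest, link);
                        reclaimed++;
                }
        }

        if ((e = malloc(sizeof (*e))) == NULL)
                return (-1);
        e->id = id;
        TAILQ_INSERT_HEAD(&cache, e, link);
        cache_count++;
        return (0);
}

/* Free everything parked on the graveyard, with no locks held. */
static void
graveyard_free(struct entry_list *graveyard)
{
        entry_t *e;

        while ((e = TAILQ_FIRST(graveyard)) != NULL) {
                TAILQ_REMOVE(graveyard, e, link);
                printf("released evicted entry %d outside the lock\n", e->id);
                free(e);
        }
}

int
main(void)
{
        struct entry_list graveyard = TAILQ_HEAD_INITIALIZER(graveyard);
        int i;

        for (i = 0; i < 8; i++) {
                pthread_mutex_lock(&cache_lock);
                (void) cache_insert_locked(i, &graveyard);
                pthread_mutex_unlock(&cache_lock);
                graveyard_free(&graveyard);     /* safe: lock dropped */
        }
        return (0);
}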

          --- old/usr/src/uts/common/inet/ip/ip_ndp.c
          +++ new/usr/src/uts/common/inet/ip/ip_ndp.c
... 15 lines elided ...
  16   16   * fields enclosed by brackets "[]" replaced with your own identifying
  17   17   * information: Portions Copyright [yyyy] [name of copyright owner]
  18   18   *
  19   19   * CDDL HEADER END
  20   20   */
  21   21  /*
  22   22   * Copyright (c) 1999, 2010, Oracle and/or its affiliates. All rights reserved.
  23   23   */
  24   24  
  25   25  /*
  26      - * Copyright (c) 2018, Joyent, Inc.
       26 + * Copyright (c) 2019, Joyent, Inc.
  27   27   */
  28   28  
  29   29  #include <sys/types.h>
  30   30  #include <sys/stream.h>
  31   31  #include <sys/stropts.h>
  32   32  #include <sys/strsun.h>
  33   33  #include <sys/sysmacros.h>
  34   34  #include <sys/errno.h>
  35   35  #include <sys/dlpi.h>
  36   36  #include <sys/socket.h>
... 83 lines elided ...
 120  120  static  int     nce_set_multicast_v4(ill_t *ill, const in_addr_t *dst,
 121  121      uint16_t ncec_flags, nce_t **newnce);
 122  122  static  boolean_t       ndp_xmit(ill_t *ill, uint32_t operation,
 123  123      uint8_t *hwaddr, uint_t hwaddr_len, const in6_addr_t *sender,
 124  124      const in6_addr_t *target, int flag);
 125  125  static void     ncec_refhold_locked(ncec_t *);
 126  126  static boolean_t ill_defend_rate_limit(ill_t *, ncec_t *);
 127  127  static  void    nce_queue_mp_common(ncec_t *, mblk_t *, boolean_t);
 128  128  static  int     nce_add_common(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 129  129      uint16_t, uint16_t, nce_t **);
 130      -static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *);
 131      -static nce_t *nce_add(ill_t *, ncec_t *);
      130 +static nce_t *nce_add_impl(ill_t *, ncec_t *, nce_t *, mblk_t *, list_t *);
      131 +static nce_t *nce_add(ill_t *, ncec_t *, list_t *);
 132  132  static void nce_inactive(nce_t *);
 133  133  extern nce_t    *nce_lookup(ill_t *, const in6_addr_t *);
 134  134  static nce_t *nce_ill_lookup_then_add(ill_t *, ncec_t *);
 135  135  static int      nce_add_v6(ill_t *, uchar_t *, uint_t, const in6_addr_t *,
 136  136      uint16_t, uint16_t, nce_t **);
 137  137  static int      nce_add_v4(ill_t *, uchar_t *, uint_t, const in_addr_t *,
 138  138      uint16_t, uint16_t, nce_t **);
 139  139  static int  nce_add_v6_postprocess(nce_t *);
 140  140  static int  nce_add_v4_postprocess(nce_t *);
 141  141  static ill_t *nce_resolve_src(ncec_t *, in6_addr_t *);
... 856 lines elided ...
 998  998   * Note that ill can be NULL hence can't derive the ipst from it.
 999  999   */
1000 1000  void
1001 1001  ncec_walk(ill_t *ill, ncec_walk_cb_t cbf, void *arg1, ip_stack_t *ipst)
1002 1002  {
1003 1003          ncec_walk_common(ipst->ips_ndp4, ill, cbf, arg1, B_TRUE);
1004 1004          ncec_walk_common(ipst->ips_ndp6, ill, cbf, arg1, B_TRUE);
1005 1005  }
1006 1006  
1007 1007  /*
      1008 + * Cheesy globals (i.e. shared by all netstacks) for both a limit on per-ill
      1009 + * multicast NCEs and the number to reclaim if we hit the limit.  Used by
     1010 + * nce_set_multicast_v[46]() to limit the linked-list length of ill_nce. Until
     1011 + * we solve the multicast-mappings-shouldn't-be-NCEs problem, use this.
     1012 + */
     1013 +
     1014 +/* Maximum number of multicast NCEs on an ill. */
     1015 +uint_t ip_max_ill_mcast_nces = 16384;
     1016 +/*
     1017 + * Number of NCEs to delete if we hit the maximum above.  0 means *don't* and
      1018 + * return an error.  Non-zero means delete that many, and if the number is >=
     1019 + * the max above, that means delete them all.
     1020 + */
     1021 +uint_t ip_ill_mcast_reclaim = 256;
     1022 +
     1023 +/*
     1024 + * Encapsulate multicast ill capping in a function, for easier DTrace
      1025 + * detection.  Fill the caller-supplied list with refheld NCEs to destroy
      1026 + * via nce_refrele(); it is non-empty only if we successfully reclaimed.
     1027 + *
     1028 + * NOTE:  This function must be called while holding the ill_lock AND
     1029 + * JUST PRIOR to making the insertion into the ill_nce list.
     1030 + *
     1031 + * We can't release the ones we delete ourselves because the ill_lock is held
     1032 + * by the caller. They are, instead, passed back in a list_t for deletion
     1033 + * outside of the ill_lock hold. nce_graveyard_free() actually frees them.
     1034 + *
     1035 + * While this covers nce_t, ncec_t gets done even further down the road.  See
     1036 + * nce_graveyard_free() for why.
     1037 + */
     1038 +static boolean_t
     1039 +nce_too_many_mcast(ill_t *ill, list_t *graveyard)
     1040 +{
     1041 +        uint_t reclaim_count, max_count, reclaimed = 0;
     1042 +        boolean_t too_many;
     1043 +        nce_t *nce, *deadman;
     1044 +
     1045 +        ASSERT(graveyard != NULL);
     1046 +        ASSERT(list_is_empty(graveyard));
     1047 +        ASSERT(MUTEX_HELD(&ill->ill_lock));
     1048 +
     1049 +        /*
      1050 +         * NOTE: Some grinning weirdo may have lowered the global max below
      1051 +         * what this ill currently has.  In that case, each new insertion
      1052 +         * only trims back by the reclaim amount.
     1053 +         */
     1054 +        max_count = ip_max_ill_mcast_nces;
     1055 +        reclaim_count = min(ip_ill_mcast_reclaim, max_count);
     1056 +
     1057 +        /* All good? */
     1058 +        if (ill->ill_mcast_nces < max_count)
     1059 +                return (B_FALSE);       /* Yes, all good. */
     1060 +
     1061 +        if (reclaim_count == 0)
     1062 +                return (B_TRUE);        /* Don't bother - we're stuck. */
     1063 +
     1064 +        /* We need to reclaim now.  Exploit our held ill_lock. */
     1065 +
     1066 +        /*
      1067 +         * Start at the tail and work backwards; new nces are head-inserted,
     1068 +         * so we'll be reaping the oldest entries.
     1069 +         */
     1070 +        nce = list_tail(&ill->ill_nce);
     1071 +        while (reclaimed < reclaim_count) {
     1072 +                /* Skip ahead to a multicast NCE. */
     1073 +                while (nce != NULL &&
     1074 +                    (nce->nce_common->ncec_flags & NCE_F_MCAST) == 0) {
     1075 +                        nce = list_prev(&ill->ill_nce, nce);
     1076 +                }
     1077 +                if (nce == NULL)
     1078 +                        break;
     1079 +
     1080 +                /*
     1081 +                 * NOTE: For now, we just delete the first one(s) we find.
      1082 +                 * This is not optimal; doing better may require inspecting the
      1083 +                 * nce & its ncec.
     1084 +                 */
     1085 +                deadman = nce;
     1086 +                nce = list_prev(&ill->ill_nce, nce);
     1087 +
      1088 +                /* nce_delete() requires that the caller hold a reference. */
     1089 +                nce_refhold(deadman);
     1090 +                nce_delete(deadman);    /* Bumps down ill_mcast_nces. */
     1091 +
     1092 +                /* Link the dead ones singly, still refheld... */
     1093 +                list_insert_tail(graveyard, deadman);
     1094 +                reclaimed++;
     1095 +        }
     1096 +
     1097 +        if (reclaimed != reclaim_count) {
     1098 +                /* We didn't have enough to reach reclaim_count. Why?!? */
     1099 +                DTRACE_PROBE3(ill__mcast__nce__reclaim__mismatch, ill_t *, ill,
     1100 +                    uint_t, reclaimed, uint_t, reclaim_count);
     1101 +
      1102 +                /* Only report failure if, bizarrely, we found none at all. */
     1103 +                too_many = (reclaimed == 0);
     1104 +        } else {
     1105 +                too_many = B_FALSE;
     1106 +        }
     1107 +
     1108 +        return (too_many);
     1109 +}
     1110 +
     1111 +static void
     1112 +ncec_mcast_reap_one(ncec_t *ncec, void *arg)
     1113 +{
     1114 +        boolean_t reapit;
     1115 +        ill_t *ill = (ill_t *)arg;
     1116 +
     1117 +        /* Obvious no-lock-needed checks... */
     1118 +        if (ncec == NULL || ncec->ncec_ill != ill ||
     1119 +            (ncec->ncec_flags & NCE_F_MCAST) == 0)
     1120 +                return;
     1121 +
     1122 +        mutex_enter(&ncec->ncec_lock);
     1123 +        /*
     1124 +         * It's refheld by the walk infrastructure. It has one reference for
     1125 +         * being in the ndp_g_hash, and if an nce_t exists, that's one more.
     1126 +         * We want ones without an nce_t, so 2 is the magic number.  If it's
     1127 +         * LESS than 2, we have much bigger problems anyway.
     1128 +         */
     1129 +        ASSERT(ncec->ncec_refcnt >= 2);
     1130 +        reapit = (ncec->ncec_refcnt == 2);
     1131 +        mutex_exit(&ncec->ncec_lock);
     1132 +
     1133 +        if (reapit) {
     1134 +                IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_deleted);
     1135 +                ncec_delete(ncec);
     1136 +        }
     1137 +}
     1138 +
     1139 +/*
     1140 + * Attempt to reap stray multicast ncec_t structures left in the wake of
      1141 + * nce_graveyard_free(). This runs as a taskq task so that it executes
      1142 + * outside of any netstack-global lock holds (ndp_g_lock in this case).  We
     1143 + * have a reference hold on the ill, which will prevent any unplumbing races.
     1144 + */
     1145 +static void
     1146 +ncec_mcast_reap(void *arg)
     1147 +{
     1148 +        ill_t *ill = (ill_t *)arg;
     1149 +
     1150 +        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_calls);
     1151 +        ncec_walk(ill, ncec_mcast_reap_one, ill, ill->ill_ipst);
     1152 +        mutex_enter(&ill->ill_lock);
     1153 +        ill->ill_mcast_ncec_cleanup = B_FALSE;
     1154 +        /*
     1155 +         * Inline a _notr() version of ill_refrele. See nce_graveyard_free()
     1156 +         * below for why.
     1157 +         */
     1158 +        ill->ill_refcnt--;
     1159 +        if (ill->ill_refcnt == 0)
     1160 +                ipif_ill_refrele_tail(ill);     /* Drops ill_lock. */
     1161 +        else
     1162 +                mutex_exit(&ill->ill_lock);
     1163 +}
     1164 +
     1165 +/*
     1166 + * Free a list (including handling an empty list or NULL list) of
     1167 + * reference-held NCEs that were reaped from a nce_too_many_mcast()
     1168 + * call. Separate because the caller must have dropped ndp_g_lock first.
     1169 + *
     1170 + * This also schedules a taskq task to unlink underlying NCECs from the
     1171 + * ndp_g_hash, which are protected by ndp_g_lock.
     1172 + */
     1173 +static void
     1174 +nce_graveyard_free(list_t *graveyard)
     1175 +{
     1176 +        nce_t *deadman, *current;
     1177 +        ill_t *ill;
     1178 +        boolean_t doit;
     1179 +
     1180 +        if (graveyard == NULL)
     1181 +                return;
     1182 +
     1183 +        current = list_head(graveyard);
     1184 +        if (current == NULL) {
     1185 +                list_destroy(graveyard);
     1186 +                return;
     1187 +        }
     1188 +
     1189 +        ill = current->nce_ill;
     1190 +        /*
      1191 +         * Normally one would ill_refhold(ill) here.  There's no _notr()
      1192 +         * variant like there is for ire_t, dce_t, or even ncec_t, and this
      1193 +         * is the ONLY case that would break the mh_trace that IP debugging
      1194 +         * uses for reference counts (it assumes the same thread releases as
      1195 +         * holds).  Instead, we inline an untraced hold here, and the release
      1196 +         * done by ncec_mcast_reap() above must do the same.
     1197 +         */
     1198 +        mutex_enter(&ill->ill_lock);
     1199 +        ill->ill_refcnt++;
     1200 +        mutex_exit(&ill->ill_lock);
     1201 +
     1202 +        while (current != NULL) {
     1203 +                ASSERT3P(ill, ==, current->nce_ill);
     1204 +                deadman = current;
     1205 +                current = list_next(graveyard, deadman);
     1206 +                list_remove(graveyard, deadman);
     1207 +                ASSERT3U((deadman->nce_common->ncec_flags & NCE_F_MCAST), !=,
     1208 +                    0);
     1209 +                nce_refrele(deadman);
     1210 +        }
     1211 +        list_destroy(graveyard);
     1212 +
     1213 +        mutex_enter(&ill->ill_lock);
     1214 +        if (ill->ill_mcast_ncec_cleanup)
     1215 +                doit = B_FALSE;
     1216 +        else {
     1217 +                ill->ill_mcast_ncec_cleanup = B_TRUE;
     1218 +                doit = B_TRUE;
     1219 +        }
     1220 +        mutex_exit(&ill->ill_lock);
     1221 +        if (!doit || taskq_dispatch(system_taskq, ncec_mcast_reap,
     1222 +            ill, TQ_NOSLEEP) == NULL) {
     1223 +                mutex_enter(&ill->ill_lock);
     1224 +                if (doit) {
     1225 +                        IP_STAT(ill->ill_ipst, ip_nce_mcast_reclaim_tqfail);
     1226 +                        ill->ill_mcast_ncec_cleanup = B_FALSE;
     1227 +                }
     1228 +                /* There's no _notr() for ill_refrele(), so inline it here. */
     1229 +                ill->ill_refcnt--;
     1230 +                if (ill->ill_refcnt == 0)
     1231 +                        ipif_ill_refrele_tail(ill);     /* Drops ill_lock */
     1232 +                else
     1233 +                        mutex_exit(&ill->ill_lock);
     1234 +        }
     1235 +}
     1236 +
     1237 +/*
1008 1238   * For each interface an entry is added for the unspecified multicast group.
1009 1239   * Here that mapping is used to form the multicast cache entry for a particular
1010 1240   * multicast destination.
1011 1241   */
1012 1242  static int
1013 1243  nce_set_multicast_v6(ill_t *ill, const in6_addr_t *dst,
1014 1244      uint16_t flags, nce_t **newnce)
1015 1245  {
1016 1246          uchar_t         *hw_addr;
1017 1247          int             err = 0;
... 25 lines elided ...
1043 1273                  /* No hw_addr is needed for IRE_IF_NORESOLVER. */
1044 1274                  hw_addr = NULL;
1045 1275          }
1046 1276          ASSERT((flags & NCE_F_MCAST) != 0);
1047 1277          ASSERT((flags & NCE_F_NONUD) != 0);
1048 1278          /* nce_state will be computed by nce_add_common() */
1049 1279          err = nce_add_v6(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
1050 1280              ND_UNCHANGED, &nce);
1051 1281          mutex_exit(&ipst->ips_ndp6->ndp_g_lock);
1052 1282          if (err == 0)
1053      -                err = nce_add_v6_postprocess(nce);
     1283 +                err = (nce != NULL) ? nce_add_v6_postprocess(nce) : ENOMEM;
1054 1284          if (hw_addr != NULL)
1055 1285                  kmem_free(hw_addr, ill->ill_nd_lla_len);
1056 1286          if (err != 0) {
1057 1287                  ip1dbg(("nce_set_multicast_v6: create failed" "%d\n", err));
1058 1288                  return (err);
1059 1289          }
1060 1290  done:
1061 1291          ASSERT(nce->nce_common->ncec_state == ND_REACHABLE);
1062 1292          if (newnce != NULL)
1063 1293                  *newnce = nce;
... 2029 lines elided ...
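The reclaim decision made by nce_too_many_mcast() above boils down to a small calculation: clamp ip_ill_mcast_reclaim to ip_max_ill_mcast_nces, do nothing while under the cap, fail the insertion if reclaiming is disabled (a reclaim value of 0), and otherwise evict up to the clamped count of the oldest multicast entries. The sketch below isolates just that decision as a standalone program; the variable names and defaults mirror the tunables added above, while mcast_limit_check() is a hypothetical helper, not kernel code.

/*
 * Standalone sketch of the limit/reclaim decision in nce_too_many_mcast(),
 * with the list walking stripped out.
 */
#include <stdio.h>

static unsigned ip_max_ill_mcast_nces = 16384;
static unsigned ip_ill_mcast_reclaim = 256;

/*
 * Return how many of the oldest multicast entries to evict before this
 * insertion, or -1 if the insertion should simply fail (cap hit and
 * reclaiming disabled).
 */
static int
mcast_limit_check(unsigned current)
{
        unsigned max_count = ip_max_ill_mcast_nces;
        unsigned reclaim = (ip_ill_mcast_reclaim < max_count) ?
            ip_ill_mcast_reclaim : max_count;

        if (current < max_count)
                return (0);             /* under the cap, nothing to do */
        if (reclaim == 0)
                return (-1);            /* cap hit and reclaiming disabled */
        return ((int)reclaim);          /* evict this many oldest entries */
}

int
main(void)
{
        printf("%d\n", mcast_limit_check(100));         /* 0: below the cap */
        printf("%d\n", mcast_limit_check(16384));       /* 256: trim oldest */
        ip_ill_mcast_reclaim = 0;
        printf("%d\n", mcast_limit_check(16384));       /* -1: fail the add */
        return (0);
}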
3093 3323   * Note that since ill_fastpath_probe() copies the mblk there is
3094 3324   * no need to hold the nce or ncec beyond this function.
3095 3325   *
3096 3326   * If the caller has passed in a non-null ncec_nce to nce_fastpath() that
3097 3327   * ncec_nce must correspond to the nce for ncec with nce_ill == ncec->ncec_ill
3098 3328   * and will be returned back by this function, so that no extra nce_refrele
3099 3329   * is required for the caller. The calls from nce_add_common() use this
3100 3330   * method. All other callers (that pass in NULL ncec_nce) will have to do a
3101 3331   * nce_refrele of the returned nce (when it is non-null).
3102 3332   */
3103      -nce_t *
     3333 +static nce_t *
3104 3334  nce_fastpath(ncec_t *ncec, boolean_t trigger_fp_req, nce_t *ncec_nce)
3105 3335  {
3106 3336          nce_t *nce;
3107 3337          ill_t *ill = ncec->ncec_ill;
3108 3338  
3109 3339          ASSERT(ill != NULL);
3110 3340  
3111 3341          if (IS_IPMP(ill) && trigger_fp_req) {
3112 3342                  trigger_fp_req = B_FALSE;
3113 3343                  ipmp_ncec_refresh_nce(ncec);
... 37 lines elided ...
3151 3381           * deleted.
3152 3382           */
3153 3383          if (res != 0 && res != EAGAIN && res != ENOTSUP)
3154 3384                  nce_fastpath_list_delete(ill, ncec, NULL);
3155 3385  }
3156 3386  
3157 3387  /*
3158 3388   * Add ncec to the nce fastpath list on ill.
3159 3389   */
3160 3390  static nce_t *
3161      -nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec)
     3391 +nce_ill_lookup_then_add_locked(ill_t *ill, ncec_t *ncec, list_t *graveyard)
3162 3392  {
3163 3393          nce_t *nce = NULL;
3164 3394  
3165 3395          ASSERT(MUTEX_HELD(&ill->ill_lock));
3166 3396          /*
3167 3397           * Atomically ensure that the ill is not CONDEMNED and is not going
3168 3398           * down, before adding the NCE.
3169 3399           */
3170 3400          if (ill->ill_state_flags & ILL_CONDEMNED)
3171 3401                  return (NULL);
3172 3402          mutex_enter(&ncec->ncec_lock);
3173 3403          /*
3174 3404           * if ncec has not been deleted and
3175 3405           * is not already in the list add it.
3176 3406           */
3177 3407          if (!NCE_ISCONDEMNED(ncec)) {
3178 3408                  nce = nce_lookup(ill, &ncec->ncec_addr);
3179 3409                  if (nce != NULL)
3180 3410                          goto done;
3181      -                nce = nce_add(ill, ncec);
     3411 +                nce = nce_add(ill, ncec, graveyard);
3182 3412          }
3183 3413  done:
3184 3414          mutex_exit(&ncec->ncec_lock);
3185 3415          return (nce);
3186 3416  }
3187 3417  
3188      -nce_t *
     3418 +static nce_t *
3189 3419  nce_ill_lookup_then_add(ill_t *ill, ncec_t *ncec)
3190 3420  {
3191 3421          nce_t *nce;
     3422 +        list_t graveyard;
3192 3423  
     3424 +        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3193 3425          mutex_enter(&ill->ill_lock);
3194      -        nce = nce_ill_lookup_then_add_locked(ill, ncec);
     3426 +        nce = nce_ill_lookup_then_add_locked(ill, ncec, &graveyard);
3195 3427          mutex_exit(&ill->ill_lock);
     3428 +        nce_graveyard_free(&graveyard);
3196 3429          return (nce);
3197 3430  }
3198 3431  
3199 3432  
3200 3433  /*
3201 3434   * remove ncec from the ill_nce list. If 'dead' is non-null, the deleted
3202 3435   * nce is added to the 'dead' list, and the caller must nce_refrele() the
3203 3436   * entry after all locks have been dropped.
3204 3437   */
3205 3438  void
... 30 lines elided ...
3236 3469   * when the fastpath response does not fit in the datab
3237 3470   * associated with the existing nce_fp_mp, we delete and
3238 3471   * add the nce to retrigger fastpath based on the information
3239 3472   * in the ncec_t.
3240 3473   */
3241 3474  static nce_t *
3242 3475  nce_delete_then_add(nce_t *nce)
3243 3476  {
3244 3477          ill_t           *ill = nce->nce_ill;
3245 3478          nce_t           *newnce = NULL;
     3479 +        list_t          graveyard;
3246 3480  
     3481 +        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
3247 3482          ip0dbg(("nce_delete_then_add nce %p ill %s\n",
3248 3483              (void *)nce, ill->ill_name));
3249 3484          mutex_enter(&ill->ill_lock);
3250 3485          mutex_enter(&nce->nce_common->ncec_lock);
3251 3486          nce_delete(nce);
3252 3487          /*
3253 3488           * Make sure that ncec is not condemned before adding. We hold the
3254 3489           * ill_lock and ncec_lock to synchronize with ncec_delete() and
3255 3490           * ipmp_ncec_delete_nce()
3256 3491           */
3257 3492          if (!NCE_ISCONDEMNED(nce->nce_common))
3258      -                newnce = nce_add(ill, nce->nce_common);
     3493 +                newnce = nce_add(ill, nce->nce_common, &graveyard);
3259 3494          mutex_exit(&nce->nce_common->ncec_lock);
3260 3495          mutex_exit(&ill->ill_lock);
     3496 +        nce_graveyard_free(&graveyard);
3261 3497          nce_refrele(nce);
3262 3498          return (newnce); /* could be null if nomem */
3263 3499  }
3264 3500  
3265 3501  typedef struct nce_fp_match_s {
3266 3502          nce_t   *nce_fp_match_res;
3267 3503          mblk_t  *nce_fp_match_ack_mp;
3268 3504  } nce_fp_match_t;
3269 3505  
3270 3506  /* ARGSUSED */
... 694 lines elided ...
3965 4201                   */
3966 4202                  hw_addr = NULL;
3967 4203          }
3968 4204          ASSERT(flags & NCE_F_MCAST);
3969 4205          ASSERT(flags & NCE_F_NONUD);
3970 4206          /* nce_state will be computed by nce_add_common() */
3971 4207          err = nce_add_v4(ill, hw_addr, ill->ill_phys_addr_length, dst, flags,
3972 4208              ND_UNCHANGED, &nce);
3973 4209          mutex_exit(&ipst->ips_ndp4->ndp_g_lock);
3974 4210          if (err == 0)
3975      -                err = nce_add_v4_postprocess(nce);
     4211 +                err = (nce != NULL) ? nce_add_v4_postprocess(nce) : ENOMEM;
3976 4212          if (hw_addr != NULL)
3977 4213                  kmem_free(hw_addr, ill->ill_phys_addr_length);
3978 4214          if (err != 0) {
3979 4215                  ip1dbg(("nce_set_multicast_v4: create failed" "%d\n", err));
3980 4216                  return (err);
3981 4217          }
3982 4218  done:
3983 4219          if (newnce != NULL)
3984 4220                  *newnce = nce;
3985 4221          else
... 403 lines elided ...
4389 4625          static  ncec_t          nce_nil;
4390 4626          uchar_t                 *template = NULL;
4391 4627          int                     err;
4392 4628          ncec_t                  *ncec;
4393 4629          ncec_t                  **ncep;
4394 4630          ip_stack_t              *ipst = ill->ill_ipst;
4395 4631          uint16_t                state;
4396 4632          boolean_t               fastprobe = B_FALSE;
4397 4633          struct ndp_g_s          *ndp;
4398 4634          nce_t                   *nce = NULL;
     4635 +        list_t                  graveyard;
4399 4636          mblk_t                  *dlur_mp = NULL;
4400 4637  
4401 4638          if (ill->ill_isv6)
4402 4639                  ndp = ill->ill_ipst->ips_ndp6;
4403 4640          else
4404 4641                  ndp = ill->ill_ipst->ips_ndp4;
4405 4642  
4406 4643          *retnce = NULL;
4407 4644  
4408 4645          ASSERT(MUTEX_HELD(&ndp->ndp_g_lock));
... 270 lines elided ...
4679 4916          ncec->ncec_ptpn = ncep;
4680 4917  
4681 4918          /* Bump up the number of ncec's referencing this ill */
4682 4919          DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4683 4920              (char *), "ncec", (void *), ncec);
4684 4921          ill->ill_ncec_cnt++;
4685 4922          /*
4686 4923           * Since we hold the ncec_lock at this time, the ncec cannot be
4687 4924           * condemned, and we can safely add the nce.
4688 4925           */
4689      -        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp);
     4926 +        list_create(&graveyard, sizeof (nce_t), offsetof(nce_t, nce_node));
     4927 +        *retnce = nce_add_impl(ill, ncec, nce, dlur_mp, &graveyard);
4690 4928          mutex_exit(&ncec->ncec_lock);
4691 4929          mutex_exit(&ill->ill_lock);
     4930 +        nce_graveyard_free(&graveyard);
4692 4931  
4693 4932          /* caller must trigger fastpath on *retnce */
4694 4933          return (0);
4695 4934  
4696 4935  err_ret:
4697 4936          if (ncec != NULL)
4698 4937                  kmem_cache_free(ncec_cache, ncec);
4699 4938          if (nce != NULL)
4700 4939                  kmem_cache_free(nce_cache, nce);
4701 4940          freemsg(dlur_mp);
... 65 lines elided ...
4767 5006          } else {
4768 5007                  mutex_exit(&ill->ill_lock);
4769 5008          }
4770 5009  
4771 5010          mutex_destroy(&nce->nce_lock);
4772 5011          kmem_cache_free(nce_cache, nce);
4773 5012  }
4774 5013  
4775 5014  /*
4776 5015   * Add an nce to the ill_nce list.
     5016 + *
      5017 + * Adding multicast NCEs is subject to a per-ill limit.  This function may
      5018 + * reap some existing multicast nces, and returns NULL if the limit is hit
      5019 + * and can't be relieved. Callers (and upstack) must cope with NULL returns.
4777 5020   */
4778 5021  static nce_t *
4779      -nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp)
     5022 +nce_add_impl(ill_t *ill, ncec_t *ncec, nce_t *nce, mblk_t *dlur_mp,
     5023 +    list_t *graveyard)
4780 5024  {
     5025 +        ASSERT(MUTEX_HELD(&ill->ill_lock));
     5026 +
     5027 +        if ((ncec->ncec_flags & NCE_F_MCAST) != 0) {
     5028 +                if (nce_too_many_mcast(ill, graveyard)) {
     5029 +                        kmem_cache_free(nce_cache, nce);
     5030 +                        return (NULL);
     5031 +                }
     5032 +                ill->ill_mcast_nces++;
     5033 +        }
     5034 +
4781 5035          bzero(nce, sizeof (*nce));
4782 5036          mutex_init(&nce->nce_lock, NULL, MUTEX_DEFAULT, NULL);
4783 5037          nce->nce_common = ncec;
4784 5038          nce->nce_addr = ncec->ncec_addr;
4785 5039          nce->nce_ill = ill;
4786 5040          DTRACE_PROBE3(ill__incr__cnt, (ill_t *), ill,
4787 5041              (char *), "nce", (void *), nce);
4788 5042          ill->ill_nce_cnt++;
4789 5043  
4790 5044          nce->nce_refcnt = 1; /* for the thread */
4791 5045          ncec->ncec_refcnt++; /* want ncec_refhold_locked_notr(ncec) */
4792 5046          nce->nce_dlur_mp = dlur_mp;
4793 5047  
4794 5048          /* add nce to the ill's fastpath list.  */
4795 5049          nce->nce_refcnt++; /* for the list */
4796 5050          list_insert_head(&ill->ill_nce, nce);
4797 5051          return (nce);
4798 5052  }
4799 5053  
4800 5054  static nce_t *
4801      -nce_add(ill_t *ill, ncec_t *ncec)
     5055 +nce_add(ill_t *ill, ncec_t *ncec, list_t *graveyard)
4802 5056  {
4803 5057          nce_t   *nce;
4804 5058          mblk_t  *dlur_mp = NULL;
4805 5059  
4806 5060          ASSERT(MUTEX_HELD(&ill->ill_lock));
4807 5061          ASSERT(MUTEX_HELD(&ncec->ncec_lock));
4808 5062  
4809 5063          nce = kmem_cache_alloc(nce_cache, KM_NOSLEEP);
4810 5064          if (nce == NULL)
4811 5065                  return (NULL);
4812 5066          if (ncec->ncec_lladdr != NULL ||
4813 5067              ill->ill_net_type == IRE_IF_NORESOLVER) {
4814 5068                  dlur_mp = ill_dlur_gen(ncec->ncec_lladdr,
4815 5069                      ill->ill_phys_addr_length, ill->ill_sap,
4816 5070                      ill->ill_sap_length);
4817 5071                  if (dlur_mp == NULL) {
4818 5072                          kmem_cache_free(nce_cache, nce);
4819 5073                          return (NULL);
4820 5074                  }
4821 5075          }
4822      -        return (nce_add_impl(ill, ncec, nce, dlur_mp));
     5076 +        /*
      5077 +         * If nce_add_impl() returns NULL due to multicast limiting, the
      5078 +         * caller will (correctly) assume ENOMEM.
     5079 +         */
     5080 +        return (nce_add_impl(ill, ncec, nce, dlur_mp, graveyard));
4823 5081  }
4824 5082  
4825 5083  /*
 4826 5084   * remove the nce from the ill_fastpath list
4827 5085   */
4828 5086  void
4829 5087  nce_delete(nce_t *nce)
4830 5088  {
4831 5089          ill_t   *ill = nce->nce_ill;
4832 5090  
... 3 lines elided ...
4836 5094          if (nce->nce_is_condemned) {
4837 5095                  /*
4838 5096                   * some other thread has removed this nce from the ill_nce list
4839 5097                   */
4840 5098                  mutex_exit(&nce->nce_lock);
4841 5099                  return;
4842 5100          }
4843 5101          nce->nce_is_condemned = B_TRUE;
4844 5102          mutex_exit(&nce->nce_lock);
4845 5103  
     5104 +        /* Update the count of multicast NCEs. */
     5105 +        if ((nce->nce_common->ncec_flags & NCE_F_MCAST) == NCE_F_MCAST)
     5106 +                ill->ill_mcast_nces--;
     5107 +
4846 5108          list_remove(&ill->ill_nce, nce);
4847 5109          /*
4848 5110           * even though we are holding the ill_lock, it is ok to
4849 5111           * call nce_refrele here because we know that we should have
4850 5112           * at least 2 refs on the nce: one for the thread, and one
4851 5113           * for the list. The refrele below will release the one for
4852 5114           * the list.
4853 5115           */
4854 5116          nce_refrele(nce);
4855 5117  }
... 227 lines elided ...
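One more piece of the change worth illustrating: nce_graveyard_free() has to schedule the heavier ncec_t cleanup on a taskq, guarded by ill_mcast_ncec_cleanup so that at most one cleanup task is outstanding, and it must roll back both the flag and the (untraced) ill hold if the dispatch fails. Below is a small, self-contained sketch of that dispatch-guard-and-rollback shape; pthread_create() stands in for taskq_dispatch(), and all names are illustrative rather than the kernel interfaces.

/*
 * Sketch of the dispatch guard at the end of nce_graveyard_free(): a flag
 * keeps at most one cleanup task outstanding, and both the flag and the
 * hold taken for the task are rolled back if the dispatch fails.
 */
#include <stdio.h>
#include <stdbool.h>
#include <pthread.h>

typedef struct obj {
        pthread_mutex_t lock;
        bool cleanup_scheduled;         /* cf. ill_mcast_ncec_cleanup */
        int refcnt;                     /* cf. the inlined ill_refcnt hold */
} obj_t;

static void *
cleanup_task(void *arg)
{
        obj_t *o = arg;

        /* ... the expensive reaping work would happen here ... */
        pthread_mutex_lock(&o->lock);
        o->cleanup_scheduled = false;
        o->refcnt--;                    /* drop the hold taken at dispatch */
        pthread_mutex_unlock(&o->lock);
        return (NULL);
}

static void
schedule_cleanup(obj_t *o)
{
        pthread_t tid;
        bool doit;

        pthread_mutex_lock(&o->lock);
        o->refcnt++;                    /* hold the object for the task */
        doit = !o->cleanup_scheduled;
        if (doit)
                o->cleanup_scheduled = true;
        pthread_mutex_unlock(&o->lock);

        if (!doit || pthread_create(&tid, NULL, cleanup_task, o) != 0) {
                /* Not dispatched: undo the flag (if we set it) and the hold. */
                pthread_mutex_lock(&o->lock);
                if (doit)
                        o->cleanup_scheduled = false;
                o->refcnt--;
                pthread_mutex_unlock(&o->lock);
                return;
        }
        (void) pthread_join(tid, NULL); /* only so the demo exits cleanly */
}

int
main(void)
{
        obj_t o = { PTHREAD_MUTEX_INITIALIZER, false, 1 };

        schedule_cleanup(&o);
        printf("refcnt %d, scheduled %d\n", o.refcnt, (int)o.cleanup_scheduled);
        return (0);
}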